In [None]:


from datasets import load_dataset
import pandas as pd


# Fetch the corpus from the Hugging Face Hub and materialise its "train"
# split as a pandas DataFrame; `df` is what the following cells work on.
dataset = load_dataset("Ateeqq/AI-and-Human-Generated-Text")
df = dataset["train"].to_pandas()

# Sanity check: the first rows, followed by the number of rows per label.
print(df.head(), df["label"].value_counts(), sep="\n")




                                               title  \
0  Epigenetic inheritance of circadian period in ...   
1  Pediatric Airway Stent Designed to Facilitate ...   
2  Infection prevention and control in paediatric...   
3  Correlation between thyroid function, testoste...   
4  Natural intertypic and intratypic recombinants...   

                                            abstract  label  
0  \n\nThis study focuses on the epigenetic inher...      1  
1  Objective: The goal was to develop a pediatric...      0  
2  Transmission of infection in the paediatric of...      0  
3  STUDY DESIGN: Prospective case series. OBJECTI...      0  
4  \n\nThis study aims to analyze the recombinant...      1  
label
1    11465
0    11465
Name: count, dtype: int64


In [None]:
import pandas as pd
import re
import os


# Directory for artifacts written by this notebook.
# No visible cell defines BASE_DIR, so on a fresh kernel this cell used to
# fail with NameError. Keep a value set by an earlier cell if there is one;
# otherwise fall back to the AI_HUMAN_BASE_DIR environment variable, and
# finally to the Drive folder that appears in this cell's recorded output.
if "BASE_DIR" not in globals():
    BASE_DIR = os.environ.get(
        "AI_HUMAN_BASE_DIR",
        "/content/drive/MyDrive/ai_human_detection",
    )
os.makedirs(BASE_DIR, exist_ok=True)

print("Before cleaning:", df.shape)

# Model input is the title and the abstract joined by one space; a missing
# title or abstract contributes an empty string instead of propagating NaN.
df["text"] = df["title"].fillna("") + " " + df["abstract"].fillna("")


def clean_text(text):
    """Normalise a document to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not ``a-z`` or whitespace (digits and
         punctuation are removed, not replaced by a space, so "covid-19"
         becomes "covid" and "state-of-the-art" becomes "stateoftheart");
      3. collapse each whitespace run to a single space;
      4. strip leading and trailing whitespace.

    Whitespace is collapsed *after* the deletion step. Doing it before (as
    this function previously did) leaves double spaces wherever a
    stand-alone symbol was removed, e.g. "a - b" -> "a  b".

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The normalised text; may be empty.
    """
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Normalise every document with clean_text.
df["text"] = df["text"].map(clean_text)

# Number of whitespace-separated tokens in each cleaned document.
df["word_count"] = df["text"].str.split().str.len()

# Keep documents of 80 to 400 words (both bounds inclusive), then retain only
# the two columns used downstream and renumber the rows from 0.
df = df[df["word_count"].between(80, 400)]
df = df.loc[:, ["text", "label"]].reset_index(drop=True)

print("After cleaning:", df.shape)
print(df["label"].value_counts())






Before cleaning: (22930, 3)
After cleaning: (21612, 2)
label
0    10821
1    10791
Name: count, dtype: int64
✅ Saved: /content/drive/MyDrive/ai_human_detection/final_clean_ai_human_text.csv


In [None]:

# Download the NLTK data packages requested by this notebook.
# Each call is a no-op apart from a log line when the package is already
# present (see the "already up-to-date" messages in the recorded output).


import nltk

# presumably the tokenizer models behind word_tokenize / sent_tokenize;
# both the "punkt" and "punkt_tab" names are requested - TODO confirm which
# one the installed NLTK version actually needs
nltk.download("punkt")
nltk.download("punkt_tab")
# stop-word lists; stopwords.words("english") is read in later cells
nltk.download("stopwords")
# presumably the model behind pos_tag, again under two package names -
# TODO confirm which one the installed NLTK version actually needs
nltk.download("averaged_perceptron_tagger")
nltk.download("averaged_perceptron_tagger_eng")



 All NLTK resources installed correctly


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
# STEP: Linguistic Feature Extraction

import pandas as pd
import textstat
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import stopwords

# English stop words held in a set so the per-token membership test in
# extract_linguistic_features is a hash lookup.
stop_words = {*stopwords.words("english")}

print("Dataset loaded:", df.shape)

def extract_linguistic_features(text):
    """Compute nine surface-level linguistic statistics for one document.

    The text is passed unchanged to NLTK's word/sentence tokenizers, the POS
    tagger and textstat's Flesch reading-ease score.

    NOTE(review): in the recorded run every displayed row has
    sentence_count == 1 and a strongly negative readability score. This is
    presumably because clean_text has already removed all punctuation, so no
    sentence boundaries remain - confirm whether that is intended.

    Parameters
    ----------
    text : str
        Document text.

    Returns
    -------
    list
        ``[word_count, sentence_count, avg_sentence_length,
        lexical_diversity, noun_ratio, verb_ratio, adj_ratio,
        stopword_ratio, readability]``. The average and every ratio are 0
        when their denominator is zero.
    """
    tokens = word_tokenize(text)
    n_sentences = len(sent_tokenize(text))
    tagged = pos_tag(tokens)
    n_tokens = len(tokens)

    def per_token(count):
        # Share of all tokens; 0 for a document with no tokens.
        return count / n_tokens if n_tokens else 0

    def tagged_with(prefix):
        # Number of tokens whose POS tag starts with `prefix`.
        return sum(tag.startswith(prefix) for _, tag in tagged)

    unique_share = per_token(len(set(tokens)))
    noun_share = per_token(tagged_with("NN"))
    verb_share = per_token(tagged_with("VB"))
    adj_share = per_token(tagged_with("JJ"))
    stop_share = per_token(sum(tok in stop_words for tok in tokens))

    reading_ease = textstat.flesch_reading_ease(text)
    words_per_sentence = n_tokens / n_sentences if n_sentences else 0

    return [
        n_tokens,
        n_sentences,
        words_per_sentence,
        unique_share,
        noun_share,
        verb_share,
        adj_share,
        stop_share,
        reading_ease,
    ]

print("Extracting linguistic features...")

# One nine-element list per document, in the same row order as df.
features = df["text"].apply(extract_linguistic_features)

# Column names follow the order of the list returned by
# extract_linguistic_features.
ling_df = pd.DataFrame(
    list(features),
    columns=[
        "word_count", "sentence_count", "avg_sentence_length",
        "lexical_diversity",
        "noun_ratio", "verb_ratio", "adj_ratio",
        "stopword_ratio", "readability",
    ],
)

# Index-aligned assignment: both frames are numbered 0..n-1 here because df
# was reset_index()-ed after filtering.
ling_df["label"] = df["label"]

print("Linguistic features extracted successfully", ling_df.head(), sep="\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Dataset loaded: (21612, 2)
 Extracting linguistic features (this WILL take time)...
✅ Linguistic features extracted
   word_count  sentence_count  avg_sentence_length  lexical_diversity  \
0         203               1                203.0           0.783251   
1         250               1                250.0           0.536000   
2         103               1                103.0           0.669903   
3         264               1                264.0           0.465909   
4         132               1                132.0           0.757576   

   noun_ratio  verb_ratio  adj_ratio  stopword_ratio  readability  label  
0    0.320197    0.162562   0.182266        0.320197  -190.497685      1  
1    0.336000    0.180000   0.112000        0.352000  -212.392600      0  
2    0.436893    0.116505   0.126214        0.330097   -69.374078      0  
3    0.420455    0.132576   0.159091        0.318182  -221.352273      0  
4    0.303030    0.136364   0.189394        0.378788  -104.035909   

In [None]:
# STEP: TF-IDF Feature Extraction

from sklearn.feature_extraction.text import TfidfVectorizer

print("Dataset loaded:", df.shape)

# Unigram + bigram TF-IDF restricted to the 5000 highest-ranked terms. Terms
# must occur in at least 5 documents and in at most 90% of them; the built-in
# English stop-word list is removed.
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split made later in the notebook, so vocabulary and IDF weights
# are computed with the test documents included. Consider fitting on the
# training rows only.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    max_features=5000,
)

print("Fitting TF-IDF vectorizer...")
X_tfidf = tfidf.fit_transform(df["text"])

print("TF-IDF extraction completed", f"TF-IDF shape: {X_tfidf.shape}", sep="\n")

Dataset loaded: (21612, 2)
🔄 Fitting TF-IDF vectorizer...
✅ TF-IDF extraction completed
TF-IDF shape: (21612, 5000)


In [None]:
# STEP: Feature Fusion + Train/Test Split

import numpy as np
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

# Target vector, in df row order.
y = df["label"].values

# Hand-crafted features as a plain array (every column except the label).
ling_features = ling_df.drop(columns="label").values

print("Text samples:", df.shape)
print("Linguistic features:", ling_features.shape)
print("TF-IDF shape:", X_tfidf.shape)

# Fuse both views column-wise: the TF-IDF columns come first and the
# linguistic columns are appended after them. The result stays sparse.
X_combined = hstack([X_tfidf, ling_features])
print("Combined feature shape:", X_combined.shape)

# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, stratify=y, random_state=42
)

print("Split complete")
print("Training samples:", X_train.shape[0])
print("Testing samples :", X_test.shape[0])

Text samples: (21612, 2)
Linguistic features: (21612, 9)
TF-IDF shape: (21612, 5000)
Combined feature shape: (21612, 5009)
✅ Split complete
Training samples: 17289
Testing samples : 4323


In [None]:
# STEP: Train & Evaluate Random Forest

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Materialise dense copies of the fused train/test matrices.
# NOTE(review): scikit-learn documents RandomForestClassifier.fit/predict as
# accepting sparse matrices, so these dense copies are probably avoidable.
# Confirm before removing them; other cells may rely on the *_dense names.
X_train_dense, X_test_dense = X_train.toarray(), X_test.toarray()

# 200 trees, all CPU cores, fixed seed for reproducible results.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=200)

print("Training Random Forest classifier...")
rf_clf.fit(X_train_dense, y_train)

# Score the held-out rows.
y_pred = rf_clf.predict(X_test_dense)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
report = classification_report(y_test, y_pred, digits=4)
cm = confusion_matrix(y_test, y_pred)

print(
    "\nEvaluation Results",
    "Accuracy : {:.4f}".format(accuracy),
    "Macro F1 : {:.4f}".format(f1),
    sep="\n",
)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", cm)

Training Random Forest classifier...

 Evaluation Results
Accuracy : 0.9783
Macro F1 : 0.9783

Classification Report:
               precision    recall  f1-score   support

           0     0.9868    0.9695    0.9781      2165
           1     0.9699    0.9870    0.9784      2158

    accuracy                         0.9783      4323
   macro avg     0.9784    0.9783    0.9783      4323
weighted avg     0.9784    0.9783    0.9783      4323


Confusion Matrix:
 [[2099   66]
 [  28 2130]]

 Random Forest model saved


In [None]:
# STEP: Test on New / Unseen Text

import re
import numpy as np
import pandas as pd
import textstat
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import stopwords
from scipy.sparse import hstack

# Rebuild the English stop-word set so this cell does not depend on the
# feature-extraction cell having been run.
stop_words = {*stopwords.words("english")}

def extract_linguistic_features(text):
    """Compute the nine linguistic statistics for one piece of text.

    This redefines (and therefore shadows) the function of the same name
    declared earlier in the notebook. Unlike that version it first coerces
    the input to ``str``, lowercases it, collapses whitespace runs to single
    spaces and strips the ends.

    Parameters
    ----------
    text : object
        Text to analyse; converted with ``str()``.

    Returns
    -------
    list
        ``[word_count, sentence_count, avg_sentence_length,
        lexical_diversity, noun_ratio, verb_ratio, adj_ratio,
        stopword_ratio, readability]``. The average and every ratio are 0
        when their denominator is zero.
    """
    normalized = re.sub(r"\s+", " ", str(text).lower()).strip()

    tokens = word_tokenize(normalized)
    sentence_total = len(sent_tokenize(normalized))
    tagged = pos_tag(tokens)
    token_total = len(tokens)

    def share(hits):
        # Fraction of all tokens; 0 when there are no tokens.
        return hits / token_total if token_total else 0

    # The first two characters of a tag identify the noun (NN*), verb (VB*)
    # and adjective (JJ*) families.
    tag_prefixes = [tag[:2] for _, tag in tagged]
    stopword_hits = sum(token in stop_words for token in tokens)

    type_token_ratio = share(len(set(tokens)))
    nouns = share(tag_prefixes.count("NN"))
    verbs = share(tag_prefixes.count("VB"))
    adjectives = share(tag_prefixes.count("JJ"))
    stops = share(stopword_hits)
    reading_ease = textstat.flesch_reading_ease(normalized)

    mean_sentence_len = token_total / sentence_total if sentence_total else 0

    return [
        token_total, sentence_total, mean_sentence_len,
        type_token_ratio, nouns, verbs,
        adjectives, stops, reading_ease,
    ]

def predict_text(texts):
    """Classify one or more texts as human-written or AI-generated.

    The classifier was trained on text normalised by ``clean_text``
    (lowercased, non-letters deleted, whitespace collapsed). The same
    normalisation is applied here before both the TF-IDF transform and the
    linguistic feature extraction, so inference sees features built the same
    way as in training. Previously the raw text was used, which produced
    different tokens (digits, hyphenated words) and different
    sentence/readability statistics from anything seen during fitting.

    Relies on the notebook globals ``tfidf``, ``rf_clf``, ``clean_text`` and
    ``extract_linguistic_features``.

    Parameters
    ----------
    texts : str or iterable of str
        A single text or any iterable of texts. One-shot iterables are
        supported; the input is materialised into a list first.

    Returns
    -------
    pandas.DataFrame
        One row per input with columns ``text`` (the original, uncleaned
        input), ``prediction`` ("Human-Written" for class 0, otherwise
        "AI-Generated"), ``prob_human`` and ``prob_ai`` (class probabilities
        rounded to 4 decimals). An empty input yields an empty frame with
        the same columns.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        # The inputs are iterated several times below; a generator would be
        # exhausted after the first pass.
        texts = list(texts)

    columns = ["text", "prediction", "prob_human", "prob_ai"]
    if not texts:
        # Nothing to score; avoid stacking zero-row feature blocks.
        return pd.DataFrame(columns=columns)

    # Same preprocessing as the training data.
    cleaned = [clean_text(str(t)) for t in texts]

    X_tfidf_new = tfidf.transform(cleaned)
    ling_features = np.array([extract_linguistic_features(t) for t in cleaned])

    # Column order must match training: TF-IDF block first, then linguistic.
    X_combined = hstack([X_tfidf_new, ling_features]).toarray()

    preds = rf_clf.predict(X_combined)
    probs = rf_clf.predict_proba(X_combined)

    results = []
    for text, p, prob in zip(texts, preds, probs):
        label = "Human-Written" if p == 0 else "AI-Generated"
        results.append({
            "text": text,
            "prediction": label,
            # Assumes the classifier's class order is [0, 1], as it is for
            # the 0/1 labels used in training.
            "prob_human": round(prob[0], 4),
            "prob_ai": round(prob[1], 4)
        })

    return pd.DataFrame(results, columns=columns)

# Example: score two hand-written sentences with the trained model.
# NOTE(review): both samples are a single short sentence, whereas the
# training data was filtered to documents of 80-400 words, so these inputs
# are outside the length range the model was fitted on - treat the printed
# predictions as a smoke test, not as evidence of accuracy.
samples = [
    "This paper presents an experimental evaluation conducted by the authors.",
    "As an AI language model, I generate responses based on training data."
]

# Prints the DataFrame returned by predict_text (text, label, probabilities).
print(predict_text(samples))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


                                                text     prediction  \
0  This paper presents an experimental evaluation...   AI-Generated   
1  As an AI language model, I generate responses ...  Human-Written   

   prob_human  prob_ai  
0       0.165    0.835  
1       0.755    0.245  
