In [3]:
import pandas as pd
import numpy as np
import json
import re
import joblib
import textstat
import nltk
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sakshisinha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
with open("final_dataset.json", "r") as f:
    raw_data = json.load(f)

df = pd.DataFrame(raw_data)
print("dataset loaded")


with open("config.json", "r") as f:
    config = json.load(f)

buzzwords = config["regex_blacklist"]
print("Buzzwords loaded from config.json:", buzzwords)


dataset loaded
Buzzwords loaded from config.json: ['\\bpassionate about\\b', '\\bresults-driven\\b', '\\bteam player\\b', '\\bhighly motivated\\b', '\\bdynamic\\b', '\\bAI enthusiast\\b', '\\bthought leader\\b', '\\bproblem solver\\b', '\\binnovative thinker\\b', '\\bproven track record\\b', '\\bstrategic thinker\\b', '\\bfast learner\\b', '\\bself-starter\\b', '\\bgo-getter\\b', '\\bout-of-the-box thinker\\b', '\\bvisionary\\b', '\\bleading-edge\\b', '\\bdriven by excellence\\b', '\\bchange agent\\b', '\\bdisruptive mindset\\b', '\\bresults-oriented\\b', '\\bexpert in\\b', '\\bstrong communication skills\\b', '\\bexceptional interpersonal skills\\b', '\\bdedicated professional\\b', '\\benthusiastic learner\\b', '\\bworked on multiple projects\\b', 'skilled in Python, Java, and C\\+\\+', '\\binterested in AI and ML\\b', '\\bblockchain believer\\b', '\\btech-savvy\\b', '\\blifelong learner\\b', '\\bdetail-oriented\\b']


In [5]:
print(df.columns.tolist())

['user_id', 'profile_data', 'authenticity_score', 'verdict', 'reason', 'flagged_fields']


In [6]:
def clean_text(text):
    return re.sub(r'\s+', ' ', re.sub(r'[^A-Za-z0-9\s]', '', text)).strip().lower()

def count_buzzwords_regex(text):
    count = 0
    for pattern in buzzwords:
        try:
            if re.search(pattern, text, re.IGNORECASE):
                count += 1
        except re.error as e:
            print(f"Invalid regex pattern: {pattern} – {e}")
    return count

def compute_readability(text):
    try:
        return textstat.flesch_reading_ease(text)
    except:
        return 0.0  # fallback if textstat fails


def preprocess(row):
    profile = row.get("profile_data", {})
    full_text = f"{profile.get('headline', '')} {profile.get('bio', '')}"
    cleaned = clean_text(full_text)
    return pd.Series({
        "text": cleaned,
        "buzzword_count": count_buzzwords_regex(cleaned),
        "readability": compute_readability(full_text)
    })


processed = df.apply(preprocess, axis=1)
df = pd.concat([df, processed], axis=1)
df.head()


Unnamed: 0,user_id,profile_data,authenticity_score,verdict,reason,flagged_fields,text,buzzword_count,readability
0,Aditi Gupta,{'headline': 'E-commerce Content Writer / Copy...,0.39,authentic,Buzzword match and low uniqueness,"[headline, bio]",ecommerce content writer copywriter branding s...,0,36.03
1,Aditya Padhi,"{'headline': 'Java(DSA), Python (TensorFlow, P...",0.86,authentic,Buzzword match and low uniqueness,"[headline, bio]",javadsa python tensorflow pytorch keras numpy ...,0,-0.696709
2,Basava Kusumanjali,"{'headline': 'IT Student', 'bio': '3rd year B....",0.26,authentic,"Natural language, few red flags",[],it student 3rd year btech student at sridevi w...,0,24.031029
3,Challa Venkata ramana,{'headline': 'MERN Stack and Machine Learning ...,0.68,authentic,Buzzword match and low uniqueness,"[headline, bio]",mern stack and machine learning enthusiast raj...,1,20.825606
4,Debbati Saikrishna,{'headline': 'Fellow at NxtWave's CCBP 4.0 Aca...,0.3,authentic,"Natural language, few red flags",[],fellow at nxtwaves ccbp 40 academy learning fu...,1,17.848772


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000)
X_text = vectorizer.fit_transform(df['text'])

import numpy as np
X_numeric = df[['buzzword_count', 'readability']].values

from scipy.sparse import csr_matrix
X = hstack([X_text, csr_matrix(X_numeric)])


In [11]:
df['label'] = df['verdict'].map({
    'authentic': 1,
    'likely_fabricated': 0,
    'borderline': 0  
})
y = df['label'].values


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.80      1.00      0.89         8

    accuracy                           0.82        11
   macro avg       0.90      0.67      0.69        11
weighted avg       0.85      0.82      0.78        11

ROC-AUC: 1.0


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

df['target'] = df['verdict'].apply(lambda x: 1 if x == 'authentic' else 0)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, df['target'], test_size=0.2, random_state=42
)

# Initialize and train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# Optional: detailed classification report
print(classification_report(y_test, y_pred, target_names=['fake', 'authentic']))


Accuracy: 1.0000
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00         4
   authentic       1.00      1.00      1.00         7

    accuracy                           1.00        11
   macro avg       1.00      1.00      1.00        11
weighted avg       1.00      1.00      1.00        11



In [16]:
import joblib

joblib.dump(model, 'model/model.pkl')
joblib.dump(vectorizer, 'model/vectorizer.pkl')

print ("model saved in /model")


model saved in /model
