<a href="https://colab.research.google.com/github/PrakashG321/FakeNewsDetection/blob/main/NLP_enhanced_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import functools
import re
import string

import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Download required NLTK data:
#   stopwords -> stopwords.words("english")
#   wordnet   -> WordNetLemmatizer
#   punkt     -> word_tokenize on older NLTK releases
#   punkt_tab -> word_tokenize on newer NLTK releases (3.9+), which no longer
#                load the pickled "punkt" models; without it tokenisation
#                fails with a LookupError there.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")
nltk.download("punkt_tab")

# Folder that holds train.tsv / test.tsv / valid.tsv.
dataset_folder = "/content/drive/MyDrive/liar_dataset"  # Adjust this path as needed

def read_dataset(folder=None) -> tuple:
    """Load the train, test and validation splits as DataFrames.

    The split files are tab-separated and carry no header row, so they are
    read with ``header=None`` and the column names are assigned afterwards.
    (Reading them with the default header handling silently turns the first
    statement of every split into column titles and drops it from the data.)

    Args:
        folder: Directory containing ``train.tsv``, ``test.tsv`` and
            ``valid.tsv``. When omitted, the module-level ``dataset_folder``
            is used.

    Returns:
        ``(train_df, test_df, valid_df)`` -- note that the test split comes
        before the validation split.

    Raises:
        ValueError: If a file does not have exactly 14 columns.
    """
    if folder is None:
        folder = dataset_folder

    # define columns
    columns = ["id", "label", "statement", "subject", "speaker",
               "speaker_job", "state_info", "party_affiliation",
               "barely_true_counts", "false_counts", "half_true_counts",
               "mostly_true_counts", "pants_on_fire_counts", "context"]

    def _load_split(split_name):
        # these are tsv files, so put separator as '\t'; header=None keeps
        # the first line as a data row instead of consuming it as a header.
        frame = pd.read_csv(f"{folder}/{split_name}.tsv", sep="\t", header=None)
        # Assigning (rather than passing names=) keeps the length check:
        # a file with an unexpected column count raises instead of shifting.
        frame.columns = columns
        return frame

    return _load_split("train"), _load_split("test"), _load_split("valid")

# Built once: deletes every character in string.punctuation in a single pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# Built once: matches runs of digits so they can be stripped.
_DIGITS_RE = re.compile(r"\d+")


@functools.lru_cache(maxsize=None)
def _text_resources() -> tuple:
    """Return ``(lemmatizer, stop_words)``, created on first use and reused.

    preprocess_text is applied to every statement; rebuilding the lemmatizer
    and the English stop-word set for each row is wasted work. Creation is
    lazy so the NLTK corpora are only needed when text is actually processed.
    """
    return WordNetLemmatizer(), frozenset(stopwords.words("english"))


def preprocess_text(text: str) -> str:
    """Normalise one statement into a space-joined string of lemmas.

    Steps, in order: lower-case, strip punctuation, strip digits, tokenise,
    drop English stop words, lemmatise each remaining token.

    Args:
        text: The raw statement. Missing values (``None``/NaN) are accepted.

    Returns:
        The cleaned text, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    lemmatizer, stop_words = _text_resources()
    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGITS_RE.sub("", text)
    tokens = word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Read the dataset.
# read_dataset() returns the splits as (train, test, valid) -- test comes
# before valid, so the unpacking order below must stay in that order.
train_df, test_df, valid_df = read_dataset()

# Feature engineering
def engineer_features(df):
    """Add the derived model-input columns to ``df`` and return it.

    The frame is modified in place; the returned object is the same ``df``.

    Columns added:
      * ``processed_statement`` -- ``statement`` passed through ``preprocess_text``
      * ``statement_length``    -- number of characters in ``str(statement)``
      * ``speaker_job_encoded``, ``party_encoded``, ``subject_encoded`` --
        integer category codes of ``speaker_job``, ``party_affiliation`` and
        ``subject`` respectively

    The category codes are derived from the values present in ``df`` alone,
    so codes produced for two different frames are not guaranteed to agree.
    """
    statements = df['statement']
    df['processed_statement'] = statements.apply(preprocess_text)
    df['statement_length'] = statements.map(lambda value: len(str(value)))

    # encoded column -> raw column it is derived from
    encoded_from = {
        'speaker_job_encoded': 'speaker_job',
        'party_encoded': 'party_affiliation',
        'subject_encoded': 'subject',
    }
    for encoded_name, raw_name in encoded_from.items():
        df[encoded_name] = pd.Categorical(df[raw_name]).codes
    return df

def _share_category_codes(fit_frames, all_frames, column_pairs):
    """Re-encode categorical columns so every frame uses the same integer codes.

    Args:
        fit_frames: Frames whose values define the category vocabulary.
        all_frames: Frames whose encoded columns are overwritten in place.
        column_pairs: ``(raw_column, encoded_column)`` pairs to process.

    Values that do not occur in ``fit_frames`` are encoded as -1, the same
    code pandas gives to missing values.
    """
    for raw_col, encoded_col in column_pairs:
        vocabulary = pd.Categorical(
            pd.concat([frame[raw_col] for frame in fit_frames])
        ).categories
        for frame in all_frames:
            frame[encoded_col] = pd.Categorical(
                frame[raw_col], categories=vocabulary
            ).codes


train_df = engineer_features(train_df)
valid_df = engineer_features(valid_df)
test_df = engineer_features(test_df)

# engineer_features derives the integer codes from each frame's own values,
# so the same job / party / subject ends up with a different integer in each
# split. Re-encode all three frames against one vocabulary built from the
# train and validation data (the data the model is fitted on) so a given
# code means the same thing at training and at prediction time.
_share_category_codes(
    fit_frames=(train_df, valid_df),
    all_frames=(train_df, valid_df, test_df),
    column_pairs=(
        ('speaker_job', 'speaker_job_encoded'),
        ('party_affiliation', 'party_encoded'),
        ('subject', 'subject_encoded'),
    ),
)

# Model inputs: the cleaned statement text first, then the numeric columns.
feature_columns = [
    'processed_statement',
    'statement_length',
    'speaker_job_encoded',
    'party_encoded',
    'subject_encoded',
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
]

# Pool the train and validation splits into one frame.
combined_df = pd.concat([train_df, valid_df])

# Text branch: TF-IDF over 1- to 3-grams, keeping at most 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
text_features = vectorizer.fit_transform(combined_df['processed_statement'])

# Numeric branch: every feature column except the text one (index 0),
# mean-imputed for NaNs and then standardised.
other_features = combined_df[feature_columns[1:]].values
imputer = SimpleImputer(strategy='mean')
other_features_imputed = imputer.fit_transform(other_features)
scaler = StandardScaler()
other_features_scaled = scaler.fit_transform(other_features_imputed)

# Design matrix: densified TF-IDF columns followed by the scaled numeric ones.
X = np.concatenate([text_features.toarray(), other_features_scaled], axis=1)

# Binary target: 1 for "half-true" / "mostly-true" / "true", 0 for any other label.
y = combined_df['label'].isin(["half-true", "mostly-true", "true"]).astype('int64')

# Hold out 20% of the pooled rows for validation (seeded, so reproducible).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Balance the classes with SMOTE -- applied to the training portion only,
# the held-out rows keep their original class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Random forest tuned by exhaustive grid search: 2 * 3 * 2 * 2 = 24
# parameter combinations, each scored with 3-fold cross-validation.
# NOTE(review): the folds are cut from the already-resampled data, so
# synthetic rows can land in a fold's scoring part -- the CV scores used to
# pick the parameters may be optimistic; confirm whether that matters here.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_resampled, y_train_resampled)

# Score the tuned model on the held-out (non-resampled) validation rows.
y_val_pred = grid_search.predict(X_val)
print("Validation set performance:")
print(accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

# Prepare test set: reuse the already-fitted vectorizer, imputer and scaler
# (transform only -- nothing is refitted on test data).
test_text_features = vectorizer.transform(test_df['processed_statement'])
# The imputer was fitted on a plain ndarray, so pass an ndarray here as well;
# handing it a DataFrame makes scikit-learn warn that X has feature names the
# imputer was fitted without.
test_other_features = imputer.transform(test_df[feature_columns[1:]].values)
test_other_features_scaled = scaler.transform(test_other_features)
# Same column layout as the training matrix: dense TF-IDF, then numeric columns.
X_test = np.hstack((test_text_features.toarray(), test_other_features_scaled))
# Same binary target as for training: 1 for "half-true"/"mostly-true"/"true", else 0.
y_test = test_df['label'].apply(lambda x: 1 if x in ["half-true", "mostly-true", "true"] else 0)

# Evaluate on test set
y_test_pred = grid_search.predict(X_test)
print("Test set performance:")
print(accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Fitting 3 folds for each of 24 candidates, totalling 72 fits
Validation set performance:
0.7357917570498915
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      1010
           1       0.75      0.79      0.77      1295

    accuracy                           0.74      2305
   macro avg       0.73      0.73      0.73      2305
weighted avg       0.73      0.74      0.73      2305





Test set performance:
0.7251184834123223
              precision    recall  f1-score   support

           0       0.69      0.68      0.68       553
           1       0.75      0.76      0.76       713

    accuracy                           0.73      1266
   macro avg       0.72      0.72      0.72      1266
weighted avg       0.72      0.73      0.73      1266



In [None]:
# Mount Google Drive at /content/drive so files under /content/drive/MyDrive
# can be read from this runtime.
# NOTE: dataset_folder points below this mount point, so this cell has to be
# run before the cell that loads the dataset.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
