In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import torch
import time
from tqdm import tqdm
tqdm.pandas()

from transformers import pipeline
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, classification_report, precision_recall_fscore_support

import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

SEED=42

In [None]:
# Kaggle input directory that holds the three pre-split CSV files.
data_dir = Path('/kaggle/input/fake-new-detection-dataset-mi')

# Read the train / validation / test splits, one DataFrame per file.
df_trn, df_val, df_tst = (
    pd.read_csv(data_dir.joinpath(csv_name))
    for csv_name in ('train.csv', 'valid.csv', 'test.csv')
)

In [None]:
# String label -> integer class id used by the classifiers.
LABEL_MAP = {'true': 1, 'fake': 0}


def _encode_labels(labels: pd.Series) -> pd.Series:
    """Convert 'true'/'fake' string labels to 1/0 integers.

    The function is idempotent: a column that already holds only the encoded
    ids is returned unchanged (as int64), so re-running this cell does not
    wipe the labels out the way a second raw ``.map`` call would.

    Parameters
    ----------
    labels : pd.Series
        Label column containing either the strings in ``LABEL_MAP`` or the
        already-encoded integer ids.

    Returns
    -------
    pd.Series
        int64 series of class ids with the same index as ``labels``.

    Raises
    ------
    ValueError
        If any value is neither a key nor a value of ``LABEL_MAP``
        (including missing values).
    """
    # Already encoded (e.g. the cell was executed twice): nothing to map.
    if labels.isin(list(LABEL_MAP.values())).all():
        return labels.astype('int64')

    encoded = labels.map(LABEL_MAP)

    # Series.map yields NaN for values missing from the mapping; fail here
    # with the offending values instead of passing NaN targets downstream.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(labels[unmapped].astype(str).unique())
        raise ValueError(
            f"Unexpected label values (expected 'true'/'fake'): {unknown}"
        )
    return encoded.astype('int64')


# NOTE(review): df_val['labels'] is intentionally left as loaded, matching the
# previous behaviour of this cell — encode it too if the validation split is used.
df_trn['labels'] = _encode_labels(df_trn['labels'])
df_tst['labels'] = _encode_labels(df_tst['labels'])

In [None]:
# TF-IDF text vectorizer with the library's default settings.
vector = TfidfVectorizer()

# Targets for the train and test splits.
y_trn, y_tst = df_trn['labels'], df_tst['labels']

# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer.
X_trn = vector.fit_transform(df_trn['text'])
X_tst = vector.transform(df_tst['text'])

In [None]:
# Classifiers to compare, keyed by the display name used in plots and reports.
# Insertion order is the order in which they are trained and evaluated.
models = {}
models['Logistic Regression'] = LogisticRegression()
models['Random Forest'] = RandomForestClassifier(random_state=SEED)

In [None]:
# Display names for the encoded classes, in class-id order (0 = fake, 1 = true).
# Defined once here rather than being rebuilt on every loop iteration.
LABELS = ['Fake', 'True']
# Encoded class ids in the same order as LABELS.
CLASS_IDS = [0, 1]

# Train each model on the training features, then report its test-set
# confusion matrix and scalar metrics.
for name, model in models.items():
    model.fit(X_trn, y_trn)
    y_preds = model.predict(X_tst)

    # Pin the class order explicitly so the matrix is always 2x2 and its
    # rows/columns line up with LABELS, even if one class never occurs in
    # either y_tst or y_preds.
    conf_matrix = confusion_matrix(y_tst, y_preds, labels=CLASS_IDS)

    # Explicit figure/axes instead of the pyplot state machine, so each
    # model's figure can be released once it has been shown.
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=LABELS,
        yticklabels=LABELS,
        ax=ax,
    )
    ax.set_title(f'{name} Performance On {len(y_tst)} Samples')
    ax.set_ylabel('True class')
    ax.set_xlabel('Predicted class')
    plt.show()
    plt.close(fig)  # avoid accumulating open figures across iterations

    # Scalar metrics on the test split (binary scorers use their default
    # positive class).
    acc = accuracy_score(y_tst, y_preds)
    precision = precision_score(y_tst, y_preds)
    recall = recall_score(y_tst, y_preds)
    f1 = f1_score(y_tst, y_preds)
    mcc = matthews_corrcoef(y_tst, y_preds)

    print(f"Correct Predictions: {(y_preds == y_tst).sum()}/{len(y_tst)}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Matthews Correlation Coefficient: {mcc:.4f}")