In [6]:
!pip install --quiet xgboost scikit-learn pandas numpy

In [7]:
import pandas as pd
import numpy as np
import re, string

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [10]:
import xgboost as xgb

In [11]:
# Read the train and test splits from CSV files in the working directory.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ("real_vs_fake_train.csv", "real_vs_fake_test.csv")
)

In [12]:
# Discard rows that are missing either the text or the label, in both splits.
required_columns = ["Text", "label"]
data_train = data_train.dropna(subset=required_columns)
data_test = data_test.dropna(subset=required_columns)

In [13]:
def clean_text(text):
    """Normalize a string before vectorization.

    Lower-cases ``text``, deletes every character found in
    ``string.punctuation``, then deletes every run of digits. Whitespace is
    left untouched, so removed tokens may leave extra spaces behind.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    cleaned = text.lower()
    # Order matters only for readability here: punctuation first, digits second.
    for pattern in (punctuation_class, r"\d+"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

In [14]:
# Coerce the text column to str and normalize it with clean_text in both splits.
for frame in (data_train, data_test):
    frame["Text"] = frame["Text"].astype(str).apply(clean_text)

In [15]:
# Encode the string labels as integers ("Fake" -> 0, "Real" -> 1).
# Series.map turns any value that is not a key of label_map into NaN.
label_map = {"Fake": 0, "Real": 1}
for frame in (data_train, data_test):
    frame["label"] = frame["label"].map(label_map)

In [16]:
# Remove rows whose label is NaN (i.e. the label had no entry in label_map).
# A single NaN forces the whole column to float64, so once those rows are gone
# cast the labels back to int64; otherwise the classifiers and the metric
# reports would see 0.0 / 1.0 instead of 0 / 1.
data_train = data_train.dropna(subset=["label"]).astype({"label": "int64"})
data_test = data_test.dropna(subset=["label"]).astype({"label": "int64"})

In [17]:
# Separate each split into model input (cleaned text) and target (encoded label).
X_train, y_train = data_train["Text"], data_train["label"]
X_test, y_test = data_test["Text"], data_test["label"]

In [22]:
# TF-IDF features: vocabulary capped at 5000 terms, English stop words removed.
# The vocabulary and IDF weights are learned from the training text only; the
# test text is merely transformed with the already-fitted vectorizer.
tfidf_options = {"max_features": 5000, "stop_words": "english"}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [23]:
# Base learners for the ensemble. Every estimator gets the same fixed seed so
# that a Restart & Run All reproduces the same fitted models and metrics;
# MLPClassifier in particular initializes its weights randomly when unseeded.
RANDOM_STATE = 42
log_reg = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
mlp = MLPClassifier(hidden_layer_sizes=(128,), max_iter=500, random_state=RANDOM_STATE)
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE)

In [24]:
# Hard-voting ensemble: the predicted class is the majority vote of the
# three base estimators.
base_estimators = [('lr', log_reg), ('mlp', mlp), ('xgb', xgb_model)]
ensemble = VotingClassifier(estimators=base_estimators, voting='hard')

In [27]:
# Train every base estimator of the ensemble on the TF-IDF training matrix.
ensemble.fit(X=X_train_tfidf, y=y_train)

In [28]:
# Predict labels for the held-out test split.
y_pred = ensemble.predict(X=X_test_tfidf)

In [29]:
# Scalar metrics, printed in a fixed order. Precision, recall and F1 treat
# label 1 as the positive class; accuracy takes no such argument.
scalar_metrics = (
    ("Accuracy:", accuracy_score, {}),
    ("Precision:", precision_score, {"pos_label": 1}),
    ("Recall:", recall_score, {"pos_label": 1}),
    ("F1-score:", f1_score, {"pos_label": 1}),
)
for heading, metric_fn, extra_kwargs in scalar_metrics:
    print(heading, metric_fn(y_test, y_pred, **extra_kwargs))

# Full breakdowns: confusion matrix, then the per-class report.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9949494949494949
Precision: 0.9930761622156281
Recall: 0.997020854021847
F1-score: 0.9950445986124876
Confusion Matrix:
 [[ 966    7]
 [   3 1004]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99       973
           1       0.99      1.00      1.00      1007

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980

