In [1]:
#Step 1: Merging the Two Text Files (Train and Test Sets)
import pandas as pd

In [2]:
# Read the tab-separated training split into a DataFrame.
train_df = pd.read_csv(
    'train_English_Data_Complete_FakeNews.txt',
    sep='\t',
)

In [3]:
# Read the tab-separated test split into a DataFrame.
test_df = pd.read_csv(
    'test_English_Data_Complete_FakeNews.txt',
    sep='\t',
)

In [4]:
# Stack the two splits row-wise into one frame; ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each split's own index.
merged_df = pd.concat(
    [train_df, test_df],
    axis=0,
    ignore_index=True,
)

In [5]:
# Persist the merged frame as a CSV file; index=False leaves the row index
# out of the file so it is not read back as an extra column later.
merged_df.to_csv(
    'merged_dataset.csv',
    index=False,
)

In [6]:
# Preview the first five rows of the merged frame.
merged_df.head(5)


Unnamed: 0,Domain,Topic,News,Label
0,Sports,Roger Federer beats Frances Tiafoe on return ...,Roger Federer beat American teenager Frances T...,Legit
1,Celebrity,'It was a long time coming': Heidi Klum ends s...,Seal and Heidi Klum insisted their split was '...,Fake
2,Celebrity,Liam Payne Just Dissed Harry Styles' Solo Music,"Harry Styles' new music isn't for everyone, no...",Fake
3,Entertainment,We all know why the right is angry at Tomi Lahren,Conservative darling-turned-pariah Tomi Lahren...,Legit
4,Celebrity,Inside Beyoncé's baby shower,Don't act like you didn't know Beyoncé was goi...,Legit


In [59]:
# Print (rows, columns) to judge whether the dataset is large enough.
print(merged_df.shape)


(980, 4)


In [61]:
# Per-column memory footprint in bytes; deep=True inspects the actual
# contents of object (string) columns rather than just pointer sizes.
print(merged_df.memory_usage(index=True, deep=True))


Index        132
Domain     64680
Topic     142381
News      499405
Label      60270
dtype: int64


Step 2: Build an AI Model for Fake News Detection

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


In [8]:
# Reload the merged dataset from the CSV written in Step 1.
df = pd.read_csv(
    'merged_dataset.csv',
)


In [9]:
# Optional clean-up: discard any record that lacks either the article text
# ('News') or its target ('Label'). The frame is modified in place.
df.dropna(
    subset=['News', 'Label'],
    inplace=True,
)


In [10]:
# Encode the 'Label' column as integers.
# LabelEncoder numbers classes in sorted order, so, assuming the only labels
# present are 'Fake' and 'Legit', the mapping is Fake -> 0, Legit -> 1.
# The fitted mapping can be checked with label_encoder.classes_.
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])


In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X = df['News']   # feature: raw article text
y = df['Label']  # target: integer code from label_encoder (Fake = 0, Legit = 1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Drop rows whose article text (the 'News' column) is missing, and drop the
# matching labels with them. Calling dropna() on the features alone would leave
# X_* and y_* with different lengths as soon as a single row was removed, which
# breaks fitting and scoring. The labels are filtered first, while the mask can
# still be computed from the unfiltered features.
y_train = y_train[X_train.notna()]
X_train = X_train.dropna()
y_test = y_test[X_test.notna()]
X_test = X_test.dropna()

In [13]:
# Turn the raw article text into TF-IDF feature matrices.
# English stop words are removed, and terms that appear in more than 70% of
# the training documents are ignored (max_df=0.7).
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)
# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [14]:
# Baseline classifier: logistic regression with scikit-learn's default
# settings, trained on the TF-IDF features of the training split.
model = LogisticRegression()
model.fit(
    X_train_tfidf,
    y_train,
)


In [15]:
# Predict a class for every held-out article with the baseline model.
y_pred = model.predict(
    X_test_tfidf,
)


In [16]:
# Score the baseline: overall accuracy plus the confusion matrix
# (rows are true classes, columns are predicted classes).
accuracy, conf_matrix = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
)

In [17]:
# Report the baseline metrics: accuracy as a percentage, then the matrix
# on its own line below its heading.
print(f"Logistic Regression Model Accuracy: {accuracy*100:.2f}%")
print("Confusion Matrix:", conf_matrix, sep="\n")

Logistic Regression Model Accuracy: 46.43%
Confusion Matrix:
[[43 60]
 [45 48]]


# Trying Random Forest next, because the Logistic Regression model is not very accurate.

In [18]:
#from sklearn.ensemble import RandomForestClassifier above already have 


In [19]:
# Random forest with default hyperparameters; the seed fixes the
# randomness used while growing the trees so runs are repeatable.
rf_model = RandomForestClassifier(
    random_state=42,
)


In [20]:
# Train the forest on the TF-IDF features of the training split.
rf_model.fit(
    X_train_tfidf,
    y_train,
)


In [21]:
# Classify the held-out articles with the trained forest.
y_pred_rf = rf_model.predict(
    X_test_tfidf,
)

In [22]:
# Fraction of held-out articles the forest labelled correctly (0-1 scale).
print(
    "Random Forest Accuracy:",
    accuracy_score(y_test, y_pred_rf),
)

Random Forest Accuracy: 0.5561224489795918


#XGBoost Classifier
!pip install xgboost

#XGBoost expects numerical class labels (such as 0 and 1) rather than strings like 'Fake' and 'Legit', so categorical labels must be converted to numbers with a LabelEncoder before training the model. Note that the 'Label' column was already encoded this way earlier in the notebook, so y_train is numeric by this point.

In [23]:
#from sklearn.preprocessing import LabelEncoder already above have 

In [24]:
# Initialize LabelEncoder
#label_encoder = LabelEncoder()

In [25]:
#!pip install xgboost

In [26]:
#import xgboost as xgb

In [27]:
# Gradient-boosted tree classifier with default hyperparameters and a
# fixed seed for repeatable results.
xgb_model = xgb.XGBClassifier(
    random_state=42,
)

#XGBoost expects numerical class labels (such as 0 and 1) rather than strings like 'Fake' and 'Legit', so categorical labels must be converted to numbers with a LabelEncoder before training the model. Note that the 'Label' column was already encoded this way earlier in the notebook, so y_train is numeric by this point.

In [28]:
#from sklearn.preprocessing import LabelEncoder


In [29]:
# Initialize LabelEncoder
#label_encoder = LabelEncoder()

In [30]:
# Numeric label arrays for XGBoost.
# y_train / y_test already hold integer codes, because df['Label'] was passed
# through label_encoder before the split. Re-fitting label_encoder on those
# integers would overwrite its classes_ with the codes themselves and lose the
# code -> name mapping, so the existing integer values are used directly and
# the encoder is left untouched.
y_train_encoded = y_train.to_numpy()
y_test_encoded = y_test.to_numpy()

In [31]:
# Train the boosted-tree model on the TF-IDF training features.
xgb_model.fit(
    X_train_tfidf,
    y_train,
)

In [34]:
# Classify the held-out articles with the trained XGBoost model.
y_pred_xgb = xgb_model.predict(
    X_test_tfidf,
)

In [35]:
# Fraction of held-out articles XGBoost labelled correctly (0-1 scale).
print(
    "XGBoost Accuracy:",
    accuracy_score(y_test, y_pred_xgb),
)

XGBoost Accuracy: 0.5918367346938775


#Neural Network (MLPClassifier)

In [36]:
from sklearn.neural_network import MLPClassifier

In [37]:
# Multi-layer perceptron with the default architecture. The seed fixes the
# weight initialisation, and max_iter=500 caps the number of training iterations.
mlp_model = MLPClassifier(
    random_state=42,
    max_iter=500,
)


In [38]:
# Train the neural network on the TF-IDF training features.
mlp_model.fit(
    X_train_tfidf,
    y_train,
)

In [41]:
# Classify the held-out articles with the trained neural network.
y_pred_mlp = mlp_model.predict(
    X_test_tfidf,
)


In [42]:
# Fraction of held-out articles the network labelled correctly (0-1 scale).
print(
    "Neural Network Accuracy:",
    accuracy_score(y_test, y_pred_mlp),
)

Neural Network Accuracy: 0.4336734693877551


#Grid Search Example (for Random Forest)

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
# Search space for the random-forest grid search: three candidate values for
# each of four hyperparameters, i.e. 3 * 3 * 3 * 3 = 81 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],      # number of trees
    max_depth=[10, 20, 30],            # maximum depth of each tree
    min_samples_split=[2, 5, 10],      # samples required to split a node
    min_samples_leaf=[1, 2, 4],        # samples required at a leaf
)


In [45]:
# Fresh, unfitted random forest to act as the base estimator for the grid
# search. This rebinds rf_model, replacing the forest trained earlier.
rf_model = RandomForestClassifier(
    random_state=42,
)

In [46]:
# Exhaustive search over param_grid with 5-fold cross-validation.
# n_jobs=-1 runs the fits in parallel on all available cores, and verbose=2
# logs a line for every individual fit.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)


In [47]:
# Run the grid search on the training split only; the test split stays
# untouched until the final evaluation.
grid_search.fit(
    X_train_tfidf,
    y_train,
)


Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [48]:
# Show the hyperparameter combination with the best cross-validated score.
print(
    "Best Hyperparameters:",
    grid_search.best_params_,
)

Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


In [49]:
# Classify the held-out articles with the best forest found by the search.
y_pred_rf_grid = grid_search.best_estimator_.predict(
    X_test_tfidf,
)

In [50]:
# Held-out accuracy of the tuned forest; only slightly better than the
# untuned one.
print(
    "Grid Search Random Forest Accuracy:",
    accuracy_score(y_test, y_pred_rf_grid),
)

Grid Search Random Forest Accuracy: 0.6020408163265306


# Random Search Example (for XGBoost)

In [51]:
from sklearn.model_selection import RandomizedSearchCV

In [52]:
# Candidate values that the randomized search samples from when tuning XGBoost.
param_dist = dict(
    n_estimators=[100, 200, 300],        # number of boosting rounds
    learning_rate=[0.01, 0.05, 0.1],     # shrinkage applied to each round
    max_depth=[3, 5, 7],                 # maximum depth of each tree
    subsample=[0.8, 0.9, 1.0],           # fraction of rows used per tree
    colsample_bytree=[0.8, 0.9, 1.0],    # fraction of columns used per tree
)


In [53]:
# Fresh, unfitted XGBoost classifier to act as the base estimator for the
# randomized search. This rebinds xgb_model, replacing the model trained earlier.
xgb_model = xgb.XGBClassifier(
    random_state=42,
)

In [54]:
# Randomized search: sample 10 combinations from param_dist and score each
# with 5-fold cross-validation. random_state makes the sampling repeatable,
# n_jobs=-1 uses all available cores, and verbose=2 logs every fit.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)


In [55]:
# Run the randomized search on the training split only.
random_search.fit(
    X_train_tfidf,
    y_train,
)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [56]:
# Show the sampled combination with the best cross-validated score.
print(
    "Best Hyperparameters:",
    random_search.best_params_,
)

Best Hyperparameters: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.9}


In [57]:
# Classify the held-out articles with the best XGBoost model found by the search.
y_pred_xgb_random = random_search.best_estimator_.predict(
    X_test_tfidf,
)

In [58]:
# Held-out accuracy of the tuned XGBoost model (0-1 scale).
print(
    "Random Search XGBoost Accuracy:",
    accuracy_score(y_test, y_pred_xgb_random),
)

Random Search XGBoost Accuracy: 0.5918367346938775
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.4s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   0.3s
[CV] END max_depth=10, min_samples_le