In [1]:
import pandas as pd  
import plotly.express as px
import matplotlib.pyplot as plt
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix



In [2]:
df = pd.read_csv('/kaggle/input/cleaned/cleaned_welfake.csv')

In [3]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,title,text,label,total_content
0,0,law enforcement high alert following threat co...,comment member movement hanging white people c...,1,law enforcement high alert following threat co...
1,2,unbelievable attorney general say rioter peace...,demonstrator last night constitutional right p...,1,unbelievable attorney general say rioter peace...
2,3,bobby raised us story conversion woo potential...,dozen politically active pastor came private d...,0,bobby raised us story conversion woo potential...


In [4]:
df = df.dropna()

# Split

In [5]:
# Split the data into train, test, and validation sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [6]:
# Print the shape of each split
print("Train set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)

Train set shape: (45102, 5)
Validation set shape: (11276, 5)
Test set shape: (14095, 5)


In [7]:
# Replace missing values with empty strings
train_df['total_content'].fillna('', inplace=True)
val_df['total_content'].fillna('', inplace=True)
test_df['total_content'].fillna('', inplace=True)

In [8]:
train_df['total_content'] = train_df['total_content'].astype(str).fillna('')
val_df['total_content'] = val_df['total_content'].astype(str).fillna('')
test_df['total_content'] = test_df['total_content'].astype(str).fillna('')


In [9]:
vectorizer = TfidfVectorizer()

In [10]:
# Fit and transform the training data
X_train = vectorizer.fit_transform(train_df['total_content'])
y_train = train_df['label']

In [11]:
# Transform the validation and test data
X_val = vectorizer.transform(val_df['total_content'])
y_val = val_df['label']

In [12]:
X_test = vectorizer.transform(test_df['text'])
y_test = test_df['label']

# SVM

In [13]:
from sklearn import svm
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


In [14]:
svm_model = svm.SVC(kernel='linear')

In [15]:
# Train the model on the training data
svm_model.fit(X_train, y_train)

In [16]:
# Evaluate the model on the validation and test data
val_accuracy = svm_model.score(X_val, y_val)
test_accuracy = svm_model.score(X_test, y_test)

In [17]:
print("Validation Accuracy:", val_accuracy)
print("Test Accuracy:", test_accuracy)

Validation Accuracy: 0.9326002128414331
Test Accuracy: 0.9258602341255765


In [21]:
# Calculate predictions on the validation set
val_predictions = svm_model.predict(X_val)

In [23]:
# Calculate predictions on the test set
test_predictions = svm_model.predict(X_test)

In [24]:
# Calculate precision, recall, and F1-score for the validation set
val_precision = precision_score(y_val, val_predictions)
val_recall = recall_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions)

In [25]:
# Calculate precision, recall, and F1-score for the test set
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)


In [26]:
# Print the results
print("Validation Set:")
print("Precision:", val_precision)
print("Recall:", val_recall)
print("F1-score:", val_f1)
print()

print("Test Set:")
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F1-score:", test_f1)

Validation Set:
Precision: 0.9304286718200984
Recall: 0.9356890459363958
F1-score: 0.9330514446793516

Test Set:
Precision: 0.9362183566680808
Recall: 0.917660105350707
F1-score: 0.9268463423171158


# Naive Bayes 

In [27]:
from sklearn.naive_bayes import MultinomialNB

In [28]:
# Create a Naive Bayes model
nb_model = MultinomialNB()

# Train the model on the training data
nb_model.fit(X_train, y_train)

In [29]:
# Evaluate the model on the validation and test data
val_accuracy = nb_model.score(X_val, y_val)
test_accuracy = nb_model.score(X_test, y_test)

In [30]:
print("Validation Accuracy:", val_accuracy)
print("Test Accuracy:", test_accuracy)

Validation Accuracy: 0.8416105001773678
Test Accuracy: 0.8349769421780774


In [31]:

# Calculate predictions on the validation set
val_predictions = nb_model.predict(X_val)

# Calculate predictions on the test set
test_predictions = nb_model.predict(X_test)

# Calculate precision, recall, and F1-score for the validation set
val_precision = precision_score(y_val, val_predictions)
val_recall = recall_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions)

# Calculate precision, recall, and F1-score for the test set
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)

# Print the results
print("Validation Set:")
print("Precision:", val_precision)
print("Recall:", val_recall)
print("F1-score:", val_f1)
print()

print("Test Set:")
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F1-score:", test_f1)


Validation Set:
Precision: 0.8391106442577031
Recall: 0.8468197879858658
F1-score: 0.842947590573338

Test Set:
Precision: 0.8398220244716351
Recall: 0.837260881619074
F1-score: 0.8385394974316256


# Logisitic Regression 


In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
lr_model = LogisticRegression()

# Train the model on the training data
lr_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lr_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.917062788222774


In [44]:
val_accuracy = lr_model.score(X_val, y_val)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.9263923377084072


In [35]:

# Calculate predictions on the validation set
val_predictions = lr_model.predict(X_val)

# Calculate predictions on the test set
test_predictions = lr_model.predict(X_test)

# Calculate precision, recall, and F1-score for the validation set
val_precision = precision_score(y_val, val_predictions)
val_recall = recall_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions)

# Calculate precision, recall, and F1-score for the test set
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)

# Print the results
print("Validation Set:")
print("Precision:", val_precision)
print("Recall:", val_recall)
print("F1-score:", val_f1)
print()

print("Test Set:")
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F1-score:", test_f1)

Validation Set:
Precision: 0.9235356015433182
Recall: 0.9303886925795053
F1-score: 0.9269494807252243

Test Set:
Precision: 0.9244488133689088
Recall: 0.9125311893540339
F1-score: 0.9184513428671085


# Random forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
# Create a Random Forest model
rf_model = RandomForestClassifier()

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.91301880099326


In [45]:
val_accuracy = rf_model.score(X_val, y_val)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.9228449804895353


In [39]:

# Calculate predictions on the validation set
val_predictions = rf_model.predict(X_val)

# Calculate predictions on the test set
test_predictions = rf_model.predict(X_test)

# Calculate precision, recall, and F1-score for the validation set
val_precision = precision_score(y_val, val_predictions)
val_recall = recall_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions)

# Calculate precision, recall, and F1-score for the test set
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)

# Print the results
print("Validation Set:")
print("Precision:", val_precision)
print("Recall:", val_recall)
print("F1-score:", val_f1)
print()

print("Test Set:")
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F1-score:", test_f1)


Validation Set:
Precision: 0.9231448763250883
Recall: 0.9231448763250883
F1-score: 0.9231448763250883

Test Set:
Precision: 0.9157178561510692
Recall: 0.9141946215691711
F1-score: 0.9149556048834628


# Gradient Boosting Models

In [40]:
from sklearn.ensemble import GradientBoostingClassifier

In [43]:
# Create an XGBoost model
xgb_model = GradientBoostingClassifier()

# Train the model on the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8766229159276339


In [46]:
val_accuracy = xgb_model.score(X_val, y_val)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.8965945370698829


In [47]:

# Calculate predictions on the validation set
val_predictions = xgb_model.predict(X_val)

# Calculate predictions on the test set
test_predictions = xgb_model.predict(X_test)

# Calculate precision, recall, and F1-score for the validation set
val_precision = precision_score(y_val, val_predictions)
val_recall = recall_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions)

# Calculate precision, recall, and F1-score for the test set
test_precision = precision_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions)

# Print the results
print("Validation Set:")
print("Precision:", val_precision)
print("Recall:", val_recall)
print("F1-score:", val_f1)
print()

print("Test Set:")
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F1-score:", test_f1)


Validation Set:
Precision: 0.8920097697138869
Recall: 0.903356890459364
F1-score: 0.8976474719101124

Test Set:
Precision: 0.8764956677210838
Recall: 0.8834211255891322
F1-score: 0.8799447704521919
