In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder 
import matplotlib.pyplot as plt 
import pickle 
import joblib 
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
# Perform text preprocessing using NLTK library
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import nltk


In [2]:
# Column whose header carries stray byte-order-mark characters -> clean name
new_col_names = {'ï»¿Label': 'Label'}

# Read the tab-separated dataset, normalise the label header and discard the
# irrelevant 'Unnamed: 2' column, all in one chain.
df = (
    pd.read_csv("/content/Datasets for the Detection.csv", sep='\t')
    .rename(columns=new_col_names)
    .drop(columns=['Unnamed: 2'])
)


In [3]:

# Take a reproducible random sample of 10,000 rows from the dataset
df = df.sample(n=10000, random_state=4)
# Renumber the sampled rows 0..n-1
df.reset_index(drop=True, inplace=True)

# Check the total number of occurrences for each label
label_counts = df['Label'].value_counts()
print(label_counts)

# Keep a copy of the sample as it was before rows with missing values are removed
df2 = df.copy()

# Remove rows with missing values, then renumber the remaining rows.
# Without the reset, any dropped row leaves a gap in the index labels and
# later label-based lookups such as df['News'][i] raise KeyError.
df = df.dropna().reset_index(drop=True)

# Unchanged from the original cell: reset the copy's index without drop=True,
# which keeps the previous index as an extra 'index' column in df2.
df2.reset_index(inplace=True)



Fake    5295
Real    4705
Name: Label, dtype: int64


In [4]:
# Display the sampled frame after rows with missing values were dropped
df

Unnamed: 0,Label,News
0,Fake,MOTHER OF CRYING BABY AT TRUMP RALLY: “Mr. Tru...
1,Real,Xi says China will let the market play decisiv...
2,Fake,WOW! OBAMA MEDIA ALLY EXPOSES HUGE LIE: Why Ob...
3,Real,"U.S. firm Air Products, China's Yankuang plan ..."
4,Real,Law to let Museveni extend rule brought to Uga...
...,...,...
9995,Real,U.S. Supreme Court revival on Trump travel ban...
9996,Real,The global climate negotiations scheduled to t...
9997,Real,Trump FBI nominee's corporate legal work could...
9998,Fake,WATCH: Donald Trump Is An Imbecile Who Doesn’...


In [6]:
# Download the NLTK 'stopwords' corpus so stopwords.words(...) can be used below
nltk.download('stopwords')

# Initialize the Porter stemmer; `ps` is used by the preprocessing loop to stem each token
ps = PorterStemmer()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Preprocess the data: keep letters only, lowercase, drop English stopwords,
# stem every remaining token and re-join the tokens into one string per row.

# Build the stopword set once. The original evaluated
# stopwords.words('english') for every single token and tested membership
# against a list, which dominates the runtime of this cell.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values instead of looking rows up by label with
# df['News'][i]: that lookup only works when the index is exactly 0..len(df)-1
# and raises KeyError as soon as dropna() has removed a row.
for text in df['News']:
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

# One cleaned document per row of df, in row order
corpus = np.array(corpus)



In [8]:
# TF-IDF vectorizer over uni-, bi- and tri-grams, limited to 5000 features
tfidf_v = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

# Fit the vectorizer on the corpus, then convert the result to a dense array
tfidf_matrix = tfidf_v.fit_transform(corpus)
X = tfidf_matrix.toarray()

# Target column (string labels)
y = df['Label']



In [9]:

# Encode the labels as integers: 1 where the label is 'Real', 0 otherwise
y_binary = np.where(y.eq('Real'), 1, 0)

# 3-fold cross-validation splitter, shuffled with a fixed seed
kf = KFold(
    n_splits=3,
    shuffle=True,
    random_state=10,
)



In [11]:

# Create an SVM classifier (re-fitted from scratch on every fold by fit())
model = SVC()
print("SVM Model:  ")


def _print_metrics(header, prefix, y_true, y_hat):
    """Print accuracy, classification report and confusion matrix for one split.

    Parameters
    ----------
    header : str
        Line printed before the metrics (e.g. "Training Metrics: ").
    prefix : str
        Word placed in front of each metric name (e.g. "Training" or "Test").
    y_true : array-like
        Ground-truth labels of the split.
    y_hat : array-like
        Labels predicted by the model for the same rows.
    """
    print(header)
    print(f'{prefix} Accuracy:', accuracy_score(y_true, y_hat))
    print(f'{prefix} Classification Report:')
    print(classification_report(y_true, y_hat))
    print(f'{prefix} Confusion Matrix:')
    print(confusion_matrix(y_true, y_hat))


# The original cell assigned a counter `i = 1` and never used it, so the
# per-fold reports could not be told apart; enumerate() labels each fold.
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print(f"Fold {fold}:")

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_binary[train_index], y_binary[test_index]

    # Train the model on this fold's training rows
    model.fit(X_train, y_train)

    # Predict on the held-out rows and on the rows the model was trained on;
    # the gap between the two reports shows how much the model overfits.
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    _print_metrics("Training Metrics: ", 'Training', y_train, y_train_pred)
    _print_metrics("\n\nTesting Metrics: ", 'Test', y_test, y_pred)



SVM Model:  
SVM Model:  
Training Metrics: 
Training Accuracy: 0.9863486348634863
Training Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3512
           1       0.99      0.98      0.99      3154

    accuracy                           0.99      6666
   macro avg       0.99      0.99      0.99      6666
weighted avg       0.99      0.99      0.99      6666

Training Confusion Matrix:
[[3474   38]
 [  53 3101]]


Testing Metrics: 
Test Accuracy: 0.8593281343731254
Test Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1783
           1       0.85      0.85      0.85      1551

    accuracy                           0.86      3334
   macro avg       0.86      0.86      0.86      3334
weighted avg       0.86      0.86      0.86      3334

Test Confusion Matrix:
[[1545  238]
 [ 231 1320]]
SVM Model:  
Training Metrics: 
Training Accu