In [2]:
# Import necessary libraries for data analysis and preprocessing
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import pickle
import joblib
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import nltk
from sklearn.cluster import KMeans


In [4]:
#mounting drive 
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
#Load dataset
df = pd.read_csv("/content/drive/MyDrive/ML Project/Datasets for the Detection.csv", delimiter='\t')


# Change the column name
new_col_names = {'ï»¿Label': 'Label'}
df = df.rename(columns=new_col_names)


# Drop an irrelevant column from the dataset
df.drop(['Unnamed: 2'], axis=1, inplace=True)

# Create a copy of the dataset
df2 = df.copy()

# Remove rows with missing values from the dataset
df = df.dropna()

# Reset the index of the dataset
df2.reset_index(inplace=True)



In [6]:
# Download stopwords
nltk.download('stopwords')

# Initialize stemmer
ps = PorterStemmer()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Preprocess the data
corpus = []
for i in range(len(df)):

    review = re.sub('[^a-zA-Z]', ' ', df['News'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)




In [8]:
# Create a TF-IDF vectorizer
tfidf_v = TfidfVectorizer(max_features=5000, ngram_range=(1,3))

# Vectorize the corpus
X = tfidf_v.fit_transform(corpus).toarray()
y = df['Label']



In [10]:
# Convert labels to binary classes
y_binary = np.where(y == 'Real', 1, 0)

# Perform k-fold cross-validation with k=3
kfold = KFold(n_splits=3, shuffle=True, random_state=0)

In [11]:
i=1
# Create a K-means classifier
model = KMeans(n_clusters=2)
print("K-Means Model:  ")


for train_index, test_index in kfold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_binary[train_index], y_binary[test_index]

    # Train the model
    model.fit(X_train)

    # Predict cluster labels for the test set
    y_pred_test = model.predict(X_test)
    # Predict cluster labels for the test set
    y_pred_train = model.predict(X_train)

    # Convert cluster labels to binary classes
    y_pred_test_binary = np.where(y_pred_test == 1, 1, 0)
    # Convert cluster labels to binary classes
    y_pred_train_binary = np.where(y_pred_train == 1, 1, 0)

    print("Fold:  ",i)
    print("Training Metrics: ")
    # Print the training metrics
    # Evaluate the model
    print('Train Accuracy:', accuracy_score(y_train, y_pred_train_binary))
    print('Train Classification Report:')
    print(classification_report(y_train, y_pred_train_binary))
    print('Train Confusion Matrix:')
    print(confusion_matrix(y_train, y_pred_train_binary))
    # Calculate ROC AUC
    roc_auc = roc_auc_score(y_train, y_pred_train_binary)
    print('Train ROC AUC:', roc_auc)

    print("\n\nTesting Metrics: ")
    # Print the testing metrics
    print('Test Accuracy:', accuracy_score(y_test, y_pred_test_binary))
    print('Test Classification Report:')
    print(classification_report(y_test, y_pred_test_binary))
    print('Test Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred_test_binary))
    # Calculate ROC AUC
    roc_auc = roc_auc_score(y_test, y_pred_test_binary)
    print('Test ROC AUC:', roc_auc)

    i=i+1


K-Means Model:  




Fold:   1
Training Metrics: 
Train Accuracy: 0.47040238138757606
Train Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.68      0.57     20019
           1       0.41      0.24      0.30     18278

    accuracy                           0.47     38297
   macro avg       0.45      0.46      0.44     38297
weighted avg       0.45      0.47      0.44     38297

Train Confusion Matrix:
[[13644  6375]
 [13907  4371]]
Train ROC AUC: 0.4603462373837096


Testing Metrics: 
Test Accuracy: 0.47616063502010547
Test Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.69      0.58     10016
           1       0.42      0.24      0.31      9133

    accuracy                           0.48     19149
   macro avg       0.46      0.47      0.44     19149
weighted avg       0.46      0.48      0.45     19149

Test Confusion Matrix:
[[6905 3111]
 [6920 2213]]
Test ROC AUC: 0.46585253914551



Fold:   2
Training Metrics: 
Train Accuracy: 0.527743687495104
Train Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.31      0.41     19933
           1       0.51      0.76      0.61     18364

    accuracy                           0.53     38297
   macro avg       0.55      0.54      0.51     38297
weighted avg       0.55      0.53      0.50     38297

Train Confusion Matrix:
[[ 6265 13668]
 [ 4418 13946]]
Train ROC AUC: 0.5368617601485124


Testing Metrics: 
Test Accuracy: 0.5278604626873465
Test Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.32      0.42     10102
           1       0.50      0.76      0.60      9047

    accuracy                           0.53     19149
   macro avg       0.55      0.54      0.51     19149
weighted avg       0.55      0.53      0.50     19149

Test Confusion Matrix:
[[3227 6875]
 [2166 6881]]
Test ROC AUC: 0.5400126567965522




Fold:   3
Training Metrics: 
Train Accuracy: 0.5257976917854719
Train Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.32      0.41     20118
           1       0.50      0.76      0.60     18180

    accuracy                           0.53     38298
   macro avg       0.55      0.54      0.51     38298
weighted avg       0.55      0.53      0.50     38298

Train Confusion Matrix:
[[ 6338 13780]
 [ 4381 13799]]
Train ROC AUC: 0.5370310793381754


Testing Metrics: 
Test Accuracy: 0.5314393148109463
Test Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.32      0.41      9917
           1       0.51      0.76      0.61      9231

    accuracy                           0.53     19148
   macro avg       0.55      0.54      0.51     19148
weighted avg       0.55      0.53      0.51     19148

Test Confusion Matrix:
[[3150 6767]
 [2205 7026]]
Test ROC AUC: 0.5393836768480305

K-Means Model:  
Training Metrics: 
Train Accuracy: 0.4741709461223779
Train Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.64      0.56     24029
           1       0.42      0.29      0.34     21927

    accuracy                           0.47     45956
   macro avg       0.46      0.47      0.45     45956
weighted avg       0.46      0.47      0.46     45956

Train Confusion Matrix:
[[15475  8554]
 [15611  6316]]
Train ROC AUC: 0.46603009206110035


Testing Metrics: 
Test Accuracy: 0.4820713664055701
Test Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.66      0.57      6006
           1       0.44      0.29      0.35      5484

    accuracy                           0.48     11490
   macro avg       0.47      0.47      0.46     11490
weighted avg       0.47      0.48      0.46     11490

Test Confusion Matrix:
[[3935 2071]
 [3880 1604]]
Test ROC AUC: 0.47383269