In [1]:
# Import necessary libraries for data analysis and preprocessing
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import pickle
import joblib
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression

import re
import nltk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [2]:
#mounting drive 
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#Load dataset
df = pd.read_csv("/content/drive/MyDrive/ML Project/Datasets for the Detection.csv", delimiter='\t')


# Change the column name
new_col_names = {'ï»¿Label': 'Label'}
df = df.rename(columns=new_col_names)


# Drop an irrelevant column from the dataset
df.drop(['Unnamed: 2'], axis=1, inplace=True)

# Create a copy of the dataset
df2 = df.copy()

# Remove rows with missing values from the dataset
df = df.dropna()

# Reset the index of the dataset
df2.reset_index(inplace=True)



In [4]:
# Download stopwords
nltk.download('stopwords')

# Initialize stemmer
ps = PorterStemmer()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Preprocess the data
corpus = []
for i in range(len(df)):

    review = re.sub('[^a-zA-Z]', ' ', df['News'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)




In [6]:
# Create a TF-IDF vectorizer
tfidf_v = TfidfVectorizer(max_features=5000, ngram_range=(1,3))

# Vectorize the corpus
X = tfidf_v.fit_transform(corpus).toarray()
y = df['Label']



In [7]:
# Convert labels to binary classes
y_binary = np.where(y == 'Real', 1, 0)

# Perform k-fold cross-validation with k=3
kfold = KFold(n_splits=3, shuffle=True, random_state=0)

In [8]:
# Create and train the logistic regression model
model = LogisticRegression()

print("PCA Model:  ")
i=1
for train_index, test_index in kfold.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_binary[train_index], y_binary[test_index]

    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=1000)


    # Fit PCA on the training data and transform both training and testing data
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Normalize the data
    scaler = StandardScaler()
    X_train_pca = scaler.fit_transform(X_train_pca)
    X_test_pca = scaler.transform(X_test_pca)


    model.fit(X_train_pca, y_train)

    # Predict the labels for the test data
    y_pred = model.predict(X_test_pca)

    # Predict the labels for the training data
    y_train_pred = model.predict(X_train_pca)

    print("Fold:  ",i)

    print("PCA Model:  ")
    print("Training Metrics: ")
    # Print the training metrics
    print('Training Accuracy:', accuracy_score(y_train, y_train_pred))
    print('Training Classification Report:')
    print(classification_report(y_train, y_train_pred))
    print('Training Confusion Matrix:')
    print(confusion_matrix(y_train, y_train_pred))

    # Calculate ROC AUC
    roc_auc = roc_auc_score(y_train, y_train_pred)
    print('Test ROC AUC:', roc_auc)


    print("\n\nTesting Metrics: ")


    # Evaluate the model
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))

    # Calculate ROC AUC
    roc_auc = roc_auc_score(y_test, y_pred)
    print('Test ROC AUC:', roc_auc)

    i=i+1


PCA Model:  
Fold:   1
PCA Model:  
Training Metrics: 
Training Accuracy: 0.8766744131394104
Training Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     20019
           1       0.87      0.87      0.87     18278

    accuracy                           0.88     38297
   macro avg       0.88      0.88      0.88     38297
weighted avg       0.88      0.88      0.88     38297

Training Confusion Matrix:
[[17614  2405]
 [ 2318 15960]]
Test ROC AUC: 0.876522501129125


Testing Metrics: 
Accuracy: 0.8633348999947777
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87     10016
           1       0.86      0.86      0.86      9133

    accuracy                           0.86     19149
   macro avg       0.86      0.86      0.86     19149
weighted avg       0.86      0.86      0.86     19149

Confusion Matrix:
[[8716 1300]
 [1317 7816]]
Test ROC AUC: 