In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder 
import matplotlib.pyplot as plt 
import pickle 
import joblib 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
# Perform text preprocessing using NLTK library
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import nltk


In [2]:
# Load the tab-separated dataset.
df = pd.read_csv("/content/Datasets for the Detection.csv", delimiter='\t')

# Normalise the header of the label column. The stray prefix is presumably a
# byte-order mark that was decoded with the wrong encoding -- TODO confirm
# against the raw file.
new_col_names = {'ï»¿Label': 'Label'}
df = df.rename(columns=new_col_names)

# Drop the irrelevant 'Unnamed: 2' column (returns a new frame instead of
# mutating in place, so re-running the cell stays predictable).
df = df.drop(columns=['Unnamed: 2'])

# Snapshot of the data *before* rows with missing values are removed.
# reset_index() returns a new frame and keeps the old index in an 'index'
# column, exactly as the previous copy() + reset_index(inplace=True) did.
df2 = df.reset_index()

# Remove rows with missing values and renumber the remaining rows 0..n-1.
# Previously the reset was applied to df2 only, which left gaps in df's index
# and made positional-looking lookups such as df['News'][i] fail with KeyError
# as soon as a single row had been dropped.
df = df.dropna().reset_index(drop=True)



In [3]:
# Download the NLTK 'stopwords' corpus so that stopwords.words('english')
# can be used by the preprocessing cell.
nltk.download('stopwords')

# Create the Porter stemmer instance (`ps`) that the preprocessing cell uses
# to stem each remaining word.
ps = PorterStemmer()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Preprocess the data: build one cleaned, stemmed string per row of df['News'].

# Look the English stop-word list up once and keep it as a set. The previous
# version called stopwords.words('english') for every single token, which
# rebuilt the list and scanned it linearly each time.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly instead of df['News'][i]: that
# label-based lookup with i in 0..len(df)-1 raises KeyError whenever df's
# index has gaps (for example after dropna() without an index reset).
for text in df['News']:
    # Replace every non-letter with a space, lower-case, and tokenise.
    words = non_letters.sub(' ', text).lower().split()
    # Drop stop words and stem what is left.
    review = ' '.join(ps.stem(word) for word in words if word not in stop_words)
    corpus.append(review)

corpus = np.array(corpus)



In [5]:
# TF-IDF vectorizer over 1- to 3-grams, with the vocabulary capped at 5000 features.
tfidf_v = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

# Fit on the whole corpus, then densify the result into a NumPy array.
# NOTE(review): the vectorizer is fitted before the cross-validation split, so
# rows later used as fold test data also contribute to the fitted vocabulary
# and weights -- confirm this is acceptable for the reported metrics.
tfidf_matrix = tfidf_v.fit_transform(corpus)
X = tfidf_matrix.toarray()

# Target column (string labels), aligned row-for-row with the corpus.
y = df['Label']



In [6]:
# Encode the target as integers: 1 where the label is 'Real', 0 for anything else.
is_real = (y == 'Real')
y_binary = np.where(is_real, 1, 0)

# 3-fold cross-validation splitter; shuffling is seeded so the folds are reproducible.
kf = KFold(shuffle=True, random_state=0, n_splits=3)


In [7]:
def _report_split(header, label, model, features, y_true, y_hat):
    """Print the evaluation metrics of one data split and return its ROC AUC.

    Parameters
    ----------
    header : str
        Line printed before the metrics (e.g. "Training Metrics: ").
    label : str
        Prefix used in the metric captions (e.g. 'Training' or 'Test').
    model : fitted classifier exposing ``predict_proba``.
    features : feature rows of the split, passed to ``predict_proba``.
    y_true : true binary labels of the split.
    y_hat : predicted class labels of the split.

    Returns
    -------
    The ROC AUC computed from the second column of ``predict_proba``.
    """
    print(header)
    print(label + ' Accuracy:', accuracy_score(y_true, y_hat))
    print(label + ' Classification Report:')
    print(classification_report(y_true, y_hat))
    print(label + ' Confusion Matrix:')
    print(confusion_matrix(y_true, y_hat))

    # ROC AUC is computed from the second predict_proba column, not from the hard labels.
    split_auc = roc_auc_score(y_true, model.predict_proba(features)[:, 1])
    print('ROC AUC:', split_auc)
    return split_auc


print("Logistic Regression Model:  ")

# enumerate() replaces the hand-maintained fold counter.
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Split the dataset into training and testing sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_binary[train_index], y_binary[test_index]

    # Create and train a fresh logistic regression model for this fold
    logreg_model = LogisticRegression()
    logreg_model.fit(X_train, y_train)

    # predict() already returns the hard class labels (0/1), so the former
    # `np.where(... >= 0.5, 1, 0)` thresholding was a no-op and is removed.
    y_pred = logreg_model.predict(X_test)
    y_train_pred = logreg_model.predict(X_train)

    print("Fold:   ", i)

    # Same report for both splits; roc_auc ends up holding the test value,
    # as it did before.
    roc_auc = _report_split("Training Metrics: ", 'Training',
                            logreg_model, X_train, y_train, y_train_pred)
    roc_auc = _report_split("\n\nTesting Metrics: ", 'Test',
                            logreg_model, X_test, y_test, y_pred)

Logistic Regression Model:  
Fold:    1
Training Metrics: 
Training Accuracy: 0.8983732407238165
Training Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     20019
           1       0.89      0.90      0.89     18278

    accuracy                           0.90     38297
   macro avg       0.90      0.90      0.90     38297
weighted avg       0.90      0.90      0.90     38297

Training Confusion Matrix:
[[18041  1978]
 [ 1914 16364]]
ROC AUC: 0.9631624385108575


Testing Metrics: 
Test Accuracy: 0.8727870907096976
Test Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     10016
           1       0.87      0.87      0.87      9133

    accuracy                           0.87     19149
   macro avg       0.87      0.87      0.87     19149
weighted avg       0.87      0.87      0.87     19149

Test Confusion Matrix:
[[8783 1233]
 [1203 7930]]