In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split, KFold 
from sklearn.preprocessing import LabelEncoder 
import matplotlib.pyplot as plt 
import pickle 
import joblib 
from sklearn.metrics import mean_squared_error
# Perform text preprocessing using NLTK library
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import nltk


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The dataset loaded later in this notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset. The file is tab-separated despite its .csv extension.
df = pd.read_csv("/content/drive/MyDrive/ML Project/Datasets for the Detection.csv", delimiter='\t')

# The first header cell arrives with stray leading characters; rename it to plain 'Label'.
new_col_names = {'ï»¿Label': 'Label'}
df = df.rename(columns=new_col_names)

# Drop the irrelevant 'Unnamed: 2' column (no in-place mutation, so the cell can be re-run safely).
df = df.drop(columns=['Unnamed: 2'])

# Keep a copy of the frame as it was before missing rows are removed.
df2 = df.copy()

# Remove rows with missing values, then rebuild a contiguous 0..n-1 index on the
# SAME frame. Resetting the index on the copy instead leaves gaps in `df`'s index
# whenever a row is dropped, which breaks any later positional-style lookup such
# as df['News'][i] for i in range(len(df)).
df = df.dropna().reset_index(drop=True)



In [None]:
# Porter stemmer instance used when normalising tokens during preprocessing
ps = PorterStemmer()

# Fetch the NLTK stop-word corpus (reported as already up to date if present)
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Preprocess the data: keep letters only, lowercase, drop English stop words,
# stem what is left, and join the tokens back into one string per article.

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# comprehension re-creates the list for every single token and tests membership
# against a list; a set built up front gives the same result far faster.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly. Looking rows up as df['News'][i] is a
# label lookup and raises KeyError as soon as the index is not exactly 0..n-1
# (for example after rows have been dropped without resetting the index).
for text in df['News']:
    tokens = non_letters.sub(' ', text).lower().split()
    corpus.append(' '.join(ps.stem(word) for word in tokens if word not in stop_words))




In [11]:
# Target column: the raw string labels, one per article
y = df['Label']

# TF-IDF over unigrams through trigrams, capped at 5000 features
tfidf_v = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

# Learn the vocabulary from the cleaned corpus and materialise a dense feature matrix
X = tfidf_v.fit_transform(corpus).toarray()

In [12]:
# 3-fold splitter; shuffling with a fixed seed keeps the folds reproducible
kf = KFold(random_state=0, shuffle=True, n_splits=3)

# Encode the labels as integers: 'Real' becomes 1, every other value becomes 0
y_binary = np.where(y.eq('Real'), 1, 0)


In [14]:
def _print_split_metrics(header, label, y_true, y_hat):
    """Print accuracy, classification report, confusion matrix and MSE for one split.

    Parameters
    ----------
    header : str
        Line printed before the metrics (e.g. "Training Metrics: ").
    label : str
        Prefix used on every metric line (e.g. "Training" or "Test").
    y_true : array-like
        Ground-truth class labels for the split.
    y_hat : array-like
        Predicted class labels for the split.
    """
    print(header)
    print(f'{label} Accuracy:', accuracy_score(y_true, y_hat))
    print(f'{label} Classification Report:')
    print(classification_report(y_true, y_hat))
    print(f'{label} Confusion Matrix:')
    print(confusion_matrix(y_true, y_hat))
    # NOTE: in this notebook both arguments are 0/1 labels, so this MSE is simply
    # the misclassification rate (1 - accuracy), not the regression error.
    print(f'{label} Mean Squared Error (MSE):', mean_squared_error(y_true, y_hat))


print("Linear Regression Model:  ")
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Split the dataset into training and testing sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_binary[train_index], y_binary[test_index]

    # Train the linear regression model on the 0/1 encoded labels
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # Turn the continuous regression output into class labels with a 0.5 cut-off
    y_pred = np.where(regressor.predict(X_test) >= 0.5, 1, 0)         # held-out fold
    y_train_pred = np.where(regressor.predict(X_train) >= 0.5, 1, 0)  # training folds

    print("Fold:  ", fold)
    # Each metric is computed once, inside the helper, for the split being reported
    _print_split_metrics("Training Metrics: ", 'Training', y_train, y_train_pred)
    _print_split_metrics("\n\nTesting Metrics: ", 'Test', y_test, y_pred)

Linear Regression Model:  
Fold:   1
Training Metrics: 
Training Accuracy: 0.9098623913100243
Training Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91     20019
           1       0.91      0.90      0.91     18278

    accuracy                           0.91     38297
   macro avg       0.91      0.91      0.91     38297
weighted avg       0.91      0.91      0.91     38297

Training Confusion Matrix:
[[18317  1702]
 [ 1750 16528]]
Training Mean Squared Error (MSE): 0.09013760868997571


Testing Metrics: 
Test Accuracy: 0.8610893519243825
Test Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87     10016
           1       0.85      0.85      0.85      9133

    accuracy                           0.86     19149
   macro avg       0.86      0.86      0.86     19149
weighted avg       0.86      0.86      0.86     19149

Test Confusion Matrix:
[[