In [2]:
import re
import string
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC, SVC
# Fetch the NLTK resources the preprocessing cell relies on: the English
# stopword list, the Punkt tokenizer data behind word_tokenize, and WordNet
# for the lemmatizer.
# 'punkt_tab' is the tokenizer data format that newer NLTK releases look up
# instead of the pickled 'punkt' models; requesting both keeps word_tokenize
# working across NLTK versions (an unknown id only prints an error on old ones).
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

ModuleNotFoundError: No module named 'pandas'

In [7]:
# Load Dataset
# Single place to point the notebook at the folder holding True.csv / Fake.csv.
DATA_DIR = Path(r"D:\Important Documents\Infosys Spring Boot Internship\Boosting Financial Market Stabality With Fake News Detection\Notebooks\Data")

correct_news = pd.read_csv(DATA_DIR / "True.csv")
fake_news = pd.read_csv(DATA_DIR / "Fake.csv")

# Add a label column: 1 for correct news, 0 for fake news
correct_news['label'] = 1
fake_news['label'] = 0

# Combine the datasets and shuffle the rows.
# random_state pins the shuffle so that the row order -- and therefore the
# train/test/validation membership and every reported metric -- is the same
# after Restart & Run All.
data = (
    pd.concat([correct_news, fake_news], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Display the combined dataset
print(data.head())

                                               title  \
0  HILLARY PAYS Professional Trolls $1 MILLION To...   
1   The Daily Show Absolutely NAILED Absurdity Of...   
2  Senator McCain says Russia hacking probe not i...   
3   Wounded Veteran CONDEMNS Trump For Using Him ...   
4   Donald Trump Storms Glenn Beck’s Ted Cruz Ral...   

                                                text       subject  \
0  It s good to be worth tens of millions of doll...     left-news   
1  When listening to Congress discuss matters of ...          News   
2  WASHINGTON (Reuters) - Republican U.S. Senator...  politicsNews   
3  Donald Trump thought he could get away with us...          News   
4  Awkward does not even come close to describing...          News   

                 date  label  
0        Apr 25, 2016      0  
1   February 24, 2016      0  
2    January 5, 2017       1  
3  September 28, 2017      0  
4   February 24, 2016      0  


In [8]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Built once instead of on every call: the table does not depend on the input
# and preprocess_text runs once per article.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise one article body into a space-separated string of lemmas.

    Steps, in order: lowercase, delete every character in
    ``string.punctuation``, delete runs of digits, tokenize with
    ``word_tokenize``, drop English stopwords, lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (e.g. the NaN pandas produces
        for an empty CSV cell, or None) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` for non-string
        input or when nothing survives filtering.
    """
    if not isinstance(text, str):
        # Without this guard a missing value reaches .lower() and raises
        # AttributeError, aborting the whole column-wide apply.
        return ''

    # Lowercase, then remove punctuation
    text = text.lower().translate(_PUNCT_TABLE)
    # Remove digits
    text = re.sub(r'\d+', '', text)

    # NOTE(review): the sample output shows a label-1 row starting with
    # "WASHINGTON (Reuters) -", which survives this function as the token
    # "reuters". If most real-news rows carry that dateline the classifiers
    # may be keying on it rather than on content -- TODO confirm and consider
    # stripping the dateline before training.

    # Tokenize, remove stopwords and lemmatize
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

# Run the cleaner over every article body and store the result back in the
# 'text' column, which is the column the vectorizer cell reads.
cleaned_text = data['text'].apply(preprocess_text)
data['text'] = cleaned_text
print(cleaned_text.head())


0    good worth ten million dollar discover need va...
1    listening congress discus matter importance la...
2    washington reuters republican u senator john m...
3    donald trump thought could get away using mili...
4    awkward even come close describing cringeworth...
Name: text, dtype: object


In [9]:
# Split the data
# First cut: 70% train / 30% held out. Second cut: the held-out part is
# divided again, 67% becoming the test set and 33% the validation set.
train_data, temp_data = train_test_split(
    data,
    random_state=42,
    test_size=0.3,
)
test_data, val_data = train_test_split(
    temp_data,
    random_state=42,
    test_size=0.33,
)

# Display the size of each split
print(
    f'Train data size: {train_data.shape}',
    f'Test data size: {test_data.shape}',
    f'Validation data size: {val_data.shape}',
    sep='\n',
)

Train data size: (31428, 5)
Test data size: (9024, 5)
Validation data size: (4446, 5)


In [10]:
# Vectorize the text data
# The vocabulary (capped at 5000 terms) and IDF weights are learned from the
# training split only; test and validation are projected onto that same space.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['text'])
X_test, X_val = (
    vectorizer.transform(split['text']) for split in (test_data, val_data)
)

# Targets for each split, in the same order as the feature matrices.
y_train, y_test, y_val = (
    split['label'] for split in (train_data, test_data, val_data)
)

In [11]:
# Initialize and train the Logistic Regression model
# (fit returns the estimator itself, so construction and training chain.)
log_reg = LogisticRegression().fit(X_train, y_train)

# Predict on the data the model was fitted on and on the held-out test split
y_train_pred, y_test_pred = log_reg.predict(X_train), log_reg.predict(X_test)

# Same report layout for both splits: heading, per-class metrics, confusion matrix
for report_title, actual, predicted in (
    ("Logistic Regression - Training Data Classification Report", y_train, y_train_pred),
    ("Logistic Regression - Test Data Classification Report", y_test, y_test_pred),
):
    print(report_title)
    print(classification_report(actual, predicted))
    print("Confusion Matrix:")
    print(confusion_matrix(actual, predicted))

Logistic Regression - Training Data Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16479
           1       0.99      0.99      0.99     14949

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

Confusion Matrix:
[[16301   178]
 [  124 14825]]
Logistic Regression - Test Data Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4736
           1       0.99      0.99      0.99      4288

    accuracy                           0.99      9024
   macro avg       0.99      0.99      0.99      9024
weighted avg       0.99      0.99      0.99      9024

Confusion Matrix:
[[4675   61]
 [  52 4236]]


In [12]:
# Random Forest
# 100 trees with a fixed seed; trained on the TF-IDF training matrix.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predictions for the fitted-on split and for the test split
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

# Training-split report followed by test-split report, one item per line
print(
    "Random Forest - Training Data Classification Report",
    classification_report(y_train, y_train_pred_rf),
    "Confusion Matrix:",
    confusion_matrix(y_train, y_train_pred_rf),
    sep="\n",
)
print(
    "Random Forest - Test Data Classification Report",
    classification_report(y_test, y_test_pred_rf),
    "Confusion Matrix:",
    confusion_matrix(y_test, y_test_pred_rf),
    sep="\n",
)

Random Forest - Training Data Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16479
           1       1.00      1.00      1.00     14949

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

Confusion Matrix:
[[16479     0]
 [    0 14949]]
Random Forest - Test Data Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4736
           1       1.00      1.00      1.00      4288

    accuracy                           1.00      9024
   macro avg       1.00      1.00      1.00      9024
weighted avg       1.00      1.00      1.00      9024

Confusion Matrix:
[[4720   16]
 [   9 4279]]


In [13]:
# SVM
# Requires `SVC` from sklearn.svm in the notebook's import cell; the original
# import cell only brought in LinearSVC, so this cell raised NameError on a
# fresh kernel.
# NOTE(review): LinearSVC is a faster option for a large sparse TF-IDF matrix,
# but it uses a different solver and default loss, so scores may differ
# slightly from the ones recorded below -- switch only deliberately.
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Predictions for the fitted-on split and for the held-out test split
y_train_pred_svm = svm_model.predict(X_train)
y_test_pred_svm = svm_model.predict(X_test)

# Same report layout for both splits: heading, per-class metrics, confusion matrix
for svm_report_title, svm_actual, svm_predicted in (
    ("SVM - Training Data Classification Report", y_train, y_train_pred_svm),
    ("SVM - Test Data Classification Report", y_test, y_test_pred_svm),
):
    print(svm_report_title)
    print(classification_report(svm_actual, svm_predicted))
    print("Confusion Matrix:")
    print(confusion_matrix(svm_actual, svm_predicted))

SVM - Training Data Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16479
           1       1.00      1.00      1.00     14949

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

Confusion Matrix:
[[16410    69]
 [   37 14912]]
SVM - Test Data Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4736
           1       0.99      0.99      0.99      4288

    accuracy                           0.99      9024
   macro avg       0.99      0.99      0.99      9024
weighted avg       0.99      0.99      0.99      9024

Confusion Matrix:
[[4693   43]
 [  23 4265]]
