In [1]:
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
# Fetch the NLTK resources this notebook relies on: the English stop-word list,
# the tokenizer models used by word_tokenize, and WordNet for the lemmatizer.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases load the
# word_tokenize model from 'punkt_tab' rather than 'punkt'. Requesting both keeps
# the notebook runnable on old and new NLTK versions; nltk.download is expected to
# report (not raise) if a given identifier is unavailable.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load Dataset
# NOTE(review): both paths are absolute locations on one machine; they must be
# edited before the notebook can run anywhere else.
correct_news = pd.read_csv(r"C:\Users\krish\OneDrive\Desktop\Boosting Financial Market Stabality With Fake News Detection\artifacts\True.csv")
fake_news = pd.read_csv(r"C:\Users\krish\OneDrive\Desktop\Boosting Financial Market Stabality With Fake News Detection\artifacts\Fake.csv")

# Add a label column: 1 for correct news, 0 for fake news
correct_news['label'] = 1
fake_news['label'] = 0

# Combine the datasets
data = pd.concat([correct_news, fake_news], ignore_index=True)

# Shuffle the dataset.
# The shuffle is seeded so that the row order is the same on every run. Without a
# seed the order changes per execution, which in turn makes the later seeded
# train/test splits select different rows each time.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the combined dataset
print(data.head())

                                               title  \
0  Obama’s Gas-Guzzling Motorcade To Paris Climat...   
1  Obama paints Trump as no friend of the working...   
2  BREAKING: Obama Administration To Expand Numbe...   
3   Paul Ryan BREAKS DOWN, Admits GOP’s Trumpcare...   
4  Justice Department seeks warrant to seize anci...   

                                                text       subject  \
0  Are you sitting down? The phony baloney Climat...     left-news   
1  PHILADELPHIA (Reuters) - With Hillary Clinton ...  politicsNews   
2  So basically the Obama administration is makin...      politics   
3  The Republican Party has had it rough. Not onl...          News   
4  WASHINGTON (Reuters) - The U.S. Justice Depart...  politicsNews   

                  date  label  
0          Dec 2, 2015      0  
1  September 13, 2016       1  
2         Jan 15, 2016      0  
3       April 27, 2017      0  
4    December 6, 2017       1  


In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Built once at cell execution instead of once per row: the table deletes every
# character in string.punctuation, and the pattern matches runs of digits.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_PATTERN = re.compile(r'\d+')


def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    tokenize with ``word_tokenize``, drop tokens found in ``stop_words``, and
    lemmatize the remaining tokens with ``lemmatizer``.

    Parameters
    ----------
    text : str or missing value
        Raw document text. ``None`` and float NaN (what pandas produces for an
        empty CSV cell) are treated as an empty document; any other non-string
        value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # Missing values would otherwise fail on .lower() with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove digits
    text = _DIGITS_PATTERN.sub('', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Clean every article body with preprocess_text (overwriting the raw 'text'
# column) and preview the first few cleaned rows.
data['text'] = data['text'].map(preprocess_text)
print(data['text'].head())

# Destination files for the cleaned data, one per class
true_news_file = r"C:\Users\krish\OneDrive\Desktop\Boosting Financial Market Stabality With Fake News Detection\artifacts\preprocessed_true.csv"
fake_news_file = r"C:\Users\krish\OneDrive\Desktop\Boosting Financial Market Stabality With Fake News Detection\artifacts\preprocessed_fake.csv"

# Partition the cleaned rows by label: 1 -> true news, 0 -> fake news
true_data = data[data['label'].eq(1)]
fake_data = data[data['label'].eq(0)]

# Write both partitions to disk without the index column
for class_frame, destination in ((true_data, true_news_file), (fake_data, fake_news_file)):
    class_frame.to_csv(destination, index=False)

print(f"Preprocessed true news saved to {true_news_file}")
print(f"Preprocessed fake news saved to {fake_news_file}")

0    sitting phony baloney climate change summit pu...
1    philadelphia reuters hillary clinton sidelined...
2    basically obama administration making easier c...
3    republican party rough admit presidential cand...
4    washington reuters u justice department said w...
Name: text, dtype: object
Preprocessed true news saved to C:\Users\krish\OneDrive\Desktop\Boosting Financial Market Stabality With Fake News Detection\artifacts\preprocessed_true.csv
Preprocessed fake news saved to C:\Users\krish\OneDrive\Desktop\Boosting Financial Market Stabality With Fake News Detection\artifacts\preprocessed_fake.csv


In [4]:
# Two-stage split. First 30% of the rows are held out from training; that
# held-out part is then divided again, with 33% of it becoming the validation
# set and the remainder becoming the test set. Both splits use seed 42.
train_data, temp_data = train_test_split(data, random_state=42, test_size=0.3)
test_data, val_data = train_test_split(temp_data, random_state=42, test_size=0.33)

# Report the (rows, columns) shape of each split
for split_name, split_frame in (
    ('Train', train_data),
    ('Test', test_data),
    ('Validation', val_data),
):
    print(f'{split_name} data size: {split_frame.shape}')

Train data size: (31428, 5)
Test data size: (9024, 5)
Validation data size: (4446, 5)


In [5]:
# Target labels for each split
y_train, y_test, y_val = (
    split_frame['label'] for split_frame in (train_data, test_data, val_data)
)

# TF-IDF features capped at 5000 terms. The vectorizer is fitted on the training
# text only; the test and validation text are transformed with that fitted
# vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['text'])
X_test, X_val = (
    vectorizer.transform(split_frame['text']) for split_frame in (test_data, val_data)
)

In [6]:
# Fit a Logistic Regression classifier (library defaults) on the training features
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predictions for the training and test splits
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

# Print a classification report followed by the confusion matrix, first for the
# training split and then for the test split.
for report_title, true_labels, predicted_labels in (
    ("Logistic Regression - Training Data Classification Report", y_train, y_train_pred),
    ("Logistic Regression - Test Data Classification Report", y_test, y_test_pred),
):
    print(report_title)
    print(classification_report(true_labels, predicted_labels))
    print("Confusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels))

Logistic Regression - Training Data Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16461
           1       0.99      0.99      0.99     14967

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428

Confusion Matrix:
[[16291   170]
 [  122 14845]]
Logistic Regression - Test Data Classification Report
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4732
           1       0.98      0.99      0.98      4292

    accuracy                           0.99      9024
   macro avg       0.99      0.99      0.99      9024
weighted avg       0.99      0.99      0.99      9024

Confusion Matrix:
[[4644   88]
 [  43 4249]]


In [7]:
# Random Forest: 100 trees, seeded with 42, fitted on the training features
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predictions for the training and test splits
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

# Print a classification report followed by the confusion matrix, first for the
# training split and then for the test split.
for report_title, true_labels, predicted_labels in (
    ("Random Forest - Training Data Classification Report", y_train, y_train_pred_rf),
    ("Random Forest - Test Data Classification Report", y_test, y_test_pred_rf),
):
    print(report_title)
    print(classification_report(true_labels, predicted_labels))
    print("Confusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels))

Random Forest - Training Data Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16461
           1       1.00      1.00      1.00     14967

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428

Confusion Matrix:
[[16461     0]
 [    1 14966]]
Random Forest - Test Data Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4732
           1       1.00      1.00      1.00      4292

    accuracy                           1.00      9024
   macro avg       1.00      1.00      1.00      9024
weighted avg       1.00      1.00      1.00      9024

Confusion Matrix:
[[4719   13]
 [   7 4285]]


In [None]:
# SVM
# A linear SVM is trained with LinearSVC rather than SVC(kernel='linear').
# The kernelized SVC solver's fit time grows at least quadratically with the
# number of samples, which is impractical for a training set of this size
# (tens of thousands of rows); LinearSVC is the estimator intended for large
# linear problems. It optimizes a closely related but not identical objective,
# so scores may differ marginally from SVC(kernel='linear').
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# Predictions for the training and test splits
y_train_pred_svm = svm_model.predict(X_train)
y_test_pred_svm = svm_model.predict(X_test)

# Print a classification report followed by the confusion matrix, first for the
# training split and then for the test split.
print("SVM - Training Data Classification Report")
print(classification_report(y_train, y_train_pred_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred_svm))
print("SVM - Test Data Classification Report")
print(classification_report(y_test, y_test_pred_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_svm))