In [7]:
#Install the necessary library

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

Task 1 -- Data Loading and Exploring

In [3]:
# Load the datasets
true_df = pd.read_csv('True.csv')
fake_df = pd.read_csv('Fake.csv')

# Add a label column to the datasets
true_df['label'] = 1
fake_df['label'] = 0

# Combine the datasets
df = pd.concat([true_df, fake_df])

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# Explore the data
print(df.head())
print(df.info())
print(df['label'].value_counts())

                                               title  \
0  DEMOCRAT PLAN TO INFILTRATE TRADITIONALLY RED ...   
1   SNL’s ‘Bern Your Enthusiasm’ Explains How Ber...   
2   Trump’s Deputy Campaign Manager Just Blamed H...   
3  Russia's Lavrov says hopes Syrian congress to ...   
4  Trump names six U.S. governors as agricultural...   

                                                text       subject  \
0  The fundamental transformation of America El S...      politics   
1  After this past Monday s Iowa Caucus where the...          News   
2  As Donald Trump continues to sink with female ...          News   
3  MOSCOW (Reuters) - Russian Foreign Minister Se...     worldnews   
4  CHICAGO (Reuters) - U.S. Republican presidenti...  politicsNews   

                 date  label  
0        Jul 27, 2015      0  
1    February 7, 2016      0  
2  September 28, 2016      0  
3   November 7, 2017       1  
4    August 16, 2016       1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 

Task 2 -- Data Preprocessing(
                    Data cleaning,
                    Missing Values Handling,
                    Data Tokenization,
                    Stop Word removal,
                    Apply Stemming and Lemmatization,
                    Data Vectorization using TF-IDF)


In [4]:
# Data cleaning
def clean_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return text

df['text'] = df['text'].apply(clean_text)

# Missing Values Handling
df.isnull().sum()

# Data Tokenization
df['text'] = df['text'].apply(word_tokenize)

# Stop Word removal
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])

# Apply Stemming and Lemmatization
nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

df['text'] = df['text'].apply(lambda x: [stemmer.stem(word) for word in x])
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df['text'] = df['text'].apply(lambda x: ' '.join(x))

# Data Vectorization using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['text']).toarray()
y = df['label']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...


Task 3 -- Model Training(Logistic Regression, Random Forest, Decision Tree, Gradient Boosting and use Hyperparameters tuning)


In [8]:
# Train-Test Split1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Model Training with Hyperparameter Tuning

# Logistic Regression
param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train, y_train)
best_lr = grid_search_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test)

# Random Forest
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, 30]}
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Decision Tree
param_grid_dt = {'max_depth': [10, 20, 30], 'min_samples_split': [2, 5, 10]}
grid_search_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)
best_dt = grid_search_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)

# Gradient Boosting
param_grid_gb = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]}
grid_search_gb = GridSearchCV(GradientBoostingClassifier(), param_grid_gb, cv=5, scoring='accuracy')
grid_search_gb.fit(X_train, y_train)
best_gb = grid_search_gb.best_estimator_
y_pred_gb = best_gb.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

Task 4 -- Model Evaluation

In [None]:
# Model Evaluation
def evaluate_model(y_test, y_pred, model_name):
    print(f"Model: {model_name}")
    print(f"Best Parameters: {grid_search_lr.best_params_ if model_name == 'Logistic Regression' else grid_search_rf.best_params_ if model_name == 'Random Forest' else grid_search_dt.best_params_ if model_name == 'Decision Tree' else grid_search_gb.best_params_}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

evaluate_model(y_test, y_pred_lr, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_dt, "Decision Tree")
evaluate_model(y_test, y_pred_gb, "Gradient Boosting")

# Compare the performance of all models
models = ['Logistic Regression', 'Random Forest', 'Decision Tree', 'Gradient Boosting']
accuracies = [accuracy_score(y_test, y_pred_lr), 
              accuracy_score(y_test, y_pred_rf), 
              accuracy_score(y_test, y_pred_dt), 
              accuracy_score(y_test, y_pred_gb)]

performance_df = pd.DataFrame({'Model': models, 'Accuracy': accuracies})
print(performance_df)