In [16]:

import pandas as pd
import numpy as np
import warnings
import os

import nltk


from nltk.stem import WordNetLemmatizer # to lemmatize the words
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet # to get the wordnet pos tags
from nltk.corpus import stopwords # to remove the stopwords
from sklearn.feature_extraction.text import CountVectorizer # to create a bag of words

# Machine Learning
from sklearn.cluster import KMeans
from kneed import KneeLocator
from sklearn.metrics import silhouette_score

<h1 style="color:blue;">Data Extracting</h1>

In [17]:
current_path = os.getcwd()
data_path = os.path.join(current_path, 'dataset')
data_csv = os.path.join(data_path, 'training_data.csv')

In [18]:
data = pd.read_csv(data_csv, sep='\t')

In [19]:
df_1 = data.copy()

<h1 style="color:blue;">Data Cleaning</h1>

In [20]:
# snake_columns
def snake_columns(df):
	if any(column.isupper() for column in df.columns):
		df.columns = [column.strip().lower().replace(' ', '_').replace('-', '_') for column in df.columns]
	return df

In [21]:
df_1.rename(columns={df_1.columns[0]: 'label', df_1.columns[1]: 'headline'}, inplace=True)
print(df_1.head())

   label                                           headline
0      0  drunk bragging trump staffer started russian c...
1      0  sheriff david clarke becomes an internet joke ...
2      0  trump is so obsessed he even has obama‚s name ...
3      0  pope francis just called out donald trump duri...
4      0  racist alabama cops brutalize black boy while ...


In [22]:
df_1.duplicated().sum()
df_1.drop_duplicates(inplace=True)
df_1.shape


(32205, 2)

In [23]:
df_1.head()

Unnamed: 0,label,headline
0,0,drunk bragging trump staffer started russian c...
1,0,sheriff david clarke becomes an internet joke ...
2,0,trump is so obsessed he even has obama‚s name ...
3,0,pope francis just called out donald trump duri...
4,0,racist alabama cops brutalize black boy while ...


<h1 style="color:blue;">PreProcessing</h1>

In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    return ' '.join(tokens)

[nltk_data] Downloading package punkt to /Users/selinwork/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/selinwork/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h1 style="color:blue;">Train-Test Split</h1>

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_1['headline'], df_1['label'], test_size=0.2, random_state=42)
 

In [26]:
X_train_processed = X_train.apply(preprocess_text)

<h1 style="color:blue;">Tfid Vectorizer</h1>

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train_processed)

In [28]:
X_test_processed = X_test.apply(preprocess_text)
X_test_vectorized = vectorizer.transform(X_test_processed)

<h1 style="color:blue;">Model: Logistic Regression</h1>

In [29]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
accuracy = model.score(X_test_vectorized, y_test)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.9310666045645086


<h1 style="color:blue;">Adding ngrams</h1>

In [30]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train_processed)
X_test_vectorized = vectorizer.transform(X_test_processed)

In [31]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
accuracy = model.score(X_test_vectorized, y_test)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.9321533923303835


In [32]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(LogisticRegression(), X_train_vectorized, y_train, cv=5)
print(scores)
print(f"Mean Accuracy: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})")

[0.9280031  0.93071997 0.92664467 0.9280031  0.92488354]
Mean Accuracy: 0.93 (+/- 0.00)


In [33]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
train_accuracy = model.score(X_train_vectorized, y_train)
test_accuracy = model.score(X_test_vectorized, y_test)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

Training Accuracy: 0.95
Testing Accuracy: 0.93


<h1 style="color:blue;">Model: Random Forest Classifier</h1>

In [34]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train_vectorized, y_train)
train_accuracy = model.score(X_train_vectorized, y_train)
test_accuracy = model.score(X_test_vectorized, y_test)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

Training Accuracy: 1.00
Testing Accuracy: 0.91


In [35]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Logistic Regression Model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_vectorized, y_train)
y_pred_logistic = logistic_model.predict(X_test_vectorized)

print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logistic))

# Random Forest Classifier with Hyperparameter Tuning
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=3, n_jobs=-1, verbose=2)
rf_grid_search.fit(X_train_vectorized, y_train)

best_rf_model = rf_grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test_vectorized)

print("Random Forest Classifier Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Display Best Hyperparameters for Random Forest
print("Best Parameters for Random Forest:", rf_grid_search.best_params_)

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3232
           1       0.92      0.94      0.93      3209

    accuracy                           0.93      6441
   macro avg       0.93      0.93      0.93      6441
weighted avg       0.93      0.93      0.93      6441

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samp

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Hyperparameters for Random Forest Classifier
rf_param_grid = {
    'n_estimators': [300],
    'max_depth': [None],
    'min_samples_split': [10],
    'min_samples_leaf': [1]
}

# Initialize and train the Random Forest model
model = RandomForestClassifier(
    n_estimators=rf_param_grid['n_estimators'][0],
    max_depth=rf_param_grid['max_depth'][0],
    min_samples_split=rf_param_grid['min_samples_split'][0],
    min_samples_leaf=rf_param_grid['min_samples_leaf'][0]
)
model.fit(X_train_vectorized, y_train)

# Calculate accuracy for training and test sets
train_accuracy = model.score(X_train_vectorized, y_train)
test_accuracy = model.score(X_test_vectorized, y_test)

# Predict test set labels
y_pred_rf = model.predict(X_test_vectorized)

# Print results
print("Random Forest Classifier Training Accuracy:", train_accuracy)
print("Random Forest Classifier Test Accuracy:", test_accuracy)
print("\nRandom Forest Classifier Classification Report:")
print(classification_report(y_test, y_pred_rf))



Random Forest Classifier Training Accuracy: 0.998563887595094
Random Forest Classifier Test Accuracy: 0.9163173420276355

Random Forest Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3232
           1       0.91      0.92      0.92      3209

    accuracy                           0.92      6441
   macro avg       0.92      0.92      0.92      6441
weighted avg       0.92      0.92      0.92      6441

