## Loading DATA

In [None]:
# Import Libraries

import pandas as pd

In [None]:
# The file has no header row and is tab-separated, so pandas assigns the
# default integer column names (0 and 1).
TRAINING_DATA_PATH = '/content/training_data.csv'
training_df = pd.read_csv(TRAINING_DATA_PATH, header=None, sep='\t')


## Exploratory Data analysis

In [None]:
# Preview the first five rows to check how the two columns were parsed
display(training_df.head())

Unnamed: 0,0,1
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [None]:
# Summary statistics for the numeric columns
training_df.describe()

Unnamed: 0,0
count,34152.0
mean,0.485477
std,0.499796
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
# Column dtypes, non-null counts and memory usage
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34152 entries, 0 to 34151
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       34152 non-null  int64 
 1   1       34152 non-null  object
dtypes: int64(1), object(1)
memory usage: 533.8+ KB


In [None]:
# (number of rows, number of columns)
training_df.shape

(34152, 2)

## Data engineering





*   Lower Case




In [None]:
# Lowercase the text column (column 1). Values are cast to str first so the
# .str accessor works on every cell.
lowercased_text = training_df[1].astype(str).str.lower()
training_df[1] = lowercased_text

display(training_df.head())

Unnamed: 0,0,1
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...




*   Remove punctuation and special characters



In [None]:
import re
import string

# Remove punctuation and special characters

def remove_punctuation_and_special_chars(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace.

    Removed characters are deleted, not replaced by a space, so words joined
    only by punctuation are merged (e.g. "year's" becomes "years").
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

# Overwrite the text column (1) with its cleaned version
training_df[1] = training_df[1].apply(remove_punctuation_and_special_chars)

# Show the first rows after cleaning
display(training_df.head())

Unnamed: 0,0,1
0,0,donald trump sends out embarrassing new years ...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obamas name c...
4,0,pope francis just called out donald trump duri...




*   Tokenization




In [None]:
# Import required libraries
import math
import numpy as np


import nltk
# Download the tokenizer resources; nltk.download reports "already up-to-date"
# when a package is present, as the log below shows.
nltk.download('punkt')
nltk.download('punkt_tab') # Added to fix the LookupError
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Tokenization function
def tokenize_text(text):
    """Split a string into word tokens with NLTK's ``word_tokenize``.

    Any non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

# Store the token list for each row in a new column; column 1 keeps the cleaned string
training_df['tokenized_text'] = training_df[1].apply(tokenize_text)

# Show the first rows with the new column
display(training_df.head())

Unnamed: 0,0,1,tokenized_text
0,0,donald trump sends out embarrassing new years ...,"[donald, trump, sends, out, embarrassing, new,..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet..."
3,0,trump is so obsessed he even has obamas name c...,"[trump, is, so, obsessed, he, even, has, obama..."
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru..."




*   Lemmatization



In [None]:
# Make sure the WordNet corpus used for lemmatization is available:
# download it only when nltk cannot find it locally.
try:
    nltk.data.find('corpora/wordnet')
except LookupError: # Changed from nltk.downloader.DownloadError
    nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Initialize the WordNetLemmatizer instance that lemmatize_tokens uses
lemmatizer = WordNetLemmatizer()

# Lemmatization function
def lemmatize_tokens(tokens):
    """Lemmatize every token of a list with the shared ``lemmatizer``.

    Any non-list input yields an empty list.

    NOTE(review): ``lemmatize`` is called without a part-of-speech argument,
    so NLTK falls back to its default (noun). This looks like the reason
    "has" shows up as "ha" in the displayed output; confirm whether that is
    acceptable for the downstream features.
    """
    if not isinstance(tokens, list):
        return []
    return [lemmatizer.lemmatize(token) for token in tokens]


# Lemmatize the token lists into a new column
training_df['lemmatized_text'] = training_df['tokenized_text'].apply(lemmatize_tokens)

# Show the first rows with the new column
display(training_df.head())

Unnamed: 0,0,1,tokenized_text,lemmatized_text
0,0,donald trump sends out embarrassing new years ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, sends, out, embarrassing, new,..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, bragging, trump, staffer, started, rus..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clarke, becomes, an, internet..."
3,0,trump is so obsessed he even has obamas name c...,"[trump, is, so, obsessed, he, even, has, obama...","[trump, is, so, obsessed, he, even, ha, obamas..."
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, just, called, out, donald, tru..."




*   POS tagging



In [None]:
# Imports and tagger resource for POS tagging
# (nltk and word_tokenize were already imported in the tokenization cell)
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
# Last statement of the cell, so its return value (True) is shown as the cell output
nltk.download('averaged_perceptron_tagger_eng') # Added to download the required resource

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
# POS tagging function
def tag_tokens(tokens):
    """Tag a list of tokens with NLTK's ``pos_tag``.

    Returns whatever ``pos_tag`` returns for the list (the output below shows
    (word, tag) pairs); any non-list input yields an empty list.
    """
    if not isinstance(tokens, list):
        return []
    return pos_tag(tokens)


# Tag the lemmatized tokens of every row
training_df['pos_tagged_text'] = training_df['lemmatized_text'].apply(tag_tokens)

print("\nPoS Tagging Result for first entry:")
# Access the first entry's POS tags from the DataFrame column
first_entry_pos_tags = training_df['pos_tagged_text'].iloc[0]
# The loop variable is deliberately named `tag`, not `pos_tag`: a loop variable in
# a notebook cell is a global, so calling it `pos_tag` would replace the imported
# nltk `pos_tag` function with a string and break tag_tokens on any later call
# (for example when this cell is run a second time).
for word, tag in first_entry_pos_tags:
    print(f"{word}: {tag}")

display(training_df.head())


PoS Tagging Result for first entry:
donald: JJ
trump: NN
sends: VBZ
out: RP
embarrassing: VBG
new: JJ
year: NN
eve: VBP
message: NN
this: DT
is: VBZ
disturbing: VBG


Unnamed: 0,0,1,tokenized_text,lemmatized_text,pos_tagged_text
0,0,donald trump sends out embarrassing new years ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, sends, out, embarrassing, new,...","[(donald, JJ), (trump, NN), (sends, VBZ), (out..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, bragging, trump, staffer, started, rus...","[(drunk, NN), (bragging, VBG), (trump, NN), (s..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clarke, becomes, an, internet...","[(sheriff, NN), (david, NN), (clarke, NN), (be..."
3,0,trump is so obsessed he even has obamas name c...,"[trump, is, so, obsessed, he, even, has, obama...","[trump, is, so, obsessed, he, even, ha, obamas...","[(trump, NN), (is, VBZ), (so, RB), (obsessed, ..."
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, just, called, out, donald, tru...","[(pope, NN), (francis, NN), (just, RB), (calle..."




*   Remove stop words from the lemmatized text



In [None]:
# Make sure the stop-word corpus is available: download it only when nltk cannot find it locally
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from nltk.corpus import stopwords

# Get English stop words (stored as a set for the membership tests in remove_stopwords)
stop_words = set(stopwords.words('english'))

# Stop-word removal for plain token lists
def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    Token order is preserved; any non-list input yields an empty list.
    """
    if not isinstance(tokens, list):
        return []
    kept = []
    for word in tokens:
        if word not in stop_words:
            kept.append(word)
    return kept

# Apply the function to the 'lemmatized_text' column.
# Filtering happens after lemmatization, so tokens are compared with the
# stop-word list in their lemmatized form.
training_df['filtered_text'] = training_df['lemmatized_text'].apply(remove_stopwords)

# Display the DataFrame with the new filtered text column
display(training_df.head())

Unnamed: 0,0,1,tokenized_text,lemmatized_text,pos_tagged_text,filtered_text
0,0,donald trump sends out embarrassing new years ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, sends, out, embarrassing, new,...","[(donald, JJ), (trump, NN), (sends, VBZ), (out...","[donald, trump, sends, embarrassing, new, year..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, bragging, trump, staffer, started, rus...","[(drunk, NN), (bragging, VBG), (trump, NN), (s...","[drunk, bragging, trump, staffer, started, rus..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clarke, becomes, an, internet...","[(sheriff, NN), (david, NN), (clarke, NN), (be...","[sheriff, david, clarke, becomes, internet, jo..."
3,0,trump is so obsessed he even has obamas name c...,"[trump, is, so, obsessed, he, even, has, obama...","[trump, is, so, obsessed, he, even, ha, obamas...","[(trump, NN), (is, VBZ), (so, RB), (obsessed, ...","[trump, obsessed, even, ha, obamas, name, code..."
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, just, called, out, donald, tru...","[(pope, NN), (francis, NN), (just, RB), (calle...","[pope, francis, called, donald, trump, christm..."




*   Remove stop words from the POS-tagged text



In [None]:
from nltk.corpus import stopwords

# Ensure stop_words are loaded (they should be from previous steps).
# This rebuilds the same English stop-word set that the previous cell already defined.
stop_words = set(stopwords.words('english'))


def remove_stopwords_from_pos_tags(pos_tagged_tokens):
    """Drop the (word, tag) pairs whose word is in the ``stop_words`` set.

    Pair order is preserved and the tags are left untouched; any non-list
    input yields an empty list.
    """
    if not isinstance(pos_tagged_tokens, list):
        return []
    kept_pairs = []
    for word, tag in pos_tagged_tokens:
        if word not in stop_words:
            kept_pairs.append((word, tag))
    return kept_pairs

# Apply the function to the 'pos_tagged_text' column.
# The tags were assigned before this filtering step, so each kept word carries
# the tag it received with the stop words still in place.
training_df['filtered_pos_tagged_text'] = training_df['pos_tagged_text'].apply(remove_stopwords_from_pos_tags)

# Display the DataFrame with the new filtered POS tagged column
display(training_df.head())

Unnamed: 0,0,1,tokenized_text,lemmatized_text,pos_tagged_text,filtered_text,filtered_pos_tagged_text
0,0,donald trump sends out embarrassing new years ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, sends, out, embarrassing, new,...","[(donald, JJ), (trump, NN), (sends, VBZ), (out...","[donald, trump, sends, embarrassing, new, year...","[(donald, JJ), (trump, NN), (sends, VBZ), (emb..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, bragging, trump, staffer, started, rus...","[(drunk, NN), (bragging, VBG), (trump, NN), (s...","[drunk, bragging, trump, staffer, started, rus...","[(drunk, NN), (bragging, VBG), (trump, NN), (s..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clarke, becomes, an, internet...","[(sheriff, NN), (david, NN), (clarke, NN), (be...","[sheriff, david, clarke, becomes, internet, jo...","[(sheriff, NN), (david, NN), (clarke, NN), (be..."
3,0,trump is so obsessed he even has obamas name c...,"[trump, is, so, obsessed, he, even, has, obama...","[trump, is, so, obsessed, he, even, ha, obamas...","[(trump, NN), (is, VBZ), (so, RB), (obsessed, ...","[trump, obsessed, even, ha, obamas, name, code...","[(trump, NN), (obsessed, JJ), (even, RB), (ha,..."
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, just, called, out, donald, tru...","[(pope, NN), (francis, NN), (just, RB), (calle...","[pope, francis, called, donald, trump, christm...","[(pope, NN), (francis, NN), (called, VBN), (do..."




*   Bag of words



In [None]:
# Turn each token list back into one space-separated string,
# which is the input format the vectorizers below are given.
def _tokens_to_string(tokens):
    """Join a list of tokens with single spaces; non-list input becomes ''."""
    if isinstance(tokens, list):
        return ' '.join(tokens)
    return ''

training_df['joined_filtered_text'] = training_df['filtered_text'].apply(_tokens_to_string)
display(training_df.head())

Unnamed: 0,0,1,tokenized_text,lemmatized_text,pos_tagged_text,filtered_text,filtered_pos_tagged_text,joined_filtered_text
0,0,donald trump sends out embarrassing new years ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, sends, out, embarrassing, new,...","[(donald, JJ), (trump, NN), (sends, VBZ), (out...","[donald, trump, sends, embarrassing, new, year...","[(donald, JJ), (trump, NN), (sends, VBZ), (emb...",donald trump sends embarrassing new year eve m...
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, bragging, trump, staffer, started, rus...","[(drunk, NN), (bragging, VBG), (trump, NN), (s...","[drunk, bragging, trump, staffer, started, rus...","[(drunk, NN), (bragging, VBG), (trump, NN), (s...",drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clarke, becomes, an, internet...","[(sheriff, NN), (david, NN), (clarke, NN), (be...","[sheriff, david, clarke, becomes, internet, jo...","[(sheriff, NN), (david, NN), (clarke, NN), (be...",sheriff david clarke becomes internet joke thr...
3,0,trump is so obsessed he even has obamas name c...,"[trump, is, so, obsessed, he, even, has, obama...","[trump, is, so, obsessed, he, even, ha, obamas...","[(trump, NN), (is, VBZ), (so, RB), (obsessed, ...","[trump, obsessed, even, ha, obamas, name, code...","[(trump, NN), (obsessed, JJ), (even, RB), (ha,...",trump obsessed even ha obamas name coded websi...
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, just, called, out, donald, tru...","[(pope, NN), (francis, NN), (just, RB), (calle...","[pope, francis, called, donald, trump, christm...","[(pope, NN), (francis, NN), (called, VBN), (do...",pope francis called donald trump christmas speech




*   CountVectorizer (bigrams)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer; ngram_range=(2,2) keeps bigrams only (no single words)
bigram_vectorizer = CountVectorizer(ngram_range=(2,2))

# NOTE(review): the vectorizer is fitted on every row, before the train/test split
# made further down, so rows that later end up in the test set also contribute to
# the vocabulary.
traindataset = bigram_vectorizer.fit_transform(training_df['joined_filtered_text'])

# Matrix shape is (rows, features)
print("Shape", traindataset.shape)
print("First 5 feature:")
print(bigram_vectorizer.get_feature_names_out()[:5])

Shape (34152, 167233)
First 5 feature:
['00 young' '0149 gmt' '02 second' '02 yr' '025 percent']




*   TF-IDF





In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer; ngram_range=(1,2) keeps single words and bigrams,
# unlike the bigram-only CountVectorizer above
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Apply TF-IDF to the joined_filtered_text column
# NOTE(review): the vectorizer is fitted on every row, before the train/test split
# made further down, so rows that later end up in the test set also contribute to
# the vocabulary and the IDF weights.
tfidf_dataset = tfidf_vectorizer.fit_transform(training_df['joined_filtered_text'])

# Matrix shape is (rows, features)
print("Shape of TF-IDF matrix:", tfidf_dataset.shape)
print("First 5 TF-IDF feature names:", tfidf_vectorizer.get_feature_names_out()[:5])

Shape of TF-IDF matrix: (34152, 187325)
First 5 TF-IDF feature names: ['00' '00 young' '0149' '0149 gmt' '02']


## Model Creation

Train-test split

In [None]:
# Features: the bigram count matrix from CountVectorizer.
# Target: the label column (column 0).
X, y = traindataset, training_df[0]

print(f"Shape of X (features): {X.shape}")
print(f"Shape of y (target): {y.shape}")

Shape of X (features): (34152, 167233)
Shape of y (target): (34152,)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
count_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = count_split

print("Data split into training and testing sets successfully.")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Data split into training and testing sets successfully.
Shape of X_train: (27321, 167233)
Shape of X_test: (6831, 167233)
Shape of y_train: (27321,)
Shape of y_test: (6831,)


Random forest + CountVectorizer

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Build a random forest (seeded for reproducibility) and fit it on the training
# split in one step; fit() returns the fitted estimator itself.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

print("RandomForestClassifier model trained successfully.")

RandomForestClassifier model trained successfully.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict labels for the held-out rows
y_pred = rf_model.predict(X_test)

# Headline metrics. precision/recall/F1 are called with scikit-learn's defaults,
# i.e. they are reported for label 1 as the positive class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

# Report the metrics to four decimals
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Per-class breakdown
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.7794
Precision: 0.8769
Recall: 0.6323
F1-Score: 0.7348

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.92      0.81      3529
           1       0.88      0.63      0.73      3302

    accuracy                           0.78      6831
   macro avg       0.80      0.77      0.77      6831
weighted avg       0.80      0.78      0.77      6831



The classification report provides some interesting insights into how our Random Forest Classifier is performing:

**Overall accuracy:** The model achieved an accuracy of approximately 77.94%, meaning it correctly classified about 78% of the instances in the test set.

**Class 0 (assumed negative/real):**

*   Precision (0.73): When the model predicts that an instance belongs to Class 0, it is correct about 73% of the time. This means 27% of its Class 0 predictions were actually Class 1.
*   Recall (0.92): The model is very good at identifying actual Class 0 instances, correctly catching about 92% of them. It misses very few true Class 0 instances.
*   F1-Score (0.81): This is a good balance between precision and recall for Class 0, indicating strong performance in this category.

**Class 1 (assumed positive/fake):**

*   Precision (0.88): When the model predicts that an instance belongs to Class 1, it is highly accurate, being correct about 88% of the time.
*   Recall (0.63): This is where the model shows a relative weakness. It only identifies about 63% of the actual Class 1 instances, meaning it misses a significant portion (37%) of them by misclassifying them as Class 0.
*   F1-Score (0.73): This score reflects the imbalance between the high precision and the lower recall for Class 1.

**Key insights:**

*   Bias towards Class 0: The model is more inclined to predict Class 0. It finds most of the true Class 0 instances (high recall for Class 0), but it also labels a number of Class 1 instances as Class 0, which lowers the precision for Class 0 and explains the lower recall for Class 1.
*   Reliable Class 1 predictions: When the model does predict Class 1, it is quite reliable (high precision for Class 1). However, it struggles to identify all actual Class 1 instances.
*   Depending on the specific problem (e.g., is it more critical to catch all fake news, or to ensure any identified fake news is truly fake?), you might prioritize different metrics. For example, if missing fake news (Class 1) is very costly, you might want to improve the recall for Class 1, even if it slightly reduces precision.


*   Train-test split (TF-IDF features)






In [None]:
# Features: the TF-IDF matrix. Target: the label column (column 0).
X_tfidf, y = tfidf_dataset, training_df[0]

print(f"Shape of X_tfidf (features): {X_tfidf.shape}")
print(f"Shape of y (target): {y.shape}")

Shape of X_tfidf (features): (34152, 187325)
Shape of y (target): (34152,)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the TF-IDF rows for testing, with the same test size and seed
# as the CountVectorizer split
tfidf_split = train_test_split(X_tfidf, y, random_state=42, test_size=0.2)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = tfidf_split

print("TF-IDF data split into training and testing sets successfully.")
print(f"Shape of X_train_tfidf: {X_train_tfidf.shape}")
print(f"Shape of X_test_tfidf: {X_test_tfidf.shape}")
print(f"Shape of y_train_tfidf: {y_train_tfidf.shape}")
print(f"Shape of y_test_tfidf: {y_test_tfidf.shape}")

TF-IDF data split into training and testing sets successfully.
Shape of X_train_tfidf: (27321, 187325)
Shape of X_test_tfidf: (6831, 187325)
Shape of y_train_tfidf: (27321,)
Shape of y_test_tfidf: (6831,)




*   RandomForestClassifier + TF-IDF





In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a seeded random forest and fit it on the TF-IDF training split in one
# step; fit() returns the fitted estimator itself.
rf_model_tfidf = RandomForestClassifier(random_state=42).fit(X_train_tfidf, y_train_tfidf)

print("RandomForestClassifier model trained successfully using TF-IDF features.")

RandomForestClassifier model trained successfully using TF-IDF features.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict labels for the held-out TF-IDF rows
y_pred_tfidf = rf_model_tfidf.predict(X_test_tfidf)

# Headline metrics. precision/recall/F1 are called with scikit-learn's defaults,
# i.e. they are reported for label 1 as the positive class.
accuracy_tfidf = accuracy_score(y_true=y_test_tfidf, y_pred=y_pred_tfidf)
precision_tfidf = precision_score(y_true=y_test_tfidf, y_pred=y_pred_tfidf)
recall_tfidf = recall_score(y_true=y_test_tfidf, y_pred=y_pred_tfidf)
f1_tfidf = f1_score(y_true=y_test_tfidf, y_pred=y_pred_tfidf)

print(f"Accuracy: {accuracy_tfidf:.4f}")
print(f"Precision: {precision_tfidf:.4f}")
print(f"Recall: {recall_tfidf:.4f}")
print(f"F1-Score: {f1_tfidf:.4f}")

# Per-class breakdown
print("\nClassification Report (TF-IDF features):")
print(classification_report(y_true=y_test_tfidf, y_pred=y_pred_tfidf))

Accuracy: 0.9272
Precision: 0.9343
Recall: 0.9137
F1-Score: 0.9239

Classification Report (TF-IDF features):
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      3529
           1       0.93      0.91      0.92      3302

    accuracy                           0.93      6831
   macro avg       0.93      0.93      0.93      6831
weighted avg       0.93      0.93      0.93      6831



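The Random Forest Classifier, when trained with TF-IDF features, demonstrated significantly improved performance compared to the CountVectorizer approach. Note that the two feature sets differ in more than the weighting scheme: the CountVectorizer model used bigrams only (`ngram_range=(2,2)`), whereas the TF-IDF model used both unigrams and bigrams (`ngram_range=(1,2)`). Part of the improvement may therefore come from the added unigram features rather than from the TF-IDF weighting itself.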

**Key Metrics:**
*   **Accuracy:** 0.9272
*   **Precision:** 0.9343
*   **Recall:** 0.9137
*   **F1-Score:** 0.9239

**Insights from the Classification Report:**
*   **Overall Performance:** The model achieved an accuracy of approximately 92.72%, which is a substantial improvement over the 77.94% accuracy obtained with CountVectorizer features.
*   **Balanced Performance:** Unlike the previous model, this model performs much more evenly across both classes. Precision and recall for Class 0 and Class 1 are all high and close to each other, indicating the model is not biased towards one class. (As in the previous section, reading Class 0 as real news and Class 1 as fake news is an assumed mapping that this notebook does not verify; confirm it against the dataset documentation.)
    *   For **Class 0 (assumed real news)**:
        *   Precision: `0.92` (92% of the instances predicted as Class 0 actually are Class 0)
        *   Recall: `0.94` (The model correctly identified 94% of all actual Class 0 instances)
        *   F1-Score: `0.93`
    *   For **Class 1 (assumed fake news)**:
        *   Precision: `0.93` (93% of the instances predicted as Class 1 actually are Class 1)
        *   Recall: `0.91` (The model correctly identified 91% of all actual Class 1 instances)
        *   F1-Score: `0.92`


## Model for POS-tag

*   Data preparation



In [None]:
# Join POS-tagged tokens into a single string
def join_pos_tags(pos_tagged_tokens):
    """Render a list of (word, tag) pairs as one space-separated string.

    Each pair becomes ``word_tag`` (for example ``trump_NN``). Any non-list
    input yields an empty string.
    """
    if not isinstance(pos_tagged_tokens, list):
        return ''
    pieces = []
    for word, tag in pos_tagged_tokens:
        pieces.append(f'{word}_{tag}')
    return ' '.join(pieces)

# Build the word_tag strings from the stop-word-filtered POS pairs
training_df['pos_tagged_joined_text'] = training_df['filtered_pos_tagged_text'].apply(join_pos_tags)

# Show the first rows with the new column
display(training_df.head())

Unnamed: 0,0,1,tokenized_text,lemmatized_text,pos_tagged_text,filtered_text,filtered_pos_tagged_text,joined_filtered_text,pos_tagged_joined_text
0,0,donald trump sends out embarrassing new years ...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, sends, out, embarrassing, new,...","[(donald, JJ), (trump, NN), (sends, VBZ), (out...","[donald, trump, sends, embarrassing, new, year...","[(donald, JJ), (trump, NN), (sends, VBZ), (emb...",donald trump sends embarrassing new year eve m...,donald_JJ trump_NN sends_VBZ embarrassing_VBG ...
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, bragging, trump, staffer, started, rus...","[(drunk, NN), (bragging, VBG), (trump, NN), (s...","[drunk, bragging, trump, staffer, started, rus...","[(drunk, NN), (bragging, VBG), (trump, NN), (s...",drunk bragging trump staffer started russian c...,drunk_NN bragging_VBG trump_NN staffer_NN star...
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clarke, becomes, an, internet...","[(sheriff, NN), (david, NN), (clarke, NN), (be...","[sheriff, david, clarke, becomes, internet, jo...","[(sheriff, NN), (david, NN), (clarke, NN), (be...",sheriff david clarke becomes internet joke thr...,sheriff_NN david_NN clarke_NN becomes_VBZ inte...
3,0,trump is so obsessed he even has obamas name c...,"[trump, is, so, obsessed, he, even, has, obama...","[trump, is, so, obsessed, he, even, ha, obamas...","[(trump, NN), (is, VBZ), (so, RB), (obsessed, ...","[trump, obsessed, even, ha, obamas, name, code...","[(trump, NN), (obsessed, JJ), (even, RB), (ha,...",trump obsessed even ha obamas name coded websi...,trump_NN obsessed_JJ even_RB ha_VBZ obamas_JJ ...
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, francis, just, called, out, donald, tru...","[(pope, NN), (francis, NN), (just, RB), (calle...","[pope, francis, called, donald, trump, christm...","[(pope, NN), (francis, NN), (called, VBN), (do...",pope francis called donald trump christmas speech,pope_NN francis_NN called_VBN donald_JJ trump_...




*   CountVectorizer (POS-tagged text)



In [None]:


# Initialize CountVectorizer for POS-tagged text.
# ngram_range=(1,1) counts single word_tag tokens only.
pos_vectorizer = CountVectorizer(ngram_range=(1,1))

# CountVectorizer lowercases its input by default, which is why the tag suffix is
# lowercase in the feature names printed below (e.g. '00_cd').
# NOTE(review): as with the other vectorizers, this one is fitted on every row
# before the train/test split.
pos_traindataset = pos_vectorizer.fit_transform(training_df['pos_tagged_joined_text'])

# Matrix shape is (rows, features)
print("Shape of POS CountVectorizer matrix:", pos_traindataset.shape)
print("First 5 feature names (POS):")
print(pos_vectorizer.get_feature_names_out()[:5])

Shape of POS CountVectorizer matrix: (34152, 30950)
First 5 feature names (POS):
['00_cd' '0149_cd' '025_cd' '02_cd' '0330_cd']




*  Train-test Split



In [None]:


# Features: counts of word_tag tokens. Target: the label column (column 0).
X_pos, y_pos = pos_traindataset, training_df[0]

# Hold out 20% of the rows for testing, with the same test size and seed as the
# other splits
pos_split = train_test_split(X_pos, y_pos, random_state=42, test_size=0.2)
X_train_pos, X_test_pos, y_train_pos, y_test_pos = pos_split

print("POS-tagged data split into training and testing sets successfully.")
print(f"Shape of X_train_pos: {X_train_pos.shape}")
print(f"Shape of X_test_pos: {X_test_pos.shape}")
print(f"Shape of y_train_pos: {y_train_pos.shape}")
print(f"Shape of y_test_pos: {y_test_pos.shape}")

POS-tagged data split into training and testing sets successfully.
Shape of X_train_pos: (27321, 30950)
Shape of X_test_pos: (6831, 30950)
Shape of y_train_pos: (27321,)
Shape of y_test_pos: (6831,)




*   Model Creation



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a seeded random forest and fit it on the POS-tagged training split in
# one step; fit() returns the fitted estimator itself.
rf_model_pos = RandomForestClassifier(random_state=42).fit(X_train_pos, y_train_pos)

print("RandomForestClassifier model trained successfully using POS-tagged features.")

RandomForestClassifier model trained successfully using POS-tagged features.


Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict labels for the held-out POS-tagged rows
y_pred_pos = rf_model_pos.predict(X_test_pos)

# Headline metrics. precision/recall/F1 are called with scikit-learn's defaults,
# i.e. they are reported for label 1 as the positive class.
accuracy_pos = accuracy_score(y_true=y_test_pos, y_pred=y_pred_pos)
precision_pos = precision_score(y_true=y_test_pos, y_pred=y_pred_pos)
recall_pos = recall_score(y_true=y_test_pos, y_pred=y_pred_pos)
f1_pos = f1_score(y_true=y_test_pos, y_pred=y_pred_pos)

print("\n--- Random Forest Classifier Performance (POS-tagged features) ---")
print(f"Accuracy: {accuracy_pos:.4f}")
print(f"Precision: {precision_pos:.4f}")
print(f"Recall: {recall_pos:.4f}")
print(f"F1-Score: {f1_pos:.4f}")

# Per-class breakdown
print("\nClassification Report (POS-tagged features):")
print(classification_report(y_true=y_test_pos, y_pred=y_pred_pos))


--- Random Forest Classifier Performance (POS-tagged features) ---
Accuracy: 0.9073
Precision: 0.9201
Recall: 0.8852
F1-Score: 0.9023

Classification Report (POS-tagged features):
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      3529
           1       0.92      0.89      0.90      3302

    accuracy                           0.91      6831
   macro avg       0.91      0.91      0.91      6831
weighted avg       0.91      0.91      0.91      6831



### Saving the Trained Models

### Naive Bayes Classifier with TF-IDF features

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Build a Multinomial Naive Bayes classifier with default settings and fit it on
# the TF-IDF training split in one step; fit() returns the fitted estimator itself.
nb_model_tfidf = MultinomialNB().fit(X_train_tfidf, y_train_tfidf)

print("Multinomial Naive Bayes model trained successfully using TF-IDF features.")

Multinomial Naive Bayes model trained successfully using TF-IDF features.


### Evaluate Naive Bayes Classifier with TF-IDF features

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict labels for the held-out TF-IDF rows
y_pred_nb_tfidf = nb_model_tfidf.predict(X_test_tfidf)

# Headline metrics. precision/recall/F1 are called with scikit-learn's defaults,
# i.e. they are reported for label 1 as the positive class.
accuracy_nb_tfidf = accuracy_score(y_true=y_test_tfidf, y_pred=y_pred_nb_tfidf)
precision_nb_tfidf = precision_score(y_true=y_test_tfidf, y_pred=y_pred_nb_tfidf)
recall_nb_tfidf = recall_score(y_true=y_test_tfidf, y_pred=y_pred_nb_tfidf)
f1_nb_tfidf = f1_score(y_true=y_test_tfidf, y_pred=y_pred_nb_tfidf)

print("\n--- Multinomial Naive Bayes Classifier Performance (TF-IDF features) ---")
print(f"Accuracy: {accuracy_nb_tfidf:.4f}")
print(f"Precision: {precision_nb_tfidf:.4f}")
print(f"Recall: {recall_nb_tfidf:.4f}")
print(f"F1-Score: {f1_nb_tfidf:.4f}")

# Per-class breakdown
print("\nClassification Report (TF-IDF features):")
print(classification_report(y_true=y_test_tfidf, y_pred=y_pred_nb_tfidf))


--- Multinomial Naive Bayes Classifier Performance (TF-IDF features) ---
Accuracy: 0.9363
Precision: 0.9396
Recall: 0.9279
F1-Score: 0.9337

Classification Report (TF-IDF features):
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      3529
           1       0.94      0.93      0.93      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



### Naive Bayes Classifier with POS-tagged features

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Build a Multinomial Naive Bayes classifier with default settings and fit it on
# the POS-tagged training split in one step; fit() returns the fitted estimator itself.
nb_model_pos = MultinomialNB().fit(X_train_pos, y_train_pos)

print("Multinomial Naive Bayes model trained successfully using POS-tagged features.")

Multinomial Naive Bayes model trained successfully using POS-tagged features.


### Evaluate Naive Bayes Classifier with POS-tagged features

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Predict labels for the POS-tagged test split with the Naive Bayes model
# that was fitted on POS-tagged features
y_pred_nb_pos = nb_model_pos.predict(X_test_pos)

# Compute the four summary metrics in one unpacking step; every scorer is
# called as scorer(true_labels, predicted_labels), in the order listed
accuracy_nb_pos, precision_nb_pos, recall_nb_pos, f1_nb_pos = (
    scorer(y_test_pos, y_pred_nb_pos)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print the summary: banner first, then one metric per line at 4 decimals
print(
    "\n--- Multinomial Naive Bayes Classifier Performance (POS-tagged features) ---",
    f"Accuracy: {accuracy_nb_pos:.4f}",
    f"Precision: {precision_nb_pos:.4f}",
    f"Recall: {recall_nb_pos:.4f}",
    f"F1-Score: {f1_nb_pos:.4f}",
    sep="\n",
)

# Per-class breakdown of the same predictions
print("\nClassification Report (POS-tagged features):")
print(classification_report(y_test_pos, y_pred_nb_pos))


--- Multinomial Naive Bayes Classifier Performance (POS-tagged features) ---
Accuracy: 0.9305
Precision: 0.9249
Recall: 0.9319
F1-Score: 0.9283

Classification Report (POS-tagged features):
              precision    recall  f1-score   support

           0       0.94      0.93      0.93      3529
           1       0.92      0.93      0.93      3302

    accuracy                           0.93      6831
   macro avg       0.93      0.93      0.93      6831
weighted avg       0.93      0.93      0.93      6831



In [None]:
import joblib

# Persist the three fitted Random Forest classifiers to the working directory,
# one .joblib file per feature representation.
# NOTE(review): only the estimators are written here; the visible notebook
# saves just the TF-IDF vectorizer, so confirm the CountVectorizer / POS
# feature extractors are persisted elsewhere before reusing these models.

# Random Forest fitted on CountVectorizer features
joblib.dump(value=rf_model, filename='random_forest_countvectorizer_model.joblib')
print("rf_model (CountVectorizer) saved as 'random_forest_countvectorizer_model.joblib'")

# Random Forest fitted on TF-IDF features
joblib.dump(value=rf_model_tfidf, filename='random_forest_tfidf_model.joblib')
print("rf_model_tfidf (TF-IDF) saved as 'random_forest_tfidf_model.joblib'")

# Random Forest fitted on POS-tagged features
joblib.dump(value=rf_model_pos, filename='random_forest_pos_model.joblib')
print("rf_model_pos (POS-tagged) saved as 'random_forest_pos_model.joblib'")

rf_model (CountVectorizer) saved as 'random_forest_countvectorizer_model.joblib'
rf_model_tfidf (TF-IDF) saved as 'random_forest_tfidf_model.joblib'
rf_model_pos (POS-tagged) saved as 'random_forest_pos_model.joblib'


### Saving the Naive Bayes Trained Models

In [None]:
import joblib

# Persist both fitted Multinomial Naive Bayes classifiers to the working
# directory, one .joblib file per feature representation.

# Naive Bayes fitted on TF-IDF features
joblib.dump(value=nb_model_tfidf, filename='naive_bayes_tfidf_model.joblib')
print("nb_model_tfidf (TF-IDF) saved as 'naive_bayes_tfidf_model.joblib'")

# Naive Bayes fitted on POS-tagged features
joblib.dump(value=nb_model_pos, filename='naive_bayes_pos_model.joblib')
print("nb_model_pos (POS-tagged) saved as 'naive_bayes_pos_model.joblib'")

nb_model_tfidf (TF-IDF) saved as 'naive_bayes_tfidf_model.joblib'
nb_model_pos (POS-tagged) saved as 'naive_bayes_pos_model.joblib'


### Evaluations

* Random Forest Classifier with CountVectorizer Features:
  * Accuracy: 0.7794
  * Precision: 0.8769
  * Recall: 0.6323
  * F1-Score: 0.7348
  * Insight: Good precision but markedly lower recall for class 1, indicating a bias towards predicting class 0.

* Random Forest Classifier with TF-IDF Features:
  * Accuracy: 0.9272
  * Precision: 0.9343
  * Recall: 0.9137
  * F1-Score: 0.9239
  * Insight: A significant improvement over the CountVectorizer features, with high and balanced performance on both classes.

* Random Forest Classifier with POS-tagged Features:
  * Accuracy: 0.9073
  * Precision: 0.9201
  * Recall: 0.8852
  * F1-Score: 0.9023
  * Insight: Performed well; slightly below the TF-IDF Random Forest, but still strong and balanced across metrics.

* Multinomial Naive Bayes Classifier with TF-IDF Features:
  * Accuracy: 0.9363
  * Precision: 0.9396
  * Recall: 0.9279
  * F1-Score: 0.9337
  * Insight: Achieved the highest accuracy of all five models, with excellent and balanced performance.

* Multinomial Naive Bayes Classifier with POS-tagged Features:
  * Accuracy: 0.9305
  * Precision: 0.9249
  * Recall: 0.9319
  * F1-Score: 0.9283
  * Insight: Another strong performer, closely comparable to both the TF-IDF Naive Bayes and the TF-IDF Random Forest.

### Saving the TF-IDF Vectorizer

In [None]:
import joblib

# Persist the fitted TF-IDF vectorizer so the same text-to-feature mapping
# can be reloaded alongside the saved TF-IDF models
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.joblib')
print("TF-IDF Vectorizer saved as 'tfidf_vectorizer.joblib'")

TF-IDF Vectorizer saved as 'tfidf_vectorizer.joblib'


Starting Grid Search Cross-Validation...
Fitting 3 folds for each of 96 candidates, totalling 288 fits

Grid Search completed!
Best parameters found: {'rf__max_depth': 10, 'rf__min_samples_leaf': 1, 'rf__n_estimators': 100, 'tfidf__max_features': 5000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1)}
Best cross-validation accuracy: 1.0000

Accuracy of the best model (on CV data): 1.0000

Classification Report of the best model (on CV data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9984

    accuracy                           1.00      9984
   macro avg       1.00      1.00      1.00      9984
weighted avg       1.00      1.00      1.00      9984

