In [47]:
import numpy as np 
import pandas as pd 
import itertools
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

import string as st
import re       
import nltk

from nltk import PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt

import os 

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


In [48]:
data = pd.read_csv("Article_Data.csv")

In [49]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,author,statement,date,source,target
0,0,Maria Briceño,Los incendios en California “han llegado a Tij...,"13, January 2025",Facebook posts,false
1,1,Jeff Cercone,Video shows people in Los Angeles looting from...,"13, January 2025",Instagram posts,pants-fire
2,2,Sofia Ahmed,A photo shows a Christian’s home that “miracul...,"13, January 2025",Facebook posts,false
3,3,Ciara O'Rourke,Image shows former President Barack Obama was ...,"13, January 2025",Threads posts,pants-fire
4,4,Maria Briceño,Esta imagen muestra los incendios en Californi...,"13, January 2025",Facebook posts,false
5,5,Madison Czopek,“Blue items that survive” California wildfires...,"10, January 2025",Threads posts,false
6,6,Maria Briceño,Imagen muestra el letrero de Hollywood en Cali...,"10, January 2025",Facebook posts,false
7,7,Loreben Tuquero,Video shows a man saving a bunny during the 20...,"10, January 2025",Facebook posts,false
8,8,Loreben Tuquero,Image shows the Hollywood sign was on fire as ...,"10, January 2025",Social Media,false
9,9,Ciara O'Rourke,Video shows Dr. Mehmet Oz and rapper Snoop Dog...,"10, January 2025",Viral image,false


In [50]:
unique_labels = list(set(data['target']))
print(unique_labels)

['full-flop', 'half-true', 'barely-true', 'false', 'true', 'pants-fire', 'mostly-true']


In [51]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target column
data['target_encoded'] = label_encoder.fit_transform(data['target'])

# Get the mapping of labels to encoded values
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print("Label Mapping:", label_mapping)


Label Mapping: {'barely-true': np.int64(0), 'false': np.int64(1), 'full-flop': np.int64(2), 'half-true': np.int64(3), 'mostly-true': np.int64(4), 'pants-fire': np.int64(5), 'true': np.int64(6)}


Preprocessing the statements

In [52]:
# Drop the unnecessary columns
data_tfidf = data.drop(columns=['author', 'date', 'source', 'target'])

# Optionally reset the index
data_tfidf.reset_index(drop=True, inplace=True)

# Display the resulting DataFrame
print(data_tfidf.head(5))

   Unnamed: 0                                          statement  \
0           0  Los incendios en California “han llegado a Tij...   
1           1  Video shows people in Los Angeles looting from...   
2           2  A photo shows a Christian’s home that “miracul...   
3           3  Image shows former President Barack Obama was ...   
4           4  Esta imagen muestra los incendios en Californi...   

   target_encoded  
0               1  
1               5  
2               1  
3               5  
4               1  


In [53]:
#1.1 Removing non alphabetic characters
def rem_punct(text):
    return ("".join([ch for ch in text if ch not in st.punctuation])) 

data_tfidf['statement'] = data_tfidf['statement'].apply(lambda x: rem_punct(x))
print(data_tfidf.head())

   Unnamed: 0                                          statement  \
0           0  Los incendios en California “han llegado a Tij...   
1           1  Video shows people in Los Angeles looting from...   
2           2  A photo shows a Christian’s home that “miracul...   
3           3  Image shows former President Barack Obama was ...   
4           4  Esta imagen muestra los incendios en Californi...   

   target_encoded  
0               1  
1               5  
2               1  
3               5  
4               1  


In [54]:
#1.2 Tokenization and lowercase
def tokenize(text):
    text = re.split('\s+',text) # \s+ denotes whitespace characters. so we are splitting based on \s+ seperator for the whole (max times to split) of text column
    return[x.lower() for x in text]

data_tfidf['statement'] = data_tfidf['statement'].apply(lambda x: tokenize(x))
print(data_tfidf.head())

   Unnamed: 0                                          statement  \
0           0  [los, incendios, en, california, “han, llegado...   
1           1  [video, shows, people, in, los, angeles, looti...   
2           2  [a, photo, shows, a, christian’s, home, that, ...   
3           3  [image, shows, former, president, barack, obam...   
4           4  [esta, imagen, muestra, los, incendios, en, ca...   

   target_encoded  
0               1  
1               5  
2               1  
3               5  
4               1  


  text = re.split('\s+',text) # \s+ denotes whitespace characters. so we are splitting based on \s+ seperator for the whole (max times to split) of text column


In [55]:

#1.3 Stopword & Smallword removal
def rem_small(text):
    return[word for word in text if len(word)>3]
data_tfidf['statement'] = data_tfidf['statement'].apply(lambda x: rem_small(x))
#print(data_tfidf.head())


def rem_stopword(text):
    return[word for word in text if word not in nltk.corpus.stopwords.words('english')]
data_tfidf['statement'] = data_tfidf['statement'].apply(lambda x: rem_stopword(x))
print(data_tfidf.head())

   Unnamed: 0                                          statement  \
0           0  [incendios, california, “han, llegado, tijuana...   
1           1  [video, shows, people, angeles, looting, homes...   
2           2  [photo, shows, christian’s, home, “miraculousl...   
3           3  [image, shows, former, president, barack, obam...   
4           4  [esta, imagen, muestra, incendios, california,...   

   target_encoded  
0               1  
1               5  
2               1  
3               5  
4               1  


In [44]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
#1.4 Morphological analysis - Lemmatization
def lemm(text):
    word_net = nltk.WordNetLemmatizer()
    return[word_net.lemmatize(word) for word in text]
data_tfidf['statement'] = data_tfidf['statement'].apply(lambda x: lemm(x))

print(data_tfidf.head(),data_tfidf['statement'].shape)

   Unnamed: 0                                          statement  \
0           0  [incendios, california, “han, llegado, tijuana...   
1           1  [video, show, people, angeles, looting, home, ...   
2           2  [photo, show, christian’s, home, “miraculously...   
3           3  [image, show, former, president, barack, obama...   
4           4  [esta, imagen, muestra, incendios, california,...   

   target_encoded  
0               1  
1               5  
2               1  
3               5  
4               1   (1020,)


In [63]:
# Ensure each row in 'statements' is a string
data_tfidf['statement'] = data_tfidf['statement'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)

# Extract the text column for vectorization
statements = data_tfidf['statement']

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Fit and transform the statements
X_tfidf = tfidf.fit_transform(statements)

# Convert the TF-IDF matrix to a DataFrame for visualization
import pandas as pd
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
print(tfidf_df.head())


   0134   02  100  1000  100000  10game  10year  1174  1200  1300  ...  \
0   0.0  0.0  0.0   0.0     0.0     0.0     0.0   0.0   0.0   0.0  ...   
1   0.0  0.0  0.0   0.0     0.0     0.0     0.0   0.0   0.0   0.0  ...   
2   0.0  0.0  0.0   0.0     0.0     0.0     0.0   0.0   0.0   0.0  ...   
3   0.0  0.0  0.0   0.0     0.0     0.0     0.0   0.0   0.0   0.0  ...   
4   0.0  0.0  0.0   0.0     0.0     0.0     0.0   0.0   0.0   0.0  ...   

   zapeta  zarcillos  zelenskyy  zero  zerotolerance  zillow  zone  \
0     0.0        0.0        0.0   0.0            0.0     0.0   0.0   
1     0.0        0.0        0.0   0.0            0.0     0.0   0.0   
2     0.0        0.0        0.0   0.0            0.0     0.0   0.0   
3     0.0        0.0        0.0   0.0            0.0     0.0   0.0   
4     0.0        0.0        0.0   0.0            0.0     0.0   0.0   

   zuckerberg  épico  última  
0         0.0    0.0     0.0  
1         0.0    0.0     0.0  
2         0.0    0.0     0.0  
3         

Training the model

In [64]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data['target_encoded'], test_size=0.2, random_state=42)

print("Training size:", X_train.shape)
print("Testing size:", X_test.shape)


Training size: (816, 3707)
Testing size: (204, 3707)


In [66]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
model1 = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model1.fit(X_train, y_train)

# Predict on the test set
y_pred1 = model1.predict(X_test)




In [67]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
model2 = RandomForestClassifier(n_estimators=100, random_state=42)
model2.fit(X_train, y_train)

# Predict on the test set
y_pred2 = model2.predict(X_test)


Evaluating the model

In [69]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [71]:
# Calculate accuracy
accuracy_logistic = accuracy_score(y_test, y_pred1)
accuracy_random_forest = accuracy_score(y_test, y_pred2)
print("Accuracy_logistic:", accuracy_logistic)
print("Accuracy_random_forest:", accuracy_random_forest)

Accuracy_logistic: 0.6911764705882353
Accuracy_random_forest: 0.6764705882352942


In [72]:
# Classification report
print("Classification Report Logistic:\n", classification_report(y_test, y_pred1))
print("Classification Report Random Forest:\n", classification_report(y_test, y_pred2))

Classification Report Logistic:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.69      0.99      0.82       142
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         6
           5       0.00      0.00      0.00        32
           6       0.00      0.00      0.00         5

    accuracy                           0.69       204
   macro avg       0.12      0.17      0.14       204
weighted avg       0.48      0.69      0.57       204

Classification Report Random Forest:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.69      0.96      0.80       142
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         6
           5       0.29      0.06      0.10        32
           6       0.00      0.00      0.00         5

    ac

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [73]:
# Confusion matrix
print("Confusion Matrix Logistic:\n", confusion_matrix(y_test, y_pred1))
print("Confusion Matrix Logistic:\n", confusion_matrix(y_test, y_pred2))

Confusion Matrix Logistic:
 [[  0  12   0   0   0   0]
 [  0 141   0   0   1   0]
 [  0   7   0   0   0   0]
 [  0   6   0   0   0   0]
 [  0  32   0   0   0   0]
 [  0   5   0   0   0   0]]
Confusion Matrix Logistic:
 [[  0  12   0   0   0   0]
 [  1 136   0   0   5   0]
 [  0   7   0   0   0   0]
 [  0   6   0   0   0   0]
 [  0  30   0   0   2   0]
 [  0   5   0   0   0   0]]


save the model

In [74]:
import joblib

# Save the model
joblib.dump(model1, 'fake_news_model_logistic.pkl')
joblib.dump(model2, 'fake_news_model_random_forest.pkl')


# Save the TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

making predictions on new data

In [75]:
# Load the saved model and vectorizer
model1 = joblib.load('fake_news_model_logistic.pkl')
model2 = joblib.load('fake_news_model_random_forest.pkl')
tfidf = joblib.load('tfidf_vectorizer.pkl')

# Example new statement
new_statement = ["This is an example of a news article."]

# Transform the new statement into TF-IDF features
new_statement_tfidf = tfidf.transform(new_statement)

# Predict the target
predicted_label = model1.predict(new_statement_tfidf)
print("Predicted label:", predicted_label)


Predicted label: [1]


In [76]:
# Invert the label_mapping dictionary for decoding
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Predict the label
predicted_label_encoded = model1.predict(new_statement_tfidf)[0]  # Get the encoded label
predicted_label = reverse_label_mapping[predicted_label_encoded]  # Decode the label

print("Predicted label (decoded):", predicted_label)


Predicted label (decoded): false
