In [1]:
import pandas as pd
import os
import sys
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Add project root to path
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Import FAST cleaner
from src.preprocessing import clean_text_fast  #cleaning time reduce.


In [2]:

fake_df = pd.read_csv('../data/raw/True.csv')
true_df = pd.read_csv('../data/raw/True.csv')

print(f"Fake news shape: {fake_df.shape}")
print(f"True news shape: {true_df.shape}")

print(fake_df.head())
print(fake_df.columns)
print(fake_df['text'].apply(type).value_counts())  # Important

Fake news shape: (21417, 4)
True news shape: (21417, 4)
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   
Index(['title', 'text', 'subject', 'date']

In [3]:
true_df.info()
fake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [4]:
true_df.describe()


Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


In [5]:

fake_df.describe()

Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


In [6]:
print("--- Missing Values in True News ---")
print(true_df.isnull().sum())

print("\n--- Missing Values in Fake News ---")
print(fake_df.isnull().sum())

--- Missing Values in True News ---
title      0
text       0
subject    0
date       0
dtype: int64

--- Missing Values in Fake News ---
title      0
text       0
subject    0
date       0
dtype: int64


In [7]:

print(f"True news rows: {len(true_df)}")
print(f"Fake news rows: {len(fake_df)}")

print("\nFinal Null Count in True:")
print(true_df.isnull().sum().sum())

True news rows: 21417
Fake news rows: 21417

Final Null Count in True:
0


In [8]:
# 1. Label the data (1 for real, 0 for fake)
true_df['label'] = 1
fake_df['label'] = 0

# Combine the datasets
df = pd.concat([fake_df, true_df], axis=0, ignore_index=True)

# Keep only text & label
df = df[['text', 'label']]

# Shuffle
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head())

                                                text  label
0  MEXICO CITY (Reuters) - Twelve people died and...      0
1  WASHINGTON (Reuters) - Special counsel Robert ...      1
2  MANILA (Reuters) - Philippine commandos and la...      0
3  WASHINGTON (Reuters) - The United States will ...      0
4  GENEVA (Reuters) - The U.N. refugee agency urg...      0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42834 entries, 0 to 42833
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    42834 non-null  object
 1   label   42834 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 669.4+ KB


DATA PREPROCESSING

In [10]:
df.sample(10)

Unnamed: 0,text,label
7036,WASHINGTON/LONDON (Reuters) - Republican presi...,0
10257,SAN FRANCISCO (Reuters) - Apple Inc (AAPL.O) C...,1
11101,WASHINGTON (Reuters) - U.S. Republican Senator...,0
11440,NAIROBI (Reuters) - Kenyan President Uhuru Ken...,0
9425,SEOUL/BEIJING (Reuters) - Chinese President Xi...,0
21468,NEW YORK (Reuters) - Americans are increasing...,0
15542,MOSCOW (Reuters) - President Vladimir Putin wa...,1
8136,JERUSALEM (Reuters) - Jewish ultra-Orthodox de...,0
6256,TALLINN (Reuters) - German Chancellor Angela M...,1
35827,WASHINGTON (Reuters) - U.S. President Barack O...,1


In [11]:
df.isnull().sum()
df = df.dropna()

In [12]:
df.duplicated().sum()
df=df.drop_duplicates(keep='first')

In [13]:
df.value_counts()

text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [14]:
df['label'].value_counts()

label
0    21192
1    21192
Name: count, dtype: int64

In [15]:
df['label'].replace({0: 'fake', 1: 'true'})
df['label'].value_counts()

label
0    21192
1    21192
Name: count, dtype: int64

In [16]:
def transform_text(text):  #text provide krenge
    text = text.lower()     #lower case me convert krega
    return text

In [17]:
transform_text('Hi how are you' )

'hi how are you'

In [18]:
def transform_text(text):  #text provide krenge
    text = text.upper()     #lower case me convert krega
    return text

transform_text('Hi how are you' )

'HI HOW ARE YOU'

In [19]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)  #tokenization
    
    return text

transform_text('Hi how are you' )

['hi', 'how', 'are', 'you']

In [21]:
df['text'][100]

'BERLIN (Reuters) - Leading figures from Angela Merkel s conservatives and the Social Democrats (SPD) outlined differing visions on how their possible government would approach immigration, as Germany s would-be coalition partners prepared for talks in the New Year.    Chancellor Merkel s 2015 decision to open the doors to more than a million migrants, many fleeing war in the Middle East, transformed Germany s demographic landscape and boosted the far right, hurting her bloc and the SPD in September s election. In separate interviews, Thomas Strobl, deputy leader of Merkel s Christian Democrats (CDU) and SPD foreign minister Sigmar Gabriel outlined ways of winning back disenchanted supporters. Strobl told the Heilbronner Stimme newspaper Germany should cap the number of new immigrants at 65,000 a year, the level in 2012, and far below the limit of 200,000 that the conservatives had previously advocated. But Gabriel, whose party s restive membership would be unlikely to accept such a dr

In [22]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text) #tokenization
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)   #append krega 
    return y

transform_text("I'm 90%%% gonna be home soon and i don't want to talk ")
transform_text('Hi how are you Subject: re : doctor approved pill lgw  a man endowed with a 7 - 8 " hammer is simply  better equipped than a man with a 5 - 6 " hammer .' )

['hi',
 'how',
 'are',
 'you',
 'subject',
 're',
 'doctor',
 'approved',
 'pill',
 'lgw',
 'a',
 'man',
 'endowed',
 'with',
 'a',
 '7',
 '8',
 'hammer',
 'is',
 'simply',
 'better',
 'equipped',
 'than',
 'a',
 'man',
 'with',
 'a',
 '5',
 '6',
 'hammer']

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


In [26]:
print(df["text"].head())
print(df["text"].apply(type).value_counts())


0    MEXICO CITY (Reuters) - Twelve people died and...
1    WASHINGTON (Reuters) - Special counsel Robert ...
2    MANILA (Reuters) - Philippine commandos and la...
3    WASHINGTON (Reuters) - The United States will ...
4    GENEVA (Reuters) - The U.N. refugee agency urg...
Name: text, dtype: object
text
<class 'str'>    42384
Name: count, dtype: int64


In [27]:
def extract_text(x):
    if isinstance(x, dict) and 'text' in x:
        return x['text']
    return x

df['text'] = df['text'].apply(extract_text)


In [29]:
df['text'] = df['text'].astype(str)


In [30]:
df['text'] = df['text'].apply(clean_text_fast)


In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [35]:
from src.preprocessing import clean_text_fast
df['text'] = df['text'].apply(clean_text_fast)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic Regression classifier
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Optionally evaluate
accuracy = model.score(X_test_vec, y_test)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.271204435531438


In [42]:
import joblib
import os

# Create a folder for models if it doesn't exist
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, os.path.join(MODEL_DIR, "tfidf_vector.pkl"))

# Save the trained Logistic Regression model
joblib.dump(model, os.path.join(MODEL_DIR, "classifier.pkl"))

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

print("Accuracy:", model.score(X_test_vec, y_test))


Accuracy: 0.27486138964256224


In [None]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC()
}

for name, model in models.items():
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    acc = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc:.4f}")


Logistic Regression Accuracy: 0.2749
Naive Bayes Accuracy: 0.3209
SVM Accuracy: 0.2510


In [None]:
best_model = LinearSVC()
best_model.fit(X_train_vec, y_train)

joblib.dump(best_model, "../models/classifier.pkl")
joblib.dump(vectorizer, "../models/tfidf_vector.pkl")


['../models/tfidf_vector.pkl']

In [None]:
%time df = clean_text_fast(df)


CPU times: total: 4.25 s
Wall time: 5.34 s
