In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import sys
import os

PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.preprocessing import clean_text
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

fake_df = pd.read_csv('../data/raw/True.csv')
true_df = pd.read_csv('../data/raw/True.csv')

print(f"Fake news shape: {fake_df.shape}")
print(f"True news shape: {true_df.shape}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fake news shape: (21417, 4)
True news shape: (21417, 4)


In [2]:
fake_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [3]:
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
true_df.info()
fake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [5]:
true_df.describe()


Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


In [6]:

fake_df.describe()

Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


In [7]:
print("--- Missing Values in True News ---")
print(true_df.isnull().sum())

print("\n--- Missing Values in Fake News ---")
print(fake_df.isnull().sum())

--- Missing Values in True News ---
title      0
text       0
subject    0
date       0
dtype: int64

--- Missing Values in Fake News ---
title      0
text       0
subject    0
date       0
dtype: int64


In [8]:

print(f"True news rows: {len(true_df)}")
print(f"Fake news rows: {len(fake_df)}")

print("\nFinal Null Count in True:")
print(true_df.isnull().sum().sum())

True news rows: 21417
Fake news rows: 21417

Final Null Count in True:
0


In [9]:
# 1. Label the data (1 for real, 0 for fake)
true_df['label'] = 1
fake_df['label'] = 0

df = pd.concat([fake_df, true_df], axis=0)
df = df[["text", "label"]]

print("Dataset Merged and Shuffled!")
print(df.head())

Dataset Merged and Shuffled!
                                                text  label
0  WASHINGTON (Reuters) - The head of a conservat...      0
1  WASHINGTON (Reuters) - Transgender people will...      0
2  WASHINGTON (Reuters) - The special counsel inv...      0
3  WASHINGTON (Reuters) - Trump campaign adviser ...      0
4  SEATTLE/WASHINGTON (Reuters) - President Donal...      0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42834 entries, 0 to 21416
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    42834 non-null  object
 1   label   42834 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1003.9+ KB


DATA PREPROCESSING

In [11]:
df.sample(10)

Unnamed: 0,text,label
12669,BERLIN (Reuters) - German Foreign Minister Sig...,0
12076,BERLIN (Reuters) - Germany s Social Democrats ...,1
8771,In a speech weighted with America’s complicate...,1
4508,WASHINGTON (Reuters) - Democrats on Monday cor...,0
14823,LONDON (Reuters) - Scotland s First Minister N...,0
20399,BERLIN (Reuters) - Foreign Minister Sigmar Gab...,1
4660,WASHINGTON (Reuters) - U.S. House of Represent...,1
6427,(Reuters) - New Jersey Governor Chris Christie...,1
2708,WASHINGTON (Reuters) - The United States on Mo...,0
3850,WASHINGTON (Reuters) - Russian Foreign Ministe...,1


In [12]:
df.isnull().sum()
df = df.dropna()

In [13]:
df.duplicated().sum()
df=df.drop_duplicates(keep='first')

In [14]:
df.value_counts()

text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [15]:
df['label'].value_counts()

label
0    21192
1    21192
Name: count, dtype: int64

In [16]:
df['label'].replace({0: 'fake', 1: 'true'})
df['label'].value_counts()

label
0    21192
1    21192
Name: count, dtype: int64

In [17]:
def transform_text(text):  #text provide krenge
    text = text.lower()     #lower case me convert krega
    return text

In [18]:
transform_text('Hi how are you' )

'hi how are you'

In [19]:
def transform_text(text):  #text provide krenge
    text = text.upper()     #lower case me convert krega
    return text

transform_text('Hi how are you' )

'HI HOW ARE YOU'

In [20]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)  #tokenization
    
    return text

transform_text('Hi how are you' )

['hi', 'how', 'are', 'you']

In [22]:
df['text'][100]

100    WASHINGTON (Reuters) - Democratic Senator Eliz...
100    WASHINGTON (Reuters) - Democratic Senator Eliz...
Name: text, dtype: object

In [23]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text) #tokenization
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)   #append krega 
    return y

transform_text("I'm 90%%% gonna be home soon and i don't want to talk ")
transform_text('Hi how are you Subject: re : doctor approved pill lgw  a man endowed with a 7 - 8 " hammer is simply  better equipped than a man with a 5 - 6 " hammer .' )

['hi',
 'how',
 'are',
 'you',
 'subject',
 're',
 'doctor',
 'approved',
 'pill',
 'lgw',
 'a',
 'man',
 'endowed',
 'with',
 'a',
 '7',
 '8',
 'hammer',
 'is',
 'simply',
 'better',
 'equipped',
 'than',
 'a',
 'man',
 'with',
 'a',
 '5',
 '6',
 'hammer']

In [None]:
df['text'] = df['text'].apply(clean_text)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

print("Accuracy:", model.score(X_test_vec, y_test))


In [None]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC()
}

for name, model in models.items():
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    acc = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc:.4f}")


In [None]:
best_model = LinearSVC()
best_model.fit(X_train_vec, y_train)

joblib.dump(best_model, "../models/classifier.pkl")
joblib.dump(vectorizer, "../models/tfidf_vector.pkl")
