In [48]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier



In [49]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [50]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [51]:
# Load stopwords
def load_stopwords(path):
    with open(path, "r", encoding="utf-8") as f:
        return set([w.strip() for w in f.readlines() if w.strip()])

hindi_stop = load_stopwords(r"C:\Users\vinot\Projects\Fake News Detection\data\stopwords\hindi_stopwords.txt")
tamil_stop = load_stopwords(r"C:\Users\vinot\Projects\Fake News Detection\data\stopwords\Tamil-Stopwords.txt")

english_stop = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")


In [52]:
def multilingual_preprocess(text):
    try:
        lang = detect(text)
    except:
        lang = "en"
    text = re.sub(r"[^a-zA-Z\u0900-\u097F\u0B80-\u0BFF ]", " ", str(text))  # English + Hindi + Tamil
    words = text.lower().split()
    if lang == "en":
        words = [stemmer.stem(w) for w in words if w not in english_stop]
    elif lang == "hi":
        words = [w for w in words if w not in hindi_stop]
    elif lang == "ta":
        words = [w for w in words if w not in tamil_stop]
    return " ".join(words)


In [53]:
# English
fake_dataset = pd.read_csv(r"C:\Users\vinot\Projects\multilingual fake news\data\english\Fake.csv")
true_dataset = pd.read_csv(r"C:\Users\vinot\Projects\multilingual fake news\data\english\True.csv")
fake_dataset["label"] = 0
true_dataset["label"] = 1
english_df = pd.concat([fake_dataset, true_dataset], ignore_index=True)
english_df["content"] = english_df["title"] + " " + english_df["text"]
english_df["language"] = "english"
                      

In [54]:
fake_dataset.shape
true_dataset.shape

(21417, 5)

In [55]:
true_dataset.shape

(21417, 5)

In [56]:
true_dataset.head

<bound method NDFrame.head of                                                    title  \
0      As U.S. budget fight looms, Republicans flip t...   
1      U.S. military to accept transgender recruits o...   
2      Senior U.S. Republican senator: 'Let Mr. Muell...   
3      FBI Russia probe helped by Australian diplomat...   
4      Trump wants Postal Service to charge 'much mor...   
...                                                  ...   
21412  'Fully committed' NATO backs new U.S. approach...   
21413  LexisNexis withdrew two products from Chinese ...   
21414  Minsk cultural hub becomes haven from authorities   
21415  Vatican upbeat on possibility of Pope Francis ...   
21416  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text       subject  \
0      WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1      WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2      WASHINGTON (Reuters)

In [57]:
fake_dataset.head

<bound method NDFrame.head of                                                    title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
23476  McPain: John McCain Furious That Iran Treated ...   
23477  JUSTICE? Yahoo Settles E-mail Privacy Class-ac...   
23478  Sunnistan: US and Allied ‘Safe Zone’ Plan to T...   
23479  How to Blow $700 Million: Al Jazeera America F...   
23480  10 U.S. Navy Sailors Held by Iranian Military ...   

                                                    text      subject  \
0      Donald Trump just couldn t wish all Americans ...         News   
1      House Intelligence Committee Chairman Devin Nu...         News   
2      On Friday, it was revea

In [58]:
fake_dataset.head

<bound method NDFrame.head of                                                    title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
23476  McPain: John McCain Furious That Iran Treated ...   
23477  JUSTICE? Yahoo Settles E-mail Privacy Class-ac...   
23478  Sunnistan: US and Allied ‘Safe Zone’ Plan to T...   
23479  How to Blow $700 Million: Al Jazeera America F...   
23480  10 U.S. Navy Sailors Held by Iranian Military ...   

                                                    text      subject  \
0      Donald Trump just couldn t wish all Americans ...         News   
1      House Intelligence Committee Chairman Devin Nu...         News   
2      On Friday, it was revea

In [59]:
fake_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [60]:
true_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [61]:
# Hindi
hindi_train = pd.read_csv(r"C:\Users\vinot\Projects\Fake News Detection\data\hindi\Train-Dataset-Example.csv")
hindi_test = pd.read_csv(r"C:\Users\vinot\Projects\Fake News Detection\data\hindi\Test-Dataset-Example.csv")
hindi_df = pd.concat([hindi_train, hindi_test], ignore_index=True)
hindi_df = hindi_df.rename(columns={"news": "content", "label": "label"})
hindi_df["language"] = "hindi"

In [62]:
# Tamil
tamil_df = pd.read_csv(r"C:\Users\vinot\Projects\Fake News Detection\data\tamil\TamilFakeAndReal.csv")
tamil_df = tamil_df.rename(columns={"news": "content", "label": "label"})
tamil_df["language"] = "tamil"


In [63]:
news = pd.concat([english_df, hindi_df, tamil_df], ignore_index=True)


In [64]:
news.shape

(59483, 13)

In [65]:
news.head

<bound method NDFrame.head of                                                    title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
59478                                                NaN   
59479                                                NaN   
59480                                                NaN   
59481                                                NaN   
59482                                                NaN   

                                                    text subject  \
0      Donald Trump just couldn t wish all Americans ...    News   
1      House Intelligence Committee Chairman Devin Nu...    News   
2      On Friday, it was revealed that former

In [66]:
news.head()

Unnamed: 0.1,title,text,subject,date,label,content,language,Unnamed: 0,id,author,English,Tamil,Is Fake
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0.0,Donald Trump Sends Out Embarrassing New Year’...,english,,,,,,
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0.0,Drunk Bragging Trump Staffer Started Russian ...,english,,,,,,
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0.0,Sheriff David Clarke Becomes An Internet Joke...,english,,,,,,
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0.0,Trump Is So Obsessed He Even Has Obama’s Name...,english,,,,,,
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0.0,Pope Francis Just Called Out Donald Trump Dur...,english,,,,,,


In [67]:
news = news.drop(columns=['date'], errors='ignore')


In [68]:
news.head()

Unnamed: 0.1,title,text,subject,label,content,language,Unnamed: 0,id,author,English,Tamil,Is Fake
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,0.0,Donald Trump Sends Out Embarrassing New Year’...,english,,,,,,
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,0.0,Drunk Bragging Trump Staffer Started Russian ...,english,,,,,,
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,0.0,Sheriff David Clarke Becomes An Internet Joke...,english,,,,,,
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,0.0,Trump Is So Obsessed He Even Has Obama’s Name...,english,,,,,,
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,0.0,Pope Francis Just Called Out Donald Trump Dur...,english,,,,,,


In [69]:
# Preprocess
news["content"] = news["content"].astype(str).apply(multilingual_preprocess)

In [70]:
news = news.dropna(subset=["label"])

In [71]:

news["label"] = news["label"].astype(int)

In [72]:
# --- Training ---
X = news["content"].values
Y = news["label"].values

In [73]:
print("Labels distribution:\n", pd.Series(Y).value_counts())

Labels distribution:
 0    23487
1    21422
Name: count, dtype: int64


In [74]:
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(news["content"].values)


In [75]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X_tfidf, Y, test_size=0.2, stratify=Y, random_state=42
)



In [76]:
# Train individual models first
log_model = LogisticRegression()
log_model.fit(X_train, Y_train) 

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [77]:

nb_model = MultinomialNB()
nb_model.fit(X_train, Y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [78]:



rf_model=RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, Y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [79]:
# Voting ensemble
voting_model = VotingClassifier(
    estimators=[
        ("lr", log_model),
        ("nb", nb_model),
        ("rf", rf_model),
        
    ],
    voting="soft"
)


In [80]:
# Train the model
voting_model.fit(X_train, Y_train)

0,1,2
,estimators,"[('lr', ...), ('nb', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [81]:
# Accuracy
print("Train Accuracy:", accuracy_score(Y_train, voting_model.predict(X_train)))
print("Test Accuracy:", accuracy_score(Y_test, voting_model.predict(X_test)))

Train Accuracy: 0.9941826481476327
Test Accuracy: 0.9870852816744601


In [82]:
X_new = X_test[0]           # single sample
prediction = voting_model.predict(X_new.reshape(1, -1))  # ✅ reshape to 2D
print(prediction)

if(prediction[0]==0):
    print("the news is fake")
else:
    print('its real')

[0]
the news is fake


In [83]:
print(Y_test[0])

0


In [85]:
import pickle

with open('voting_model.pkl', 'wb') as f:
    pickle.dump(voting_model, f)

with open('log_model.pkl', 'wb') as f:
    pickle.dump(log_model, f)

with open('nb_model.pkl', 'wb') as f:
    pickle.dump(nb_model, f)

with open('rf_model.pkl','wb') as f:
    pickle.dump(rf_model,f)
with open('vectorizer.pkl','wb') as f:
    pickle.dump(vectorizer,f)
