In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files

# Opens the Colab file picker so kaggle.json can be uploaded to the working directory.
# files.upload() returns the uploaded file contents; the trailing semicolon stops the
# notebook from echoing that return value (i.e. the API key) into the saved cell output.
files.upload();

In [None]:
# Move the uploaded kaggle.json into ~/.kaggle (presumably where the kaggle CLI reads
# its credentials from - confirm against the Kaggle docs), restrict the file to
# owner read/write, then download the dataset archive.
!mkdir -p ~/.kaggle # create the directory
!mv kaggle.json ~/.kaggle/ # move the kaggle.json file to the directory created
!chmod 600 ~/.kaggle/kaggle.json # set the permissions
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Dataset URL: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
License(s): CC-BY-NC-SA-4.0
fake-and-real-news-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!unzip fake-and-real-news-dataset.zip

Archive:  fake-and-real-news-dataset.zip
replace Fake.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Fake.csv                
replace True.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: True.csv                


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [None]:
# Load the two extracted CSV files: real articles first, fake articles second.
true_df, fake_df = (pd.read_csv(csv_name) for csv_name in ('True.csv', 'Fake.csv'))

In [None]:
# Preview the first five real-news rows.
true_df.head(n=5)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [None]:
# Show only the first real-news row.
true_df.head(n=1)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"


In [None]:
# Preview the first five fake-news rows.
fake_df.head(n=5)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
# Tag each source frame with its class before the two are combined:
# label 1 marks real news, label 0 marks fake news.
true_df = true_df.assign(label=1)
fake_df = fake_df.assign(label=0)

In [None]:
# Confirm the new label column on the real-news frame.
true_df.head(n=5)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [None]:
# Confirm the new label column on the fake-news frame.
fake_df.head(n=5)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [None]:
# Stack the real and fake articles into a single frame.
# ignore_index=True gives every row a unique 0..n-1 index; without it each half keeps
# its own 0-based index, so a label lookup such as news_df.loc[0] would match two rows.
news_df = pd.concat([true_df, fake_df], axis=0, ignore_index=True)
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [None]:
# Inspect the last five rows of the combined frame.
news_df.tail(n=5)

Unnamed: 0,title,text,subject,date,label
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0
23480,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",0


### example text: The cats are running quickly.
### tokenization: ['The', 'cats', 'are', 'running', 'quickly', '.']
### lemmatization: ['The', 'cat', 'are', 'run', 'quickly', '.']
### stemming: ['The', 'cat', 'are', 'run', 'quickli', '.'] — stems are not always real words; for example, 'flying' can become 'fli'

In [None]:
# Fetch the NLTK resources used by the text cleaning step below.
for _resource in ('stopwords', 'wordnet'):
  nltk.download(_resource)

# Shared helpers for preprocess_text: a lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Built once instead of on every call: maps each character in string.punctuation
# to None, so str.translate deletes it.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

def preprocess_text(text):
  """Normalize raw article text before TF-IDF vectorization.

  The text is lowercased, stripped of the characters in ``string.punctuation``,
  split on whitespace, filtered against ``stop_words`` and each remaining token
  is passed through ``lemmatizer.lemmatize``.

  Args:
    text: Raw article text. Any non-string value (for example the NaN pandas
      produces for an empty CSV cell) is treated as empty text instead of
      raising ``AttributeError``.

  Returns:
    The surviving tokens joined by single spaces; ``''`` if nothing survives.
  """
  if not isinstance(text, str):
    return ''

  lowered = text.lower()
  without_punctuation = lowered.translate(_PUNCTUATION_TABLE)

  # Stop words are filtered before lemmatizing, exactly as the tokens appear in the text.
  kept_tokens = [
      lemmatizer.lemmatize(token)
      for token in without_punctuation.split()
      if token not in stop_words
  ]
  return ' '.join(kept_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Clean every article body once and store the result next to the raw text.
news_df['processed_text'] = news_df['text'].apply(preprocess_text)

# Sanity-check the cleaning on a handful of rows, rendered as a table rather than
# printing the whole frame as plain text.
news_df[['text', 'processed_text']].head()

                                                    text  \
0      WASHINGTON (Reuters) - The head of a conservat...   
1      WASHINGTON (Reuters) - Transgender people will...   
2      WASHINGTON (Reuters) - The special counsel inv...   
3      WASHINGTON (Reuters) - Trump campaign adviser ...   
4      SEATTLE/WASHINGTON (Reuters) - President Donal...   
...                                                  ...   
23476  21st Century Wire says As 21WIRE reported earl...   
23477  21st Century Wire says It s a familiar theme. ...   
23478  Patrick Henningsen  21st Century WireRemember ...   
23479  21st Century Wire says Al Jazeera America will...   
23480  21st Century Wire says As 21WIRE predicted in ...   

                                          processed_text  
0      washington reuters head conservative republica...  
1      washington reuters transgender people allowed ...  
2      washington reuters special counsel investigati...  
3      washington reuters trump campaign ad

# Term Frequency - Inverse Document Frequency (TF-IDF)
### A statistical measure of how important a word is to a document within a corpus

### **Term Frequency (TF):** measures frequency of a word within a document, **(number of times the word appears in the document)/(total number of words in the document)**

### **Inverse Document Frequency (IDF):** measures how rare the word is across all the documents,
### **log((total number of documents)/(number of documents where the word appears))**

## TF-IDF(w, d) = TF(w, d) * IDF(w)

In [None]:
# Features are the cleaned article text; the target is the real (1) / fake (0) label.
X, y = news_df['processed_text'], news_df['label']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Alternative kept for reference: limit the vocabulary via max_features.
# tfidfVectorizer = TfidfVectorizer(max_features=5000)
tfidfVectorizer = TfidfVectorizer()
# Fit the vectorizer on the training split only and encode that split in the same pass.
X_train_tfidf = tfidfVectorizer.fit_transform(X_train)
# Encode the test split with the already-fitted vectorizer (no re-fitting on test data).
X_test_tfidf = tfidfVectorizer.transform(X_test)

In [None]:
# Shape of the training TF-IDF matrix (rows, columns).
X_train_tfidf.shape

(35918, 195785)

In [None]:
# Shape of the test TF-IDF matrix (rows, columns); the column count matches the training matrix.
X_test_tfidf.shape

(8980, 195785)

In [None]:
# Print the sparse training matrix; each output line is "(row, column)" followed by the stored weight.
print(X_train_tfidf)

  (0, 167194)	0.1480162463219163
  (0, 3458)	0.048655933269779794
  (0, 67737)	0.04165418378258928
  (0, 104309)	0.05595159551219822
  (0, 158066)	0.038584478572756827
  (0, 81239)	0.17693741531200252
  (0, 102676)	0.03679320486751674
  (0, 64162)	0.05239281543430001
  (0, 118597)	0.026046569624462067
  (0, 46788)	0.04047827343421384
  (0, 113750)	0.022957508096608143
  (0, 50934)	0.03223679485879775
  (0, 123897)	0.03260635725508622
  (0, 134643)	0.040822271700207496
  (0, 157685)	0.02858846170392376
  (0, 151256)	0.1292909557520365
  (0, 40963)	0.20336979865891808
  (0, 89538)	0.06011320835934164
  (0, 29395)	0.06760667190652073
  (0, 116404)	0.11457833992799349
  (0, 161230)	0.40353679442918217
  (0, 58842)	0.5631022153832149
  (0, 44446)	0.08741284466430037
  (0, 114689)	0.11206465585765556
  (0, 62881)	0.26223853399290115
  :	:
  (35917, 31113)	0.056783193811602406
  (35917, 169143)	0.045178993811715994
  (35917, 177933)	0.08568346985464154
  (35917, 165007)	0.04839346833692827
  

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit a logistic-regression classifier (default settings) on the training TF-IDF
# features, then predict labels for the held-out test features.
logistic_model = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = logistic_model.predict(X_test_tfidf)

# Score the test predictions with each metric, in the order they are reported below.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9897550111358575
Precision: 0.9853412734768667
Recall: 0.9935334872979215
F1 Score: 0.9894204231830727


In [None]:
import joblib

# Persist the trained classifier and the fitted vectorizer so they can be reloaded together.
joblib.dump(value=logistic_model, filename='logistic_model.pkl')
joblib.dump(value=tfidfVectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Reload the persisted classifier and vectorizer.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code - only load files you trust.
model = joblib.load('logistic_model.pkl')  # Replace with your saved model filename
vectorizer = joblib.load('tfidf_vectorizer.pkl')  # Replace with your saved vectorizer filename

user_input = input("Enter the news article: ")

# The vectorizer was fitted on text cleaned by preprocess_text, so the user's text must go
# through the same cleaning; vectorizing the raw input would not match the training vocabulary.
cleaned_input = preprocess_text(user_input)

if cleaned_input:
    processed_input = vectorizer.transform([cleaned_input])

    prediction = model.predict(processed_input)

    # Label convention from training: 1 = real news, 0 = fake news.
    if prediction[0] == 1:
        print("This news is likely REAL.")
    else:
        print("This news is likely FAKE.")
else:
    # Nothing usable was entered (blank input, or nothing left after cleaning).
    print("Please enter a valid news article!")

Enter the news article: Dinosaurs found on Jupiter
This news is likely FAKE.


In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Reload the persisted classifier and vectorizer.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code - only load files you trust.
model = joblib.load('logistic_model.pkl')  # Replace with your saved model filename
vectorizer = joblib.load('tfidf_vectorizer.pkl')  # Replace with your saved vectorizer filename

user_input = input("Enter the news article: ")

# The vectorizer was fitted on text cleaned by preprocess_text, so the user's text must go
# through the same cleaning; vectorizing the raw input would not match the training vocabulary.
cleaned_input = preprocess_text(user_input)

if cleaned_input:
    processed_input = vectorizer.transform([cleaned_input])

    prediction = model.predict(processed_input)

    # Label convention from training: 1 = real news, 0 = fake news.
    if prediction[0] == 1:
        print("This news is likely REAL.")
    else:
        print("This news is likely FAKE.")
else:
    # Nothing usable was entered (blank input, or nothing left after cleaning).
    print("Please enter a valid news article!")

Enter the news article: WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on progr

In [None]:
# Remove the column-width display limit so long text cells are shown in full.
pd.options.display.max_colwidth = None

In [None]:
# Show the first real-news row again, now with the untruncated article text.
true_df.head(n=1)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip their fiscal script","WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support education, scientific research, infrastructure, public health and environmental protection. “The (Trump) administration has already been willing to say: ‘We’re going to increase non-defense discretionary spending ... by about 7 percent,’” Meadows, chairman of the small but influential House Freedom Caucus, said on the program. “Now, Democrats are saying that’s not enough, we need to give the government a pay raise of 10 to 11 percent. For a fiscal conservative, I don’t see where the rationale is. ... Eventually you run out of other people’s money,” he said. Meadows was among Republicans who voted in late December for their party’s debt-financed tax overhaul, which is expected to balloon the federal budget deficit and add about $1.5 trillion over 10 years to the $20 trillion national debt. “It’s interesting to hear Mark talk about fiscal responsibility,” Democratic U.S. Representative Joseph Crowley said on CBS. 
Crowley said the Republican tax bill would require the United States to borrow $1.5 trillion, to be paid off by future generations, to finance tax cuts for corporations and the rich. “This is one of the least ... fiscally responsible bills we’ve ever seen passed in the history of the House of Representatives. I think we’re going to be paying for this for many, many years to come,” Crowley said. Republicans insist the tax package, the biggest U.S. tax overhaul in more than 30 years, will boost the economy and job growth. House Speaker Paul Ryan, who also supported the tax bill, recently went further than Meadows, making clear in a radio interview that welfare or “entitlement reform,” as the party often calls it, would be a top Republican priority in 2018. In Republican parlance, “entitlement” programs mean food stamps, housing assistance, Medicare and Medicaid health insurance for the elderly, poor and disabled, as well as other programs created by Washington to assist the needy. Democrats seized on Ryan’s early December remarks, saying they showed Republicans would try to pay for their tax overhaul by seeking spending cuts for social programs. But the goals of House Republicans may have to take a back seat to the Senate, where the votes of some Democrats will be needed to approve a budget and prevent a government shutdown. Democrats will use their leverage in the Senate, which Republicans narrowly control, to defend both discretionary non-defense programs and social spending, while tackling the issue of the “Dreamers,” people brought illegally to the country as children. Trump in September put a March 2018 expiration date on the Deferred Action for Childhood Arrivals, or DACA, program, which protects the young immigrants from deportation and provides them with work permits. The president has said in recent Twitter messages he wants funding for his proposed Mexican border wall and other immigration law changes in exchange for agreeing to help the Dreamers. 
Representative Debbie Dingell told CBS she did not favor linking that issue to other policy objectives, such as wall funding. “We need to do DACA clean,” she said. On Wednesday, Trump aides will meet with congressional leaders to discuss those issues. That will be followed by a weekend of strategy sessions for Trump and Republican leaders on Jan. 6 and 7, the White House said. Trump was also scheduled to meet on Sunday with Florida Republican Governor Rick Scott, who wants more emergency aid. The House has passed an $81 billion aid package after hurricanes in Florida, Texas and Puerto Rico, and wildfires in California. The package far exceeded the $44 billion requested by the Trump administration. The Senate has not yet voted on the aid.",politicsNews,"December 31, 2017",1


In [None]:
import json

# Export the fitted vectorizer's state as plain JSON.
vectorizer_params = {
    # Cast every index to a built-in int: scikit-learn can leave NumPy integers in
    # vocabulary_ (e.g. when max_features is set), and json.dump rejects those.
    'vocabulary': {term: int(index) for term, index in vectorizer.vocabulary_.items()},
    'idf_': vectorizer.idf_.tolist(),
    # stop_words may be None, a string or a collection; a set is not JSON-serializable,
    # so it is written as a sorted list. Other values are passed through unchanged.
    'stop_words': (
        sorted(vectorizer.stop_words)
        if isinstance(vectorizer.stop_words, (set, frozenset))
        else vectorizer.stop_words
    ),
    'max_features': vectorizer.max_features,
}

with open('tfidf_vectorizer.json', 'w') as f:
    json.dump(vectorizer_params, f)