# Dataset Description
## Link: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset?select=True.csv
### The dataset is split into two files:
### 1. Fake.csv (23502 fake news articles)
### 2. True.csv (21417 true news articles)
### Dataset columns:
### 1. Title: title of news article
### 2. Text: body text of news article
### 3. Subject: subject of news article
### 4. Date: publish date of news article

In [1]:
import pandas as pd

In [2]:
# Load the two halves of the dataset. Both CSV files are read from the
# notebook's current working directory (relative paths).
df_fake = pd.read_csv('Fake.csv')
df_true = pd.read_csv('True.csv')

In [3]:
# Caption first, then the bare expression on the last line so the notebook
# renders the frame with its rich (truncated) table display.
print("Fake News Data:")
df_fake

Fake News Data:


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [4]:
# Same preview as above for the genuine articles; the leading "\n" in the
# caption just adds a blank line before it in the cell output.
print("\nTrue News Data:")
df_true


True News Data:


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


# Combine and Label the Data

In [5]:
# Tag every row with the class of the file it came from, so the two frames
# can be merged without losing track of which articles are fake.
df_fake['label'], df_true['label'] = 'FAKE', 'TRUE'

In [6]:
# Stack both frames, shuffle all rows (fixed seed for reproducibility) and
# renumber the index so it no longer reflects the original file order.
df_combined = (
    pd.concat([df_fake, df_true], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [7]:
# Preview the shuffled, labelled frame (the notebook truncates the display).
df_combined

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",FAKE
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",TRUE
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",TRUE
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",FAKE
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",TRUE
...,...,...,...,...,...
44893,UNREAL! CBS’S TED KOPPEL Tells Sean Hannity He...,,politics,"Mar 27, 2017",FAKE
44894,PM May seeks to ease Japan's Brexit fears duri...,LONDON/TOKYO (Reuters) - British Prime Ministe...,worldnews,"August 29, 2017",TRUE
44895,Merkel: Difficult German coalition talks can r...,BERLIN (Reuters) - Chancellor Angela Merkel sa...,worldnews,"November 16, 2017",TRUE
44896,Trump Stole An Idea From North Korean Propaga...,Jesus f*cking Christ our President* is a moron...,News,"July 14, 2017",FAKE


# Clean and preprocess the text data (title and body text).

In [8]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

In [9]:
# Merge headline and body into a single field for feature extraction.
# Missing values are replaced with empty strings first: in pandas,
# str + NaN yields NaN, so one missing field would otherwise discard the
# other field's text for that row as well.
df_combined['text_combined'] = (
    df_combined['title'].fillna('') + ' ' + df_combined['text'].fillna('')
)

## Lowercase conversion, removal of numbers and punctuation, stopword removal, stemming

In [10]:
_STOP_WORDS = None  # English stop-word set, built on first use and then reused
_STEMMER = None     # shared PorterStemmer instance


def _text_tools():
    """Return the (stop-word set, stemmer) pair, creating both on first call."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        try:
            _STOP_WORDS = frozenset(stopwords.words('english'))
        except LookupError:
            # Stop-word corpus is not installed (e.g. fresh environment):
            # fetch it once so the notebook survives Restart & Run All.
            nltk.download('stopwords', quiet=True)
            _STOP_WORDS = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not a-z or whitespace (this removes
         digits and punctuation, including apostrophes);
      3. split on whitespace and drop English stop words;
      4. Porter-stem the remaining words.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or a NaN produced by a
        missing CSV cell) is treated as empty.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    # Built once and shared: re-creating the set and the stemmer for each of
    # the tens of thousands of rows is pure overhead.
    stop_words, stemmer = _text_tools()

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return ' '.join(
        stemmer.stem(word)
        for word in letters_only.split()
        if word not in stop_words
    )

In [11]:
# Run the cleaning function element-wise over the combined title+body text.
df_combined['text_cleaned'] = df_combined['text_combined'].map(preprocess_text)

In [12]:
# Side-by-side view of the raw and the cleaned text as a sanity check.
df_combined[['text_combined', 'text_cleaned']]

Unnamed: 0,text_combined,text_cleaned
0,Ben Stein Calls Out 9th Circuit Court: Committ...,ben stein call th circuit court commit coup dt...
1,Trump drops Steve Bannon from National Securit...,trump drop steve bannon nation secur council w...
2,Puerto Rico expects U.S. to lift Jones Act shi...,puerto rico expect us lift jone act ship restr...
3,OOPS: Trump Just Accidentally Confirmed He Le...,oop trump accident confirm leak isra intellig ...
4,Donald Trump heads for Scotland to reopen a go...,donald trump head scotland reopen golf resort ...
...,...,...
44893,UNREAL! CBS’S TED KOPPEL Tells Sean Hannity He...,unreal cbss ted koppel tell sean hanniti he ba...
44894,PM May seeks to ease Japan's Brexit fears duri...,pm may seek eas japan brexit fear trade visit ...
44895,Merkel: Difficult German coalition talks can r...,merkel difficult german coalit talk reach deal...
44896,Trump Stole An Idea From North Korean Propaga...,trump stole idea north korean propaganda parod...


# Feature Extraction
## Using TF-IDF to convert the cleaned text into numerical features.

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# TF-IDF vectoriser with the vocabulary capped at 5000 terms via max_features.
# This same instance is reused later to transform new input at prediction time.
vectorizer = TfidfVectorizer(max_features=5000)  # tune max_features to trade vocabulary size against memory

In [15]:
# Learn the vocabulary/IDF weights and build the document-term matrix.
# The result is kept in the sparse form returned by fit_transform: a dense
# copy would hold n_documents x (up to) 5000 floats, almost all of them zero,
# and both train_test_split and LogisticRegression accept sparse input.
# NOTE(review): the vectoriser is fitted on the full corpus, before the
# train/test split below, so test documents contribute to the IDF weights.
X = vectorizer.fit_transform(df_combined['text_cleaned'])

In [16]:
# Binary target vector: 1 where the label is 'FAKE', 0 otherwise (i.e. 'TRUE').
y = df_combined['label'].eq('FAKE').astype('int64').values

# Logistic Regression Model Building and Training

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [18]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# max_iter is raised to 1000, presumably to give the solver room to converge
# on the 5000-feature TF-IDF input — TODO confirm whether this is needed.
model = LogisticRegression(max_iter=1000)

In [19]:
# Train the classifier on the training split only.
model.fit(X_train, y_train)

In [20]:
# Predicted class (1 = FAKE, 0 = TRUE) for every held-out article.
y_pred = model.predict(X_test)

# Model Evaluation

In [21]:
# Score the held-out predictions with each metric in turn. For precision,
# recall and F1 the positive class is label 1, i.e. FAKE.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [22]:
# Report each metric rounded to four decimal places.
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.9865
Precision: 0.9906
Recall: 0.9837
F1 Score: 0.9871


# Model Testing

In [23]:
def predict_news(news):
    """Classify one piece of news text with the trained model.

    The text goes through the same cleaning function and the same fitted
    TF-IDF vectoriser that were used for the training data.

    Parameters
    ----------
    news : str
        Raw headline and/or article text.

    Returns
    -------
    str
        'FAKE' when the model predicts class 1, otherwise 'REAL'.
    """
    # The vectoriser expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([preprocess_text(news)]).toarray()

    predicted_class = model.predict(features)[0]
    if predicted_class == 1:
        return 'FAKE'
    return 'REAL'

In [25]:
# Smoke test with an obviously fabricated headline.
user_input = "Aliens have landed on Earth and are taking over the government."
print(f"User Input: {user_input}")
print(f"Prediction: {predict_news(user_input)}")

User Input: Aliens have landed on Earth and are taking over the government.
Prediction: FAKE


In [32]:
# Smoke test with a plain, wire-style headline.
user_input = "Democrat Franken to leave Senate on January 2."
print(f"User Input: {user_input}")
print(f"Prediction: {predict_news(user_input)}")

User Input: Democrat Franken to leave Senate on January 2.
Prediction: REAL
