In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [71]:
# read the file fakeandtrue_news_dataset


# read the file fakeandtrue_news_dataset, using the Python engine and handling errors
news = pd.read_csv('/content/sample_data/fakeandTrue_news_dataset.csv')
# or on_bad_lines='skip' to skip problematic lines

print(news.head())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text  label  
0  Donald Trump just couldn t wish all Americans ...      0  
1  House Intelligence Committee Chairman Devin Nu...      0  
2  On Friday, it was revealed that former Milwauk...      0  
3  On Christmas day, Donald Trump announced that ...      0  
4  Pope Francis used his annual Christmas Day mes...      0  


In [72]:
# extract real and fake news from news
news = news.drop_duplicates()
real_news = news[news['label'] == 1]
fake_news = news[news['label'] == 0]




# concatinate both
news = pd.concat([real_news, fake_news])
print(news.head())


# print count of label
print(news['label'].value_counts())

                                                   title  \
24066  Aditi Rao Hydari and Siddharth get married in ...   
24067  Government deallocates coal block in Jharkhand...   
24068  View: Dealing with bureaucracy will be a chall...   
24069    Buy RBL Bank, price target Rs 360: Kunal Bothra   
24070  ATF cost up 2.6%; non-subsidised LPG price hik...   

                                                    text  label  
24066  A Look At Their Love Story\n\n\n\nAaand they a...      1  
24067  New Delhi: The government has cancelled the al...      1  
24068  General Bipin Rawat took over as the countrys...      1  
24069  Independent Analyst Kunal Bothra has a buy cal...      1  
24070  Jet fuel or ATF price was on Wednesday hiked b...      1  
label
1    54344
0    53284
Name: count, dtype: int64


In [73]:
# preprocess title and text from news for vectorization and create it as a function
import re
import string
def clean_text(text):
    return re.sub(f"[{re.escape(string.punctuation)}]", '', text.lower())

# convert data text into string
news['text'] = news['text'].astype(str)
# apply the cleaning function to 'title', 'text', and 'date' columns

print(news.head())


                                                   title  \
24066  Aditi Rao Hydari and Siddharth get married in ...   
24067  Government deallocates coal block in Jharkhand...   
24068  View: Dealing with bureaucracy will be a chall...   
24069    Buy RBL Bank, price target Rs 360: Kunal Bothra   
24070  ATF cost up 2.6%; non-subsidised LPG price hik...   

                                                    text  label  
24066  A Look At Their Love Story\n\n\n\nAaand they a...      1  
24067  New Delhi: The government has cancelled the al...      1  
24068  General Bipin Rawat took over as the countrys...      1  
24069  Independent Analyst Kunal Bothra has a buy cal...      1  
24070  Jet fuel or ATF price was on Wednesday hiked b...      1  


In [74]:
# remove duplicates out of the news
news = news.drop_duplicates()
#print the amount of lable
print(news['label'].value_counts())

label
1    54344
0    53284
Name: count, dtype: int64


In [6]:
!pip install textstat



In [55]:
# get n gram frequency , sentimant analysis and readability score for above news text with NLTK
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textstat import flesch_reading_ease
# download the VADER lexicon

nltk.download('vader_lexicon')
# create a sentiment analyzer
sid = SentimentIntensityAnalyzer()
#analise news text
news['sentiment_score'] = news['text'].apply(lambda x: sid.polarity_scores(x)['compound'])
news['readability_score'] = news['text'].apply(lambda x: flesch_reading_ease(x))



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [35]:
print(news.head())

                                                   title  \
24066  Aditi Rao Hydari and Siddharth get married in ...   
24067  Government deallocates coal block in Jharkhand...   
24068  View: Dealing with bureaucracy will be a chall...   
24069    Buy RBL Bank, price target Rs 360: Kunal Bothra   
24070  ATF cost up 2.6%; non-subsidised LPG price hik...   

                                                    text  label  \
24066  A Look At Their Love Story\n\n\n\nAaand they a...      1   
24067  New Delhi: The government has cancelled the al...      1   
24068  General Bipin Rawat took over as the countrys...      1   
24069  Independent Analyst Kunal Bothra has a buy cal...      1   
24070  Jet fuel or ATF price was on Wednesday hiked b...      1   

       sentiment_score  readability_score  
24066           0.9952              60.85  
24067          -0.7579              42.55  
24068           0.9955              43.73  
24069           0.9753              79.87  
24070           

In [None]:
# n gram anallysis for text in news
from nltk.util import ngrams
from collections import Counter
# perso anallysis on text in news
def get_ngrams(text, n):
    tokens = text.split()
    return list(ngrams(tokens, n))
# get bigrams
news['bigrams'] = news['text'].apply(lambda x: get_ngrams(x, 2))
# get trigrams
news['trigrams'] = news['text'].apply(lambda x: get_ngrams(x, 3))

In [75]:
# remove all news with empty title or text
news = news.dropna(subset=['title', 'text'])
# print count of 1 and 0
print(news['label'].value_counts())
# sort news data by label


label
1    53825
0    53284
Name: count, dtype: int64


In [57]:
# layble from news
lable = news['label']


In [58]:
# prompt: # get features for text from news

from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer()
tfidf_text = vectorizer.fit_transform(news['text'])


In [59]:
# get data of sentiment and reliabity score with tfidf_text for testing
sentiment_score = news['sentiment_score']
readability_score = news['readability_score']


In [60]:
# install hstack
from scipy.sparse import hstack

In [61]:
# have tfidf_text , sentimental_score and reliablity_score in x and y for label
x = hstack((tfidf_text, news[['sentiment_score', 'readability_score']].values))
y = lable
print(x.shape)



(39105, 122004)


In [62]:
# split into test and train test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [63]:
# try performing logical regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9817158931082982


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [66]:
# test the modle for fake news
from sklearn.feature_extraction.text import TfidfVectorizer
new_text = """

In his first statement on the worsening India-Canada ties, Prime Minister Narendra Modi on Monday "strongly condemned" the "deliberate attack" on a Hindu temple in Brampton, and slammed the Justin Trudeau administration's "cowardly attempts" to "intimidate" Indian diplomats.
Standing up for Indians across the world, PM Modi's strong statement comes as India-Canada relations have hit a low point since the former withdrew its diplomats following allegations of violence against Sikh separatist Khalistani leaders on Canadian soil.
"I strongly condemn the deliberate attack on a Hindu temple in Canada. Equally appalling are the cowardly attempts to intimidate our diplomats. Such acts of violence will never weaken India’s resolve. We expect the Canadian government to ensure justice and uphold the rule of law," PM Modi posted on X.” Earlier, the MEA also condemned the attack on the Hindu Sabha Temple in Brampton, Ontario, reportedly carried out by alleged Sikh extremists, calling the incident "deeply disturbing."
"We condemn the acts of violence perpetrated by extremists and separatists at the Hindu Sabha Temple in Brampton, Ontario yesterday. We call on the Government of Canada to ensure that all places of worship are protected from such attacks. We also expect that those indulging in violence will be prosecuted. We remain deeply concerned about the safety and security of Indian nationals in Canada. The outreach of our Consular officers to provide services to Indians and Canadian citizens alike will not be deterred by intimidation, harassment and violence," MEA spokesperson Randhir Jaiswal said.
According to the community organisation Hindu Canadian Foundation, women and children were also attacked in the violent mob.
Meanwhile, last month, India had protested Canada's decision to investigate the Indian High Commissioner and other diplomats as "persons of interest" likely in connection with the assassination of Hardeep Singh Nijjar, a Khalistani terrorist.
It had firmly rejected Trudeau's allegations against its diplomats, labeling them as "baseless" and politically motivated. The MEA had accused Trudeau of exploiting anti-India rhetoric for domestic political gain, highlighting his previous associations with separatist elements."""
# extract the features out of  new_text and remove puncation and make it lower case


# vectorize the new_text
new_features = vectorizer.transform([new_text])
new_sentiment_score = sid.polarity_scores(new_text)['compound']
new_readability_score = flesch_reading_ease(new_text)


In [67]:
# predict
prediction = lr.predict(hstack((new_features, [[new_sentiment_score, new_readability_score]])))
# gues if fake or real
if prediction[0] == 0:
    print("This is a fake news")
else:
    print("This is a real news")


This is a real news
