In [1]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import warnings
import sklearn
%matplotlib inline

warnings.filterwarnings('ignore')

In [2]:
true_data = pd.read_csv('True.csv')
fake_data = pd.read_csv('Fake.csv')

true_data['Target'] = ['True'] * len(true_data)
fake_data['Target'] = ['Fake'] * len(fake_data)

# Concatenate the data frames using pd.concat
data = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)
print(data.shape)
data.head()


(44898, 5)


Unnamed: 0,title,text,subject,date,Target
0,"Rights groups urge EU, Japan to consider halt ...",BANGKOK (Reuters) - Rights groups on Wednesday...,worldnews,"October 18, 2017",True
1,WATCH: IRRELEVANT DEM POLITICAL ANALYST James ...,"On Friday s broadcast of HBO s Real Time, fo...",left-news,"Oct 21, 2017",Fake
2,"Trump Asks O’Reilly, ‘Do you think our country...",21st Century Wire says Regardless of what one ...,US_News,"February 6, 2017",Fake
3,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 5, 2016",True
4,ONE LAST TIME ON OUR DIME: Mooch and Barack Ar...,"The hard working First Family, in need of an...",politics,"Aug 6, 2016",Fake


In [3]:
data['title'][0]


'Rights groups urge EU, Japan to consider halt in funding for  Cambodian election'

In [4]:
data['text'][0]

'BANGKOK (Reuters) - Rights groups on Wednesday urged the European Union and Japan to consider halting their funding for the election panel in Cambodia, if the ruling party succeeds in a bid to dissolve the main opposition party ahead of next year s general election. The ruling Cambodia People s Party (CPP) has launched a crackdown on its critics, including politicians, independent media and non-government bodies. Nearly half the opposition members of parliament have fled abroad since September. In a session boycotted by the opposition, Cambodia s parliament voted on Monday to change party laws to re-distribute seats if a party is dissolved. The measure came after the government filed a lawsuit this month seeking to dissolve the main opposition Cambodia National Rescue Party (CNRP).  If the government s position to dissolve the opposition Cambodia National Rescue Party succeeds, next year s election will be a joke,  Phil Robertson, deputy director for Asia at New York-based group Human

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   Target   44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


In [6]:
#preprocessing
#drop null values
data=data.dropna(axis=0)

In [7]:
len(data)

44898

In [8]:
#converting all strings to lowercase
data['clean_news']=data['text'].str.lower()
data['clean_news']

0        bangkok (reuters) - rights groups on wednesday...
1        on friday s broadcast of hbo s  real time,  fo...
2        21st century wire says regardless of what one ...
3        (reuters) - u.s. president-elect donald trump ...
4        the  hard working  first family, in need of an...
                               ...                        
44893    21st century wire says does the american ideal...
44894    barinas, venezuela (reuters) - tirelessly trav...
44895    phnom penh (reuters) - cambodian prime ministe...
44896    geneva (reuters) - the united states wants to ...
44897    beijing (reuters) - u.s. president donald trum...
Name: clean_news, Length: 44898, dtype: object

In [9]:
#removing special characters , extra spaces and escape characters
data['clean_news']=data['clean_news'].str.replace('[^A-Za-z0-9\s]','')
data['clean_news']=data['clean_news'].str.replace('[\n]','')
data['clean_news']=data['clean_news'].str.replace('[\s+]',' ')
data['clean_news']

0        bangkok (reuters) - rights groups on wednesday...
1        on friday s broadcast of hbo s  real time,  fo...
2        21st century wire says regardless of what one ...
3        (reuters) - u.s. president-elect donald trump ...
4        the  hard working  first family, in need of an...
                               ...                        
44893    21st century wire says does the american ideal...
44894    barinas, venezuela (reuters) - tirelessly trav...
44895    phnom penh (reuters) - cambodian prime ministe...
44896    geneva (reuters) - the united states wants to ...
44897    beijing (reuters) - u.s. president donald trum...
Name: clean_news, Length: 44898, dtype: object

In [10]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
#remove stop words
from nltk.corpus import stopwords
stop=stopwords.words('english')
data['clean_news']=data['clean_news'].apply(lambda x: " ".join([word for word in x.split() if word not in stop]))
data.head()

Unnamed: 0,title,text,subject,date,Target,clean_news
0,"Rights groups urge EU, Japan to consider halt ...",BANGKOK (Reuters) - Rights groups on Wednesday...,worldnews,"October 18, 2017",True,bangkok (reuters) - rights groups wednesday ur...
1,WATCH: IRRELEVANT DEM POLITICAL ANALYST James ...,"On Friday s broadcast of HBO s Real Time, fo...",left-news,"Oct 21, 2017",Fake,"friday broadcast hbo real time, former clinton..."
2,"Trump Asks O’Reilly, ‘Do you think our country...",21st Century Wire says Regardless of what one ...,US_News,"February 6, 2017",Fake,21st century wire says regardless one thinks d...
3,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 5, 2016",True,(reuters) - u.s. president-elect donald trump ...
4,ONE LAST TIME ON OUR DIME: Mooch and Barack Ar...,"The hard working First Family, in need of an...",politics,"Aug 6, 2016",Fake,"hard working first family, need another taxpay..."


In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
#Tokenization
from nltk.tokenize import word_tokenize
data['tokenized_news'] = data['clean_news'].apply(lambda x: word_tokenize(x))
data.head()

Unnamed: 0,title,text,subject,date,Target,clean_news,tokenized_news
0,"Rights groups urge EU, Japan to consider halt ...",BANGKOK (Reuters) - Rights groups on Wednesday...,worldnews,"October 18, 2017",True,bangkok (reuters) - rights groups wednesday ur...,"[bangkok, (, reuters, ), -, rights, groups, we..."
1,WATCH: IRRELEVANT DEM POLITICAL ANALYST James ...,"On Friday s broadcast of HBO s Real Time, fo...",left-news,"Oct 21, 2017",Fake,"friday broadcast hbo real time, former clinton...","[friday, broadcast, hbo, real, time, ,, former..."
2,"Trump Asks O’Reilly, ‘Do you think our country...",21st Century Wire says Regardless of what one ...,US_News,"February 6, 2017",Fake,21st century wire says regardless one thinks d...,"[21st, century, wire, says, regardless, one, t..."
3,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 5, 2016",True,(reuters) - u.s. president-elect donald trump ...,"[(, reuters, ), -, u.s., president-elect, dona..."
4,ONE LAST TIME ON OUR DIME: Mooch and Barack Ar...,"The hard working First Family, in need of an...",politics,"Aug 6, 2016",Fake,"hard working first family, need another taxpay...","[hard, working, first, family, ,, need, anothe..."


In [15]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rsriv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_text(tokens, lemmatizer):
    return [lemmatizer.lemmatize(token) for token in tokens]
data['lemmatized_news'] = data['tokenized_news'].apply(lambda x: lemmatize_text(x, lemmatizer))
data.head()

Unnamed: 0,title,text,subject,date,Target,clean_news,tokenized_news,lemmatized_news
0,"Rights groups urge EU, Japan to consider halt ...",BANGKOK (Reuters) - Rights groups on Wednesday...,worldnews,"October 18, 2017",True,bangkok (reuters) - rights groups wednesday ur...,"[bangkok, (, reuters, ), -, rights, groups, we...","[bangkok, (, reuters, ), -, right, group, wedn..."
1,WATCH: IRRELEVANT DEM POLITICAL ANALYST James ...,"On Friday s broadcast of HBO s Real Time, fo...",left-news,"Oct 21, 2017",Fake,"friday broadcast hbo real time, former clinton...","[friday, broadcast, hbo, real, time, ,, former...","[friday, broadcast, hbo, real, time, ,, former..."
2,"Trump Asks O’Reilly, ‘Do you think our country...",21st Century Wire says Regardless of what one ...,US_News,"February 6, 2017",Fake,21st century wire says regardless one thinks d...,"[21st, century, wire, says, regardless, one, t...","[21st, century, wire, say, regardless, one, th..."
3,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 5, 2016",True,(reuters) - u.s. president-elect donald trump ...,"[(, reuters, ), -, u.s., president-elect, dona...","[(, reuters, ), -, u.s., president-elect, dona..."
4,ONE LAST TIME ON OUR DIME: Mooch and Barack Ar...,"The hard working First Family, in need of an...",politics,"Aug 6, 2016",Fake,"hard working first family, need another taxpay...","[hard, working, first, family, ,, need, anothe...","[hard, working, first, family, ,, need, anothe..."


In [17]:
def return_sentences(tokenized_news):
    return " ".join([word for word in tokenized_news])

In [18]:
data['clean_text'] = data['lemmatized_news'].apply(lambda x : return_sentences(x))
data.head()

Unnamed: 0,title,text,subject,date,Target,clean_news,tokenized_news,lemmatized_news,clean_text
0,"Rights groups urge EU, Japan to consider halt ...",BANGKOK (Reuters) - Rights groups on Wednesday...,worldnews,"October 18, 2017",True,bangkok (reuters) - rights groups wednesday ur...,"[bangkok, (, reuters, ), -, rights, groups, we...","[bangkok, (, reuters, ), -, right, group, wedn...",bangkok ( reuters ) - right group wednesday ur...
1,WATCH: IRRELEVANT DEM POLITICAL ANALYST James ...,"On Friday s broadcast of HBO s Real Time, fo...",left-news,"Oct 21, 2017",Fake,"friday broadcast hbo real time, former clinton...","[friday, broadcast, hbo, real, time, ,, former...","[friday, broadcast, hbo, real, time, ,, former...","friday broadcast hbo real time , former clinto..."
2,"Trump Asks O’Reilly, ‘Do you think our country...",21st Century Wire says Regardless of what one ...,US_News,"February 6, 2017",Fake,21st century wire says regardless one thinks d...,"[21st, century, wire, says, regardless, one, t...","[21st, century, wire, say, regardless, one, th...",21st century wire say regardless one think don...
3,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 5, 2016",True,(reuters) - u.s. president-elect donald trump ...,"[(, reuters, ), -, u.s., president-elect, dona...","[(, reuters, ), -, u.s., president-elect, dona...",( reuters ) - u.s. president-elect donald trum...
4,ONE LAST TIME ON OUR DIME: Mooch and Barack Ar...,"The hard working First Family, in need of an...",politics,"Aug 6, 2016",Fake,"hard working first family, need another taxpay...","[hard, working, first, family, ,, need, anothe...","[hard, working, first, family, ,, need, anothe...","hard working first family , need another taxpa..."


In [20]:
data['Target'] = [1 if x == 'Fake' else 0 for x in data['Target']]
data.head()

Unnamed: 0,title,text,subject,date,Target,clean_news,tokenized_news,lemmatized_news,clean_text
0,"Rights groups urge EU, Japan to consider halt ...",BANGKOK (Reuters) - Rights groups on Wednesday...,worldnews,"October 18, 2017",0,bangkok (reuters) - rights groups wednesday ur...,"[bangkok, (, reuters, ), -, rights, groups, we...","[bangkok, (, reuters, ), -, right, group, wedn...",bangkok ( reuters ) - right group wednesday ur...
1,WATCH: IRRELEVANT DEM POLITICAL ANALYST James ...,"On Friday s broadcast of HBO s Real Time, fo...",left-news,"Oct 21, 2017",1,"friday broadcast hbo real time, former clinton...","[friday, broadcast, hbo, real, time, ,, former...","[friday, broadcast, hbo, real, time, ,, former...","friday broadcast hbo real time , former clinto..."
2,"Trump Asks O’Reilly, ‘Do you think our country...",21st Century Wire says Regardless of what one ...,US_News,"February 6, 2017",1,21st century wire says regardless one thinks d...,"[21st, century, wire, says, regardless, one, t...","[21st, century, wire, say, regardless, one, th...",21st century wire say regardless one think don...
3,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"December 5, 2016",0,(reuters) - u.s. president-elect donald trump ...,"[(, reuters, ), -, u.s., president-elect, dona...","[(, reuters, ), -, u.s., president-elect, dona...",( reuters ) - u.s. president-elect donald trum...
4,ONE LAST TIME ON OUR DIME: Mooch and Barack Ar...,"The hard working First Family, in need of an...",politics,"Aug 6, 2016",1,"hard working first family, need another taxpay...","[hard, working, first, family, ,, need, anothe...","[hard, working, first, family, ,, need, anothe...","hard working first family , need another taxpa..."


In [21]:
from sklearn.model_selection import train_test_split

In [22]:

X_train, X_test, y_train, y_test = train_test_split(data['clean_text'], data['Target'], test_size=0.2, random_state=5)

print(X_train.shape)
print(X_test.shape)

(35918,)
(8980,)


In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Assuming 'lemmatized_news' is a column in your DataFrame 'data'
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(data['clean_text'])

# Get feature names
feature_names_count = count_vectorizer.get_feature_names_out()

print("CountVectorizer feature names:", feature_names_count)

CountVectorizer feature names: ['00' '000' '0000' ... 'zzzzzzzz' 'zzzzzzzzzzzzz' 'émigré']


In [26]:

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data['clean_text'])
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()
print("TfidfVectorizer feature names:", feature_names_tfidf)

TfidfVectorizer feature names: ['00' '000' '0000' ... 'zzzzzzzz' 'zzzzzzzzzzzzz' 'émigré']


In [27]:

tfidf = TfidfVectorizer()
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)

print(tfidf_train.shape)
print(tfidf_test.shape)

(35918, 106465)
(8980, 106465)
