In [1]:
import pandas as pd 
import numpy as np 

In [2]:
data_true=pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
data_false=pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

data_true['label']=1
data_false['label']=0

In [3]:
data_true.duplicated().sum()
data_false.duplicated().sum()

3

In [4]:
data_true.drop_duplicates(inplace=True)
data_false.drop_duplicates(inplace=True)

In [5]:
data_true.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [6]:
data_true[data_true['title']==None].shape[0]
data_false[data_false['title']==None].shape[0]

0

### Preprocessing

In [7]:
import nltk 
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
import spacy

Stop_words=stopwords.words('english')
model=spacy.load('en_core_web_sm')

def lemmatize(text,model):
    doc=model(text)
    lemmatized_tokens=[i.lemma_ for i in doc if i.is_punct==False and i.is_stop==False]
    return ' '.join(lemmatized_tokens)

In [8]:
### Exploring the data 
data_true.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [9]:
###Now concatinate the two data frames 
data=pd.concat([data_true,data_false],axis=0)
data.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [10]:
### Now provide the new indexes 
data.shape

(44689, 5)

In [11]:
import random 
random.seed(42)

Index=np.arange(0,44689,1)
random.shuffle(Index)

data.index=Index

In [12]:
data.sort_index(inplace=True)

In [13]:
### there are duplicate rows present in the data so we habe to handle them first 
data.drop_duplicates('text',inplace=True)
data['text'].duplicated().sum()

0

In [14]:
data.drop_duplicates('title',inplace=True)

In [15]:
### Now lets remove the subject and data as they are not relvent to the truth value of the news 
data.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [16]:
data.drop(['subject','date'],axis=1,inplace=True)

In [17]:
### we would only try to take a small sample of the bigger subject because of the limited resources
data_sample=data.iloc[1:1000,:]

In [18]:
data_sample.shape

(999, 3)

In [19]:
import warnings 
warnings.filterwarnings('ignore')

data_sample['text']=data_sample['text'].apply(lambda x:lemmatize(x,model))

In [20]:
data_sample['label'].value_counts()

label
0    512
1    487
Name: count, dtype: int64

In [21]:
from imblearn.over_sampling import RandomOverSampler 

oversampler=RandomOverSampler()

X=data_sample['text'].values
y=data_sample['label'].values

X_resample,y_resample=oversampler.fit_resample(X.reshape(-1,1),y)

In [22]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline 
from sklearn.metrics import classification_report,accuracy_score


X_resample=X_resample.reshape(-1)

X_train,X_test,y_train,y_test=train_test_split(X_resample,y_resample,train_size=0.8,random_state=42,stratify=y_resample)

clf=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('Learning Model',LogisticRegression())
])

clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.93      0.96       103
           1       0.94      0.99      0.96       102

    accuracy                           0.96       205
   macro avg       0.96      0.96      0.96       205
weighted avg       0.96      0.96      0.96       205



In [23]:
print(accuracy_score(y_test,y_pred))

0.9609756097560975
