# IMPORTING LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
# Fetch the NLTK stop-word corpus that `stopwords.words('english')` reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sruja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Show the English stop words; these are the tokens `stemming` filters out.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# PRE PROCESSING DATA

In [4]:
# Load the labelled training articles from the working directory and preview the first rows.
data = pd.read_csv('fake_train.csv')
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
# (rows, columns) of the freshly loaded frame.
data.shape

(20800, 5)

In [6]:
# Count missing values per column before any cleaning.
data.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# Replace every missing value with an empty string so the text columns can be
# concatenated later without producing NaN.
data = data.fillna('')

In [8]:
# Confirm that no missing values remain after the fill.
data.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [31]:
# Text feature used for modelling: author name, one space, then the headline.
# The article body (`text`) is not part of this column.
data['content'] = data['author']+' '+data['title']

In [32]:
# Preview the combined author + title column.
data['content']

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object

In [33]:
# Independent variables: every column except the target.
X = data.drop(['label'],axis=1)
# Dependent variable: the `label` column (the final cell reports 0 as 'Reliable'
# and any other value as 'Unreliable').
Y = data['label']

In [34]:
# Class balance: returns a Series with the number of rows for each distinct label value.
Y.value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [35]:
# (rows, columns) of the feature frame.
X.shape

(20800, 5)

In [36]:
# Length of the target Series; it should match the number of rows in X.
Y.shape

(20800,)

In [37]:
# Plain-text dump of the features and of the target (shown as a one-column frame).
# `X` is already a DataFrame, so it is printed directly.
print(X)
target_frame = pd.DataFrame(Y)
print(target_frame)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [38]:
# One shared Porter stemmer instance, used by `stemming` for every token.
ste = PorterStemmer()

In [120]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per token.
_STOPWORD_CACHE = {}


def stemming(content, stemmer=None, stop_words=None):
    """Normalise a piece of text into a space-separated string of stemmed tokens.

    Steps: replace every character outside a-z/A-Z with a space, lower-case,
    split on whitespace, drop stop words, stem what is left, and join the
    result with single spaces.

    Parameters
    ----------
    content : str
        Raw text to clean.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the notebook-level ``ste`` stemmer.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to NLTK's English stop words,
        cached as a frozenset after the first call.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            # Membership tests on a frozenset are O(1); the original rebuilt
            # and linearly scanned the list for every single token.
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    if stemmer is None:
        stemmer = ste

    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

In [41]:
# Coerce every entry to `str`, then clean and stem it. The column is overwritten,
# so from here on `data['content']` holds the stemmed text.
data['content'] = data['content'].apply(str).apply(stemming)

In [42]:
# Show the raw article bodies; this column is not modified by the stemming step.
print(data['text'])

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        Ever get the feeling your life circles the rou...
2        Why the Truth Might Get You Fired October 29, ...
3        Videos 15 Civilians Killed In Single US Airstr...
4        Print \nAn Iranian woman has been sentenced to...
                               ...                        
20795    Rapper T. I. unloaded on black celebrities who...
20796    When the Green Bay Packers lost to the Washing...
20797    The Macy’s of today grew from the union of sev...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799      David Swanson is an author, activist, journa...
Name: text, Length: 20800, dtype: object


In [86]:
# Arrays handed to scikit-learn: stemmed author + title text as input, labels as target.
x = data['content'].values
y = data['label'].values

In [88]:
# Spot-check the stemmed input strings.
print(x)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']


In [89]:
# Spot-check the label array.
print(y)

[1 0 1 ... 0 1 1]


In [91]:
# Number of labels; there should be one per input string.
y.shape

(20800,)

In [92]:
# Hold out 20% of the rows for testing. Stratifying on the labels keeps the class
# proportions the same in the training and test splits, and random_state makes the
# split repeatable. Stratify on `y`, the array actually being split, rather than on
# the `Y` Series from an earlier cell.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

# TF-IDF stands for "term frequency-inverse document frequency"
-> TF-IDF is a numerical statistic that measures how important a word is to a document within a collection of documents.
-> Term Frequency: the number of times a word appears in a document.
-> Inverse Document Frequency: measures how rare or common a word is across all documents; rarer words receive a higher weight.

In [93]:
# Learn the vocabulary and IDF weights from the training split only, then reuse them
# to transform the test split so that no information from the test set enters the fit.
# NOTE: x_train / x_test are rebound from raw strings to TF-IDF matrices here, so this
# cell is not idempotent; re-run the train/test split cell before running it again.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [95]:
# (rows in the test split, number of features produced by the vectorizer).
x_test.shape

(4160, 15495)

# LOGISTIC REGRESSION:
Logistic regression is a statistical analysis method used to predict a binary outcome, such as yes or no, based on prior observations of a data set. A logistic regression model predicts a dependent variable by analyzing its relationship with one or more independent variables.

In [57]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [96]:
# Fit the classifier on the TF-IDF features of the training split.
model.fit(x_train,y_train)

LogisticRegression()

In [98]:
# Predict the held-out labels once and score those predictions. `model.score` would
# call `predict` on x_test a second time to compute the same mean accuracy.
prediction = model.predict(x_test)
accuracy = accuracy_score(y_test, prediction)

In [99]:
# Fraction of test rows classified correctly.
accuracy

0.9800480769230769

In [121]:
import pickle

In [122]:
# Persist the fitted TF-IDF vectorizer. The context manager closes the file handle
# (and flushes it to disk) even if pickling raises.
with open('vectorz.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [123]:
# Persist the trained classifier. The context manager closes the file handle
# (and flushes it to disk) even if pickling raises.
with open('modelz.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [124]:
# Reload the vectorizer from 'vectorz.pkl', closing the file once it has been read.
# Unpickling can execute arbitrary code: only load pickle files you created yourself.
with open('vectorz.pkl', 'rb') as vectorizer_file:
    vector_form = pickle.load(vectorizer_file)

In [125]:
# Reload the classifier from 'modelz.pkl', closing the file once it has been read.
# Unpickling can execute arbitrary code: only load pickle files you created yourself.
with open('modelz.pkl', 'rb') as model_file:
    load_model = pickle.load(model_file)

In [111]:
def fake_news(news):
    """Classify one piece of news text with the reloaded vectorizer and model.

    The text is cleaned with `stemming`, turned into a single-row feature
    matrix by `vector_form`, and passed to `load_model.predict`.

    Parameters
    ----------
    news : str
        Raw text to classify.

    Returns
    -------
    The value returned by `load_model.predict` for the one-document batch.

    NOTE(review): the vectorizer and model in this notebook are fitted on
    stemmed "author title" strings (`data['content']`), not on article
    bodies. Confirm that callers pass comparable text.
    """
    cleaned_text = stemming(news)
    features = vector_form.transform([cleaned_text])
    return load_model.predict(features)

In [118]:
val = fake_news('''In these trying times, Jackie Mason is the Voice of Reason. [In this weekâ€™s exclusive clip for Breitbart News, Jackie discusses the looming threat of North Korea, and explains how President Donald Trump could win the support of the Hollywood left if the U. S. needs to strike first.  â€œIf he decides to bomb them, the whole country will be behind him, because everybody will realize he had no choice and that was the only thing to do,â€ Jackie says. â€œExcept the Hollywood left. Theyâ€™ll get nauseous. â€ â€œ[Trump] could win the left over, theyâ€™ll fall in love with him in a minute. If he bombed them for a better reason,â€ Jackie explains. â€œLike if they have no transgender toilets. â€ Jackie also says itâ€™s no surprise that Hollywood celebrities didnâ€™t support Trumpâ€™s strike on a Syrian airfield this month. â€œThey were infuriated,â€ he says. â€œBecause it might only save lives. That doesnâ€™t mean anything to them. If it only saved the environment, or climate change! Theyâ€™d be the happiest people in the world. â€ Still, Jackie says heâ€™s got nothing against Hollywood celebs. Theyâ€™ve got a tough life in this country. Watch Jackieâ€™s latest clip above.   Follow Daniel Nussbaum on Twitter: @dznussbaum
''')

In [119]:
# `val` holds the prediction for a single document. Compare its one element as a
# scalar rather than relying on the truth value of an element-wise array comparison,
# which raises ValueError as soon as the array has more than one element.
if val[0] == 0:
    print('Reliable')
else:
    print('Unreliable')

Reliable
