In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import itertools
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [4]:
news = pd.read_csv('../Data/FakeNewsData.csv')
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6335 non-null object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [6]:
news.title.astype(str)

0                            You Can Smell Hillary’s Fear
1       Watch The Exact Moment Paul Ryan Committed Pol...
2             Kerry to go to Paris in gesture of sympathy
3       Bernie supporters on Twitter erupt in anger ag...
4        The Battle of New York: Why This Primary Matters
                              ...                        
6330    State Department says it can't find emails fro...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332    Anti-Trump Protesters Are Tools of the Oligarc...
6333    In Ethiopia, Obama seeks progress on peace, se...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: title, Length: 6335, dtype: object

In [7]:
news.text.astype(str)

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [8]:
news[news['label'] == 'FAKE'].count()

Unnamed: 0    3164
title         3164
text          3164
label         3164
dtype: int64

In [9]:
news[news['label'] =='REAL'].count()

Unnamed: 0    3171
title         3171
text          3171
label         3171
dtype: int64

In [10]:
# unnecessary column
news.drop('Unnamed: 0', axis=1, inplace=True)

In [11]:
news.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [12]:
# Lowering the case of the text in the non-target rows
news['title'] = news.title.str.lower()
news['text'] = news.text.str.lower()

In [13]:
news.head()

Unnamed: 0,title,text,label
0,you can smell hillary’s fear,"daniel greenfield, a shillman journalism fello...",FAKE
1,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,kerry to go to paris in gesture of sympathy,u.s. secretary of state john f. kerry said mon...,REAL
3,bernie supporters on twitter erupt in anger ag...,"— kaydee king (@kaydeeking) november 9, 2016 t...",FAKE
4,the battle of new york: why this primary matters,it's primary day in new york and front-runners...,REAL


In [14]:
y = news.label
X = news.drop('label', axis=1)

In [15]:
X.head()

Unnamed: 0,title,text
0,you can smell hillary’s fear,"daniel greenfield, a shillman journalism fello..."
1,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...
2,kerry to go to paris in gesture of sympathy,u.s. secretary of state john f. kerry said mon...
3,bernie supporters on twitter erupt in anger ag...,"— kaydee king (@kaydeeking) november 9, 2016 t..."
4,the battle of new york: why this primary matters,it's primary day in new york and front-runners...


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [17]:
print(X_train.shape)
print(y_train.shape)

(5068, 2)
(5068,)


In [18]:
print(X_test.shape)
print(y_test.shape)

(1267, 2)
(1267,)


In [19]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/reuben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")

title = X_train['title']

test_one = title.apply(word_tokenize)

In [21]:
X_train['title'] = X_train.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
X_train['text'] = X_train.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
X_train.head()

Unnamed: 0,title,text
1142,"[alabama, sen., sessions, backs, trump, ’, s, ...","[donald, trump, received, a, key, endorsement,..."
2654,"[as, of, 6:00, am, november, 6th, ,, trump, is...","[nina, november, 6, ,, 2016, @, 2:39, pm, poli..."
5395,"[time, :, investigating, hillary, is, an, atta...","[time, :, investigating, hillary, is, an, atta..."
1170,"[women, should, vote, with, their, husbands]","[taki, 's, magazine, october, 28, ,, 2016, thi..."
4371,"[pakistan, police, detain, dozens, of, imran, ...","[pakistan, pakistan, 's, cricketer, turned, po..."
