In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import itertools
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
news = pd.read_csv('../Data/FakeNewsData.csv')
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6335 non-null object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [5]:
news.text.astype(str)

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [6]:
news[news['label'] == 'FAKE'].count()

Unnamed: 0    3164
title         3164
text          3164
label         3164
dtype: int64

In [7]:
news[news['label'] =='REAL'].count()

Unnamed: 0    3171
title         3171
text          3171
label         3171
dtype: int64

In [8]:
# unnecessary columns
news.drop('Unnamed: 0', axis=1, inplace=True)
news.drop('title', axis=1, inplace=True)

In [6]:
news.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


In [9]:
# Lowering the case of the text in the non-target rows
news['text'] = news.text.str.lower()

In [10]:
news.head()

Unnamed: 0,text,label
0,"daniel greenfield, a shillman journalism fello...",FAKE
1,google pinterest digg linkedin reddit stumbleu...,FAKE
2,u.s. secretary of state john f. kerry said mon...,REAL
3,"— kaydee king (@kaydeeking) november 9, 2016 t...",FAKE
4,it's primary day in new york and front-runners...,REAL


In [45]:
y = news['label']
X = news.drop('label', axis=1)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [47]:
print(X_train.shape)
print(y_train.shape)

(5068, 1)
(5068,)


In [48]:
print(X_test.shape)
print(y_test.shape)

(1267, 1)
(1267,)


In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/reuben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [49]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5068 entries, 1142 to 860
Data columns (total 1 columns):
text    5068 non-null object
dtypes: object(1)
memory usage: 79.2+ KB


In [50]:
tokenize = word_tokenize()
X_train['text'] = X_train['text'].apply(word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [82]:
test = X_train.text.apply(re.sub("[^a-zA-Z]", "", X_train.text))

TypeError: expected string or bytes-like object

In [81]:
def custom_tokenize(text):
    if not text:
        text = ''
    return word_tokenize(text)
test = X_train.text.apply(custom_tokenize)

TypeError: expected string or bytes-like object

In [55]:
# tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
tokenizer = RegexpTokenizer(r'\w+')

title = tokenizer.tokenize('users-:123 Have a great day!')

In [58]:
test = X_train.apply(tokenizer.tokenize, axis=1)

TypeError: ('expected string or bytes-like object', 'occurred at index 1142')

In [75]:
pd.set_option('display.width', 1000)

In [72]:
print(X_train.iloc[0])

text    [donald, trump, received, a, key, endorsement,...
Name: 1142, dtype: object


In [56]:
print(title)

['users', '123', 'Have', 'a', 'great', 'day']


In [73]:
X_train.head()

Unnamed: 0,text
1142,"[donald, trump, received, a, key, endorsement,..."
2654,"[nina, november, 6, ,, 2016, @, 2:39, pm, poli..."
5395,"[time, :, investigating, hillary, is, an, atta..."
1170,"[taki, 's, magazine, october, 28, ,, 2016, thi..."
4371,"[pakistan, pakistan, 's, cricketer, turned, po..."


In [52]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5068 entries, 1142 to 860
Data columns (total 1 columns):
text    5068 non-null object
dtypes: object(1)
memory usage: 79.2+ KB


In [19]:
# X_train['text'] = X_train.text.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [103]:
test = X_train.replace(r'[^a-zA-z]', '')

In [112]:
test 

Unnamed: 0,text
1142,"[donald, trump, received, a, key, endorsement,..."
2654,"[nina, november, 6, ,, 2016, @, 2:39, pm, poli..."
5395,"[time, :, investigating, hillary, is, an, atta..."
1170,"[taki, 's, magazine, october, 28, ,, 2016, thi..."
4371,"[pakistan, pakistan, 's, cricketer, turned, po..."
...,...
3772,"[what, happened, was, less, a, debate, among, ..."
5191,"[clinton, ,, fbigate, and, the, true, depth, o..."
5226,"[fearing, election, day, trouble, ,, some, us,..."
5390,"[president, obama, ’, s, appearance, at, a, to..."


In [53]:
X_train["text"] = X_train['text'].replace(r'[^a-zA-z]', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [54]:
X_train

Unnamed: 0,text
1142,"[donald, trump, received, a, key, endorsement,..."
2654,"[nina, november, 6, ,, 2016, @, 2:39, pm, poli..."
5395,"[time, :, investigating, hillary, is, an, atta..."
1170,"[taki, 's, magazine, october, 28, ,, 2016, thi..."
4371,"[pakistan, pakistan, 's, cricketer, turned, po..."
...,...
3772,"[what, happened, was, less, a, debate, among, ..."
5191,"[clinton, ,, fbigate, and, the, true, depth, o..."
5226,"[fearing, election, day, trouble, ,, some, us,..."
5390,"[president, obama, ’, s, appearance, at, a, to..."


In [None]:
Punc_num = ['`','~','!','@',]
for row in X_train.text:
    

In [22]:
stop = set(stopwords.words('english'))

In [23]:
print(stop)

{"wasn't", 'from', "you're", 'just', 'be', 'shouldn', 'mustn', 'then', 'we', 'them', 'there', 'aren', "you'll", "haven't", 'their', 'all', 'that', 'each', 'ma', 'were', 'm', 'can', "doesn't", "you'd", 'both', 'by', 'didn', 'on', 'when', 'same', "mightn't", 'the', 'between', 'her', 'not', 'weren', 'o', 'its', 'are', 'him', 'hers', 's', "aren't", 'no', 'where', 'your', 'most', "it's", 'did', 'himself', 'wouldn', 'y', 'his', 'against', 'she', 're', "hasn't", 'won', "mustn't", 'so', 'under', 'as', 'herself', 'have', 'than', 'ours', 'wasn', 'but', 'because', "weren't", 'does', 't', "shan't", 'out', 'our', 'of', 'been', 'how', 'few', 'here', 'now', 'during', "shouldn't", 'nor', 'this', 'own', 'haven', 'more', 'who', 'such', 'while', 'further', "didn't", 'should', 'very', 'themselves', 'once', 'off', 'only', 'couldn', 'through', 'in', 'itself', 'until', 'ain', 'is', 'an', 'll', "won't", "wouldn't", 'having', "couldn't", "that'll", 'theirs', 'was', 'hadn', 'those', 'doing', 'or', 'down', 'why'

In [26]:
test = X_train['text'].apply(lambda row: [word for word in row if not word in stop])

In [27]:
test

1142    [n, l,  , r, u, p,  , r, e, c, e, v, e,  ,  , ...
2654    [n, n,  , n, v, e, b, e, r,  , 6,  ,  , 2, 0, ...
5395    [e,  ,  , n, v, e, g, n, g,  , h, l, l, r,  , ...
1170    [k,  ,  , g, z, n, e,  , c, b, e, r,  , 2, 8, ...
4371    [p, k, n,  , p, k, n,  ,  , c, r, c, k, e, e, ...
                              ...                        
3772    [w, h,  , h, p, p, e, n, e,  , w,  , l, e,  , ...
5191    [c, l, n, n,  ,  , f, b, g, e,  , n,  , h, e, ...
5226    [f, e, r, n, g,  , e, l, e, c, n,  ,  , r, u, ...
5390    [p, r, e, e, n,  , b,  ,  ,  , p, p, e, r, n, ...
860     [ , n,  , p, r, e,  , g, n,  , n, e, r, n, n, ...
Name: text, Length: 5068, dtype: object

In [113]:
PassiveAggressiveClassifier()

PassiveAggressiveClassifier()