In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier


## Clean and Vectorize Data

In [3]:
# Clean the data Whoop whoop
df = pd.read_csv('../data/train.csv')
df.drop('id', inplace=True, axis=1)
df.replace('', np.nan, inplace=True)
df.dropna(inplace=True)

df['label'].replace(0, 'reliable', inplace=True)
df['label'].replace(1, 'unreliable', inplace=True)

train, test = train_test_split(df, test_size=0.5)
x_train = train['text']
y_train = train['label']
x_test = test['text']
y_test = test['label']

base_pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
]

In [4]:
train.head()

Unnamed: 0,title,author,text,label
14213,"After Retirement, Finding a ‘Second Career’ as...",Emily Palmer,"On an October day, Anne Davis, 76, wheeled int...",reliable
14096,How To Tie A Real Hangman’s Noose Knot,# 1 NWO Hatr,"Posted on October 27, 2016 by # 1 NWO Hatr Pub...",unreliable
3424,Girl Soldiers: Forgotten Casualties of War,Pat Hynes,"Share This \nAs secretary of State, Hillary Cl...",unreliable
2334,Protest Song Of The Week: ‘Retribution’ by Tan...,Kevin Gosztola,The colonialism of the United States rears its...,unreliable
17709,Melania Trump Rips Kathy Griffin: Beheading Ph...,Daniel Nussbaum,First Lady Melania Trump has responded to Kath...,reliable


In [5]:
test.head()

Unnamed: 0,title,author,text,label
20388,Soldier Headed Home from Deployment Says Unite...,Katherine Rodriguez,A National Guard soldier headed home from a de...,reliable
4038,"Two Houston Police Officers Shot, Manhunt Unde...",Bob Price,Two Houston Police Department officers were ho...,reliable
4567,"In Interview With Bill Maher, Obama Warns Amer...",Allison Vincent,"By Allison Vincent Election 2016 , News , Poli...",unreliable
8483,Diplomatic Deal With Turkey Upsets Israelis Wa...,Isabel Kershner,JERUSALEM — As Israeli and Turkish leaders ...,reliable
1188,"BRINK OF WAR: UK sends TANKS, DRONES and 800 S...",admin,By JOEY MILLAR \nThe show of force comes just ...,unreliable


## Classification

<mark>Only use one of the options—on run, it overrides the classifier variables.</mark>

### Option 1: Naïve Bayes

In [6]:
classifier_name = "naïve Bayes"
pipeline = Pipeline(base_pipeline_steps + [('clf', MultinomialNB())])

### Option 2: Stochastic Gradient Descent

In [7]:
classifier_name = "stochastic gradient descent"
pipeline = Pipeline(base_pipeline_steps + [
    ('clf', SGDClassifier(loss='perceptron', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
])

### Option 3: Passive Aggressive

In [8]:
classifier_name = "passive aggressive"
pipeline = Pipeline(base_pipeline_steps + [('clf', PassiveAggressiveClassifier(loss='squared_hinge'))])

## Train Classifier

In [9]:
pipeline.fit(x_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...       shuffle=True, tol=None, validation_fraction=0.1, verbose=0,
              warm_start=False))])

In [10]:
vectorizer = pipeline.steps[0][1]
classifier = pipeline.steps[2][1]

feature_weights = sorted(zip(classifier.coef_[0], vectorizer.get_feature_names()))
features = pd.DataFrame(feature_weights, columns=['weight', 'token'])

### Most Unreliable Features

In [11]:
features.head()

Unnamed: 0,weight,token
0,-6.999958,said
1,-6.527838,breitbart
2,-6.416279,twitter
3,-5.454342,2017
4,-5.342326,follow


### Most Reliable Features

In [12]:
features.tail()

Unnamed: 0,weight,token
114571,4.53573,hillary
114572,4.632024,us
114573,5.367817,october
114574,6.240606,anti
114575,6.449284,2016


## Evaluate Classifier Against Testing Data

In [13]:
prediction = pipeline.predict(x_test)

In [14]:
def outcome(row):
    if row['label'] == row['prediction']:
        return ''
    
    if row['prediction'] == 'unreliable':
        return 'false positive'
    
    return 'false negative'

test.loc[:,'prediction'] = prediction
test.loc[:,'outcome'] = test.apply(lambda row: outcome(row), axis=1)

accuracy = np.mean(prediction == y_test)

false_positives = test[test['outcome'] == 'false positive']
false_negatives = test[test['outcome'] == 'false negative']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [15]:
print('classifier:', classifier_name)
print('accuracy:', accuracy)

classifier: passive aggressive
accuracy: 0.9584381494039156


In [16]:
num_false_positives = len(false_positives.index)
num_false_negatives = len(false_negatives.index)

print('%s misclassified documents of %s documents' % (num_false_positives + num_false_negatives, len(test.index)))
print('%s false positives' % num_false_positives)
print('%s false negatives' % num_false_negatives)

380 misclassified documents of 9143 documents
157 false positives
223 false negatives


## Visualize Classifications

In [17]:
summary = test.apply(lambda x: x.str.slice(0, 100))

def highlight_misclassfied(row):

    bg_color = 'white'
    text_color = 'black'
    
    if row['outcome'] != '':
        bg_color = 'red'
        text_color = 'white'

    return ['background-color: %s; color: %s;' % (bg_color, text_color)]*len(row.values)

summary.head().style.apply(highlight_misclassfied, axis=1)

Unnamed: 0,title,author,text,label,prediction,outcome
20388,Soldier Headed Home from Deployment Says United Airlines Charged Him $200 for ’Overweight’ Military,Katherine Rodriguez,A National Guard soldier headed home from a deployment to Afghanistan says he was charged a hefty fe,reliable,reliable,
4038,"Two Houston Police Officers Shot, Manhunt Underway",Bob Price,Two Houston Police Department officers were hospitalized Tuesday after a suspected burglar reportedl,reliable,reliable,
4567,"In Interview With Bill Maher, Obama Warns America: Sitting Out This Election Is A Vote For Trump (VI",Allison Vincent,"By Allison Vincent Election 2016 , News , Politics , Videos November 5, 2016 In Interview With Bill",unreliable,unreliable,
8483,Diplomatic Deal With Turkey Upsets Israelis Wary of Conceding Too Much - The New York Times,Isabel Kershner,JERUSALEM — As Israeli and Turkish leaders announced the formal resumption of full diplomatic rel,reliable,reliable,
1188,"BRINK OF WAR: UK sends TANKS, DRONES and 800 SOLDIERS to Russian border as tensions grow",admin,By JOEY MILLAR The show of force comes just days after the Russian leader’s fleet passed near the B,unreliable,unreliable,


In [18]:
false_positives.head()

Unnamed: 0,title,author,text,label,prediction,outcome
15318,Judicial Watch: Huma Abedin Emails Show Favors...,Kristina Wong,Hillary Clinton emails recently discovered on ...,reliable,unreliable,false positive
6052,YouTube Restores Conservative ’Legal Insurrect...,Charlie Nash,YouTube has restored the official channel for ...,reliable,unreliable,false positive
4427,Amazon’s July 12 ‘Day of Action’ for Net Neutr...,Chriss W. Street,Amazon is mobilizing a July 12 collective “Day...,reliable,unreliable,false positive
2689,Ann Coulter: All We Need Is Love ... and Depor...,Ann Coulter,"In Britain, as in the U. S. when an Islamic te...",reliable,unreliable,false positive
8101,Report: Donald Trump’s 2016 TV Ads Promised Po...,Neil Munro,Donald Trump’s 2016 TV campaign were far more...,reliable,unreliable,false positive


In [19]:
false_negatives.head()

Unnamed: 0,title,author,text,label,prediction,outcome
18680,Mobilizing in Guatemala,Rowan Wolf,[Photo: Chiquibul Forest Reserve near Guatemal...,unreliable,reliable,false negative
18432,NFL Warns Clenbuterol Level So High in Beef th...,Amando Flavio,The National Football League (NFL) in the Unit...,unreliable,reliable,false negative
12877,Donald Trump Elected 45th President Of The Uni...,Roosh Valizadeh,Via AP : \nDonald Trump was elected America’s ...,unreliable,reliable,false negative
14430,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Fed Up,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,unreliable,reliable,false negative
17966,Occupier David Fry released from jail: ‘I’m re...,Admin,"PORTLAND, Ore. (KOIN) — The last person to sur...",unreliable,reliable,false negative
