In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier

## Clean and Vectorize Data

In [None]:
# Load the training data, discard unusable rows, and split it into train/test halves.
df = pd.read_csv('../data/train.csv')
df = (
    df.drop(columns='id')     # drop the `id` column
      .replace('', np.nan)    # treat empty strings as missing values ...
      .dropna()               # ... so they are removed together with real NaNs
)

# Map the numeric labels to readable class names. The result is assigned back
# to the column instead of calling `replace(..., inplace=True)` on the column
# selection: under pandas Copy-on-Write the in-place call only modifies a
# temporary object and would leave `df` unchanged.
df['label'] = df['label'].replace({0: 'reliable', 1: 'unreliable'})

# Hold out half of the rows for evaluation. The fixed seed makes the split
# identical on every run, so results can be reproduced after a kernel restart.
train, test = train_test_split(df, test_size=0.5, random_state=42)
x_train = train['text']
y_train = train['label']
x_test = test['text']
y_test = test['label']

# Feature-extraction steps shared by every classifier option:
# raw token counts followed by TF-IDF weighting.
base_pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
]

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Preview the first rows of the held-out test split.
test.head()

## Classification

<mark>Run only one of the option cells below. Each one assigns `classifier_name` and `pipeline`, so whichever option cell ran last determines the classifier that gets trained.</mark>

### Option 1: Naïve Bayes

In [None]:
# Option 1: multinomial naïve Bayes on top of the shared vectorizing steps.
classifier_name = "naïve Bayes"
pipeline = Pipeline(steps=[*base_pipeline_steps, ('clf', MultinomialNB())])

### Option 2: Stochastic Gradient Descent

In [None]:
# Option 2: linear model trained with stochastic gradient descent (perceptron loss).
classifier_name = "stochastic gradient descent"
pipeline = Pipeline(steps=[
    *base_pipeline_steps,
    ('clf', SGDClassifier(
        loss='perceptron',
        alpha=1e-3,
        max_iter=5,
        tol=None,  # no tolerance-based stopping; training runs for max_iter epochs
        random_state=42,
    )),
])

### Option 3: Passive Aggressive

In [None]:
# Option 3: passive-aggressive linear classifier with squared hinge loss.
classifier_name = "passive aggressive"
pipeline = Pipeline(base_pipeline_steps + [
    # The classifier shuffles the training data by default, so a fixed seed is
    # needed for repeatable results (same seed as the SGD option).
    ('clf', PassiveAggressiveClassifier(loss='squared_hinge', random_state=42)),
])

## Train Classifier

In [None]:
# Fit the vectorizer, TF-IDF transform and the selected classifier on the training texts and labels.
pipeline.fit(x_train, y_train)

In [None]:
# Look the fitted steps up by name rather than by position, so the lookup
# keeps working if extra preprocessing steps are added to the pipeline.
vectorizer = pipeline.named_steps['vect']
classifier = pipeline.named_steps['clf']

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    tokens = vectorizer.get_feature_names_out()
else:
    tokens = vectorizer.get_feature_names()

# One weight per token, where a positive value is evidence for the second
# class in `classifier.classes_`. scikit-learn keeps classes in sorted order,
# so with the labels 'reliable'/'unreliable' positive means 'unreliable'.
if hasattr(classifier, 'feature_log_prob_'):
    # MultinomialNB no longer exposes `coef_` (removed in scikit-learn 1.1).
    # Use the difference of the per-class token log-probabilities instead,
    # which has the same sign convention as a linear model's coefficients.
    token_weights = classifier.feature_log_prob_[1] - classifier.feature_log_prob_[0]
else:
    token_weights = classifier.coef_[0]

# Sort from the largest to the smallest weight so that `features.head()` shows
# the strongest 'unreliable' tokens and `features.tail()` the strongest
# 'reliable' ones, matching the "Most Unreliable Features" and
# "Most Reliable Features" sections that follow.
feature_weights = sorted(zip(token_weights, tokens), reverse=True)
features = pd.DataFrame(feature_weights, columns=['weight', 'token'])

### Most Unreliable Features

In [None]:
# Show the first five rows of the sorted feature table.
features.head()

### Most Reliable Features

In [None]:
# Show the last five rows of the sorted feature table.
features.tail()

## Evaluate Classifier Against Testing Data

In [None]:
# Predict a label for every held-out test document with the fitted pipeline.
prediction = pipeline.predict(x_test)

In [None]:
def outcome(row):
    """Describe how the prediction in ``row`` compares with its true label.

    ``row`` must support ``row['label']`` and ``row['prediction']``.

    Returns an empty string when the two agree, ``'false positive'`` when the
    prediction is ``'unreliable'`` but the label differs, and
    ``'false negative'`` for any other disagreement.
    """
    predicted = row['prediction']

    if row['label'] == predicted:
        return ''

    return 'false positive' if predicted == 'unreliable' else 'false negative'

# Attach the predictions to a new frame instead of writing into `test` through
# `.loc`: `test` is a selection produced by train_test_split, and setting
# columns on such a selection can trigger pandas' SettingWithCopyWarning.
test = test.assign(prediction=prediction)

# Label every row as correct (''), 'false positive' or 'false negative'.
test['outcome'] = test.apply(outcome, axis=1)

# Fraction of test documents whose predicted label equals the true label.
accuracy = np.mean(prediction == y_test)

false_positives = test[test['outcome'] == 'false positive']
false_negatives = test[test['outcome'] == 'false negative']

In [None]:
# Report which classifier was evaluated and its accuracy on the held-out test rows.
print('classifier:', classifier_name)
print('accuracy:', accuracy)

In [None]:
# Count each kind of misclassification and report it against the size of the test set.
num_false_positives = false_positives.shape[0]
num_false_negatives = false_negatives.shape[0]

print(
    '%s misclassified documents of %s documents'
    % (num_false_positives + num_false_negatives, test.shape[0])
)
print('%s false positives' % num_false_positives)
print('%s false negatives' % num_false_negatives)

## Visualize Classifications

In [None]:
# Truncate every text column to its first 100 characters so the table stays
# readable. Numeric columns are passed through unchanged: the `.str` accessor
# raises on non-string data, which would otherwise break this cell whenever a
# column (for example an unmapped numeric label) is not text.
summary = test.apply(
    lambda column: column
    if pd.api.types.is_numeric_dtype(column)
    else column.str.slice(0, 100)
)

def highlight_misclassfied(row):
    """Return one CSS style string per cell of ``row``.

    Rows whose ``'outcome'`` value is not the empty string are rendered white
    on red; all other rows are rendered black on white.
    """
    is_misclassified = row['outcome'] != ''
    bg_color, text_color = ('red', 'white') if is_misclassified else ('white', 'black')

    cell_style = 'background-color: %s; color: %s;' % (bg_color, text_color)
    return [cell_style] * len(row.values)

# Render the first rows of the truncated table, styling each row with highlight_misclassfied.
summary.head().style.apply(highlight_misclassfied, axis=1)

In [None]:
# Preview the first test rows whose outcome is 'false positive'.
false_positives.head()

In [None]:
# Preview the first test rows whose outcome is 'false negative'.
false_negatives.head()