In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier

## Clean and Vectorize Data

In [None]:
# Load the training CSV, discard the unused 'author' column, and drop every
# row that has a missing or empty-string field. Written as a single chain with
# no in-place mutation, so the cleaned frame has one clear definition.
df = (
    pd.read_csv('~/fake-news/train.csv')
    .drop(columns='author')
    .replace('', np.nan)   # treat empty strings as missing values
    .dropna()              # default how='any': drop rows with any missing field
)

# Hold out half of the rows for evaluation. The fixed seed makes the split,
# and therefore the reported accuracy, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=.5, random_state=42,
)

# Feature-extraction steps placed in front of whichever classifier is chosen:
# raw text -> token counts -> TF-IDF weights.
base_pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
]

## Classification

<mark>Run only one of the option cells below. Each one overwrites `classifier_name` and `text_clf`, so the last option cell you run determines which classifier is trained.</mark>

### Option 1: Naïve Bayes

In [None]:
# Multinomial naïve Bayes appended as the final step after the shared
# vectorizer and TF-IDF steps.
classifier_name = "naïve Bayes"
text_clf = Pipeline(steps=[*base_pipeline_steps, ('clf', MultinomialNB())])

### Option 2: Stochastic Gradient Descent

In [None]:
# Linear classifier trained with stochastic gradient descent, appended as the
# final step after the shared vectorizer and TF-IDF steps.
classifier_name = "stochastic gradient descent"
sgd_classifier = SGDClassifier(
    loss='perceptron',
    alpha=1e-3,
    random_state=42,  # fixed seed for the classifier's own randomness
    max_iter=5,
    tol=None,         # no tolerance-based stopping criterion
)
text_clf = Pipeline(steps=[*base_pipeline_steps, ('clf', sgd_classifier)])

## Train Classifier

In [None]:
# Fit the selected pipeline on the training split. Left as the cell's last
# expression so the notebook displays the fitted pipeline.
text_clf.fit(X=x_train, y=y_train)

## Evaluate Classifier Against Test Data

In [None]:
# Predict labels for the held-out texts and compare them with the true labels.
predicted = text_clf.predict(x_test)
is_correct = predicted == y_test

# Accuracy is the fraction of held-out rows whose prediction matches.
accuracy = np.mean(is_correct)

print("classifier:", classifier_name)
print("accuracy:", accuracy)

In [None]:
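# Optional: print a per-class classification report (currently disabled).
# NOTE(review): target_names are matched to the labels in sorted order, so
# confirm that the lowest label value really means 'fake' before enabling.
# print(metrics.classification_report(y_test, predicted,
#     target_names=['fake', 'real']))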