In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier

In [None]:
# Load the training data and discard rows that cannot be used for modelling.
# The 'author' column is removed before dropna() so that a missing author
# alone does not cause a row to be dropped. Written as a single method chain
# instead of a series of inplace=True mutations.
df = (
    pd.read_csv('~/fake-news/train.csv')
    .drop(columns='author')
    .replace('', np.nan)   # treat empty strings as missing values
    .dropna()              # drop rows with a missing value in any remaining column
)

# Hold out half of the rows for evaluation. random_state pins the shuffle so
# the train/test partition is the same after every kernel restart; without it
# the evaluation score changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=.5, random_state=42,
)

# Feature-extraction steps shared by the classifier pipelines:
# raw text -> token counts -> TF-IDF weights.
base_pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
]

In [None]:
# CLASSIFICATION OPTION 1
# Run either this cell or the Option 2 cell, not both: each one assigns
# `text_clf`, so whichever runs last replaces the other.

# Multinomial Naive Bayes classifier appended to the shared feature steps
text_clf = Pipeline(steps=[*base_pipeline_steps, ('clf', MultinomialNB())])

In [None]:
# CLASSIFICATION OPTION 2
# Run either this cell or the Option 1 cell, not both: each one assigns
# `text_clf`, so whichever runs last replaces the other.

# Support Vector Machine classification (hinge loss, L2 penalty, trained
# with SGD) appended to the shared feature steps
text_clf = Pipeline(steps=[
    *base_pipeline_steps,
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=5,
        tol=None,
        random_state=42,
    )),
])

In [None]:
# Train whichever pipeline is currently bound to `text_clf` on the training split
text_clf.fit(X=x_train, y=y_train)

In [None]:
# Predict labels for the held-out texts, then display the fraction of
# predictions that match the true labels (accuracy)
predicted = text_clf.predict(x_test)
(predicted == y_test).mean()

In [None]:
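# Per-class precision/recall/F1 report (currently disabled).
# NOTE(review): before enabling, confirm that the order of target_names
# matches the sorted label values (i.e. that the lowest label means 'fake').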
# print(metrics.classification_report(y_test, predicted,
#     target_names=['fake', 'real']))