In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier

## Clean and Vectorize Data

In [None]:
# Load the training set and discard unusable rows.
# Written as a single chain that produces a fresh frame, so re-running this
# cell always starts from the CSV on disk rather than from an already
# mutated `df`.
df = (
    pd.read_csv('../data/train.csv')
    .drop(columns='id')       # the row id is not used below
    .replace('', np.nan)      # treat empty strings as missing...
    .dropna()                 # ...and drop every row with a missing field
)

# Turn the numeric labels into readable class names.
# The result is assigned back to the column: calling
# `df['label'].replace(..., inplace=True)` acts on a column selection, which
# pandas warns about and which stops updating `df` under Copy-on-Write.
df['label'] = df['label'].replace({0: 'reliable', 1: 'unreliable'})

# 50/50 hold-out split. The seed is fixed (same value the SGD option uses)
# so the split, and therefore the reported accuracy, is reproducible.
train, test = train_test_split(df, test_size=0.5, random_state=42)
x_train = train['text']
y_train = train['label']
x_test = test['text']
y_test = test['label']

# Feature-extraction steps shared by every classifier option below:
# token counts followed by TF-IDF weighting.
base_pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
]

In [None]:
# Preview the first rows of the training split as a quick sanity check.
train.head()

In [None]:
# Preview the first rows of the held-out test split.
test.head()

## Classification

<mark>Run only one of the option cells below—each one overwrites `classifier_name` and `text_clf`, so the last one executed determines which classifier gets trained.</mark>

### Option 1: Naïve Bayes

In [None]:
# Option 1: multinomial naïve Bayes appended to the shared
# count-vectorizer + TF-IDF steps.
classifier_name = "naïve Bayes"
text_clf = Pipeline(steps=[*base_pipeline_steps, ('clf', MultinomialNB())])

### Option 2: Stochastic Gradient Descent

In [None]:
# Option 2: linear classifier fitted by stochastic gradient descent with the
# perceptron loss, appended to the shared count-vectorizer + TF-IDF steps.
classifier_name = "stochastic gradient descent"
text_clf = Pipeline(steps=[
    *base_pipeline_steps,
    ('clf', SGDClassifier(
        loss='perceptron',
        alpha=1e-3,
        random_state=42,   # fixed seed for repeatable fits
        max_iter=5,
        tol=None,          # no tolerance-based stopping; max_iter bounds training
    )),
])

### Option 3: Passive Aggressive

In [None]:
# Option 3: passive-aggressive classifier with the squared hinge loss,
# appended to the shared count-vectorizer + TF-IDF steps.
classifier_name = "passive aggressive"
text_clf = Pipeline(steps=[
    *base_pipeline_steps,
    ('clf', PassiveAggressiveClassifier(loss='squared_hinge')),
])

## Train Classifier

In [None]:
# Fit whichever pipeline was selected above (vectorizer, TF-IDF, classifier)
# on the training texts and their labels.
text_clf.fit(x_train, y_train)

## Evaluate Classifier Against Test Data

In [None]:
# Score the fitted pipeline on the held-out split.
prediction = text_clf.predict(x_test)

# Attach the predictions by rebinding `test` to a new frame. Assigning a
# column directly into the frame returned by train_test_split can raise
# SettingWithCopyWarning; `assign` avoids that and is safe to re-run
# (an existing `prediction` column is simply replaced).
test = test.assign(prediction=prediction)

# Fraction of test rows whose predicted label equals the true label.
accuracy = np.mean(prediction == y_test)

print("classifier:", classifier_name)
print("accuracy:", accuracy)
test