# Classification exercises

This is a modified, more compact version of the SVM text classification code from the classification notebook. You may wish to use this as a starting point for doing some of the exercises.

(This first piece of code just imports the necessary libraries.)

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from eli5 import show_weights

## Loading and splitting data

In [None]:
# The TSV file is read without a header row, so the three column names are
# supplied explicitly here.
data = read_csv(
    '../data/enron-spam/enron-spam-subjects.tsv',
    sep='\t',
    names=('class', 'id', 'text'),
)
# Keep only the label and text columns (the IDs are not used below).
data = data.loc[:, ['class', 'text']]

# 60% of the rows go to training; the remaining 40% is then halved into
# development and test sets. Both splits are seeded so they are repeatable.
train_data, devel_and_test_data = train_test_split(
    data,
    test_size=0.4,
    random_state=1234,
)
devel_data, test_data = train_test_split(
    devel_and_test_data,
    test_size=0.5,
    random_state=5678,
)

# Separate labels (Y) from raw texts for each subset.
train_Y = train_data['class']
train_texts = train_data['text']

devel_Y = devel_data['class']
devel_texts = devel_data['text']

test_Y = test_data['class']
test_texts = test_data['text']

## Featurization and vectorization

In [None]:
def space_tokenizer(text):
    """Split `text` into tokens on runs of whitespace.

    A named function (rather than a lambda bound to a name) shows up with a
    meaningful name in tracebacks and in the vectorizer's repr.
    """
    return text.split()


# Unigram and bigram TF-IDF features over whitespace-separated tokens.
vectorizer = TfidfVectorizer(tokenizer=space_tokenizer, ngram_range=(1,2))

# Fit on the training texts only; fit_transform gives the same result as
# fit() followed by transform() but tokenizes the training texts just once.
train_X = vectorizer.fit_transform(train_texts)

# Development and test texts are mapped with the already-fitted vectorizer.
devel_X = vectorizer.transform(devel_texts)
test_X = vectorizer.transform(test_texts)

## Training and prediction

In [None]:
%%time
classifier = LinearSVC(
    C=1.0,
    class_weight=None,
    max_iter=1000,
    loss='squared_hinge'
)
classifier.fit(train_X, train_Y)

In [None]:
%%time
# Predict labels for the development set with the trained classifier.
# pred_Y is compared against devel_Y in the evaluation cell.
pred_Y = classifier.predict(devel_X)

## Evaluation and analysis

In [None]:
accuracy = accuracy_score(devel_Y, pred_Y)

# confusion_matrix orders its rows (true labels) and columns (predictions) by
# `labels`, and the `tn, fp, fn, tp` unpacking treats the SECOND label as the
# positive class. Listing 'ham' first therefore makes 'spam' the positive
# class, so precision/recall/F-score below describe spam detection.
tn, fp, fn, tp = confusion_matrix(devel_Y, pred_Y, labels=['ham', 'spam']).ravel()

# Guard the ratios: if nothing is predicted (or nothing is truly) positive,
# report 0 instead of dividing by zero and getting NaN plus a warning.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
if (precision + recall) > 0:
    f_score = 2 * precision * recall / (precision + recall)
else:
    f_score = 0.0

print('accuracy {:.2%}'.format(accuracy))
print('precision {:.2%}, recall {:.2%}, f-score {:.2%}'.format(precision, recall, f_score))

In [None]:
# Display the trained classifier's feature weights. The fitted vectorizer is
# passed as `vec`, presumably so eli5 can label weights with the corresponding
# tokens/n-grams rather than column indices (see the eli5 docs to confirm).
show_weights(classifier, vec=vectorizer)