## 0. Imports

In [109]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import GridSearchCV

In [32]:
import spacy
from nltk.corpus import stopwords
import string

In [106]:
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## 1. Get data

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

## 2. Data processing

In [4]:
x_train = newsgroups_train.data
y_train = newsgroups_train.target

In [60]:
x_test = newsgroups_test.data
y_test = newsgroups_test.target

In [23]:
x_train[120]

'From: shd2001@andy.bgsu.edu (Sherlette Dixon)\nSubject: Christianity & Atheism:  an update\nOrganization: BGSU\nLines: 32\n\nFirst, I would like to thank all who sent me their opinions on the matter\nat hand.  All advice was taken to heart, if not directly used.  My friend\nfound out about the matter quite accidently.  After reading some of my\nmail, I quit from the mail reader & went about my business.  I must have\ntrashed my mail improperly, because he got on the same terminal the next\nday & saw my old messages.  He thought they were responses to a post he\nplaced in alt.atheism earlier that week, so he read some of them before\nrealizing that they were for me.  I got a message from him the next day; he\napologized for reading my mail & said that he did not want to appear to be\na snoop.  He said that he would be willing to talk to me about his views &\ndidn\'t mind doing so, especially with a friend.  So we did.  I neither\nchanged his mind nor did he change mine, as that was not

So most of the data processing:
- stopwords
- punctuation
- punctuation chains
- single character words
- stuff like '\n\t\t\t\t\t\t' and '\n'

## 3. Document processing

In [8]:
nlp = spacy.load("en_core_web_md")

In [25]:
%%time
x_train_nlp = [[x.lemma_ for x in nlp(y)] for y in x_train]

Wall time: 17min 34s


In [61]:
%%time
x_test_nlp = [[x.lemma_ for x in nlp(y)] for y in x_test]

Wall time: 11min 31s


## 2 bis. Data processing: take 2

### 2.1. Remove stopwords

In [28]:
stop_en = stopwords.words("english")

In [29]:
x_cleaned_1 = []
for x in x_train_nlp:
    x_cleaned_1.append([y for y in x if not y in stop_en])

In [62]:
x_cleaned_1_test = []
for x in x_test_nlp:
    x_cleaned_1_test.append([y for y in x if not y in stop_en])

### 2.2. Remove punct

In [33]:
x_cleaned_2 = []
for x in x_cleaned_1:
    x_cleaned_2.append([y for y in x if not y in list(string.punctuation)])

In [63]:
x_cleaned_2_test = []
for x in x_cleaned_1_test:
    x_cleaned_2_test.append([y for y in x if not y in list(string.punctuation)])

### 2.3 Remove other useless stuff

In [35]:
useless = ["-PRON-"]

In [36]:
x_cleaned_3 = []
for x in x_cleaned_2:
    x_cleaned_3.append([y for y in x if not y in useless])

In [64]:
x_cleaned_3_test = []
for x in x_cleaned_2_test:
    x_cleaned_3_test.append([y for y in x if not y in useless])

### 2.4 Remove \n and '--'

In [37]:
x_cleaned_4 = []
for x in x_cleaned_3:
    x_cleaned_4.append([y for y in x if not ("--" in y or '\n' in y) ])

In [65]:
x_cleaned_4_test = []
for x in x_cleaned_3_test:
    x_cleaned_4_test.append([y for y in x if not ("--" in y or '\n' in y) ])

### 2.5 Join together

In [39]:
x_cleaned = [" ".join(y) for y in x_cleaned_4]

In [66]:
x_cleaned_test = [" ".join(y) for y in x_cleaned_4_test]

In [124]:
x_train[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [123]:
x_cleaned[0]

'lerxst@wam.umd.edu thing subject car nntp posting host rac3.wam.umd.edu organization university maryland college park line 15 wonder anyone enlighten car see day 2-door sport car look late 60s/ early 70 call bricklin door really small addition front bumper separate rest body know anyone tellme model name engine spec year production car make history whatev info funky look car please e mail thank il bring neighborhood lerxst'

## 4. Splitting

Already taken care of :-)

In [43]:
cnt = Counter(y_train)

In [48]:
cnt

Counter({0: 480,
         1: 584,
         2: 591,
         3: 590,
         4: 578,
         5: 593,
         6: 585,
         7: 594,
         8: 598,
         9: 597,
         10: 600,
         11: 595,
         12: 591,
         13: 594,
         14: 593,
         15: 599,
         16: 546,
         17: 564,
         18: 465,
         19: 377})

## 5. Feature representation

In [55]:
vec = TfidfVectorizer()
x_train_vec = vec.fit_transform(x_train)
x_train_vec.shape

(11314, 130107)

In [75]:
vec = TfidfVectorizer()
x_train_vec = vec.fit_transform(x_cleaned)
x_train_vec.shape

(11314, 119777)

## 6. Metric and algo

In [99]:
clf = LinearSVC(C=1, multi_class='ovr', dual=True)

In [100]:
%%time
clf.fit(x_train_vec, y_train)

Wall time: 2.24 s


LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

## 7. Validation

In [101]:
x_test_vec = vec.transform(x_cleaned_test)

In [102]:
y_predict = clf.predict(x_test_vec)

In [103]:
print("accuracy: ", accuracy_score(y_pred=y_predict, y_true=y_test))
print("precision: ", precision_score(y_pred=y_predict, y_true=y_test, average= "micro"))
print("recall: ", recall_score(y_pred=y_predict, y_true=y_test, average= "micro"))
print("f1: ", f1_score(y_pred=y_predict, y_true=y_test, average= "micro"))

accuracy:  0.8538236856080722
precision:  0.8538236856080722
recall:  0.8538236856080722
f1:  0.8538236856080722


In [107]:
def show_top10(classifier, vectorizer, categories):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categories):
        top10 = np.argsort(classifier.coef_[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))

In [108]:
show_top10(clf, vec, newsgroups_train.target_names)

alt.atheism: mangoe rushdie jaeger atheists cobb wingate islamic atheist keith atheism
comp.graphics: animation cview tiff polygon pov 3do graphics 3d image graphic
comp.os.ms-windows.misc: nt winqvt download ini file ax win3 driver cica windows
comp.sys.ibm.pc.hardware: jumper scsi monitor fastmicro irq vlb 486 pc ide gateway
comp.sys.mac.hardware: se lciii iisi lc centris duo quadra apple powerbook mac
comp.windows.x: expo xpert xlib window lcs server xterm x11r5 widget motif
misc.forsale: camera include distribution wanted condition sell ship forsale offer sale
rec.motorcycles: harley kawasaki dog helmet rider bmw ride motorcycle bike dod
rec.sport.baseball: braves giants tigers stadium cub yankee pitch sox phillies baseball
rec.sport.hockey: cup bruins goal coach espn play team playoff nhl hockey
sci.crypt: encrypt nsa crypto security wiretap pgp tap encryption key clipper
sci.electronics: ee explode power scope voltage 256k electronic electronics 8051 circuit
sci.med: pitt krillea

## 8. Parameter tuning

In [114]:
parameters = {'C':[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.5, 2], "dual":[True,False]}

In [115]:
clf = LinearSVC()

In [116]:
grid = GridSearchCV(clf, parameters)

In [118]:
grid.fit(x_train_vec, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.5, 2], 'dual': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [119]:
grid.best_params_

{'C': 1.5, 'dual': True}

In [122]:
grid.best_score_

0.9164751635142302

## 9. Into production

See video 4.5 ;-)