In [None]:
from getdata import x_train, x_test, y_train, y_test, x_train_all, x_test_all

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tidy_data import TidySymbols
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Previous step suggested most models were comparable; linear SVC is the one used in the pipeline below
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

In [None]:
# TODO: Add dimensionality reduction to this pipeline?  
# http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html
# http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
# https://medium.com/@adi_enasoaie/easy-lsi-pipeline-using-scikit-learn-a073f2484408
    
# Classification pipeline, applied in order:
#   tidy  - project-local TidySymbols transformer
#   vect  - binary (presence/absence) counts of unigrams and bigrams
#   tfidf - TF-IDF re-weighting of those counts
#   clf   - linear support vector classifier
pipeline_steps = [
    ('tidy', TidySymbols()),
    ('vect', CountVectorizer(ngram_range=(1, 2), binary=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=0.784)),
]
clf = Pipeline(steps=pipeline_steps)

In [None]:
# Train the whole pipeline on the training split. fit() returns the pipeline
# itself, so fitting in place leaves `clf` bound to the same fitted object;
# the trailing semicolon keeps the notebook from echoing the estimator repr.
clf.fit(x_train, y_train);

In [None]:
from sklearn import metrics
# Predict labels for x_test with the fitted pipeline; y_pred feeds the
# classification report, the confusion matrix and the error inspection below.
y_pred = clf.predict(x_test)

In [None]:
# Per-class precision / recall / F1 on the test split, to 4 decimal places.
print(metrics.classification_report(y_test, y_pred, digits=4))

# Build the 2x2 confusion matrix once and reuse it for both the scalar
# counts and the labelled table.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

# Rows are the true labels ("y"), columns the predicted labels ("pred").
cm = pd.DataFrame(
    conf_matrix,
    index=pd.MultiIndex.from_product([["y"], [0, 1]]),
    columns=pd.MultiIndex.from_product([["pred"], [0, 1]]),
)
cm

# Best 2017-08-27
```

pred
0	1
y	0	88133	670
1	450	5612
```

In [None]:
# Investigate fp fn
# This cell: false positives (true label 0, predicted 1).
import pandas as pd
pd.options.display.max_colwidth = 10000  # wide enough to show the full text column
# One row per test example: true label, predicted label, the model input text
# and the "category_concat_cat" column from the full test frame.
df = pd.DataFrame({"y_test":y_test, "y_pred": y_pred, "x" : x_test, "x_c" : x_test_all["category_concat_cat"]})

f1 = df["y_test"] == 0
f2 = df["y_pred"] == 1

false_positives = df[f1 & f2]
# Show up to 20 random rows; capping at the subset size avoids the ValueError
# that .sample(20) raises when fewer than 20 rows match.
false_positives.sample(min(20, len(false_positives)))

In [None]:

# False negatives: true label 1, predicted 0.
f1 = df["y_test"] == 1
f2 = df["y_pred"] == 0
false_negatives = df[f1 & f2]
# Show up to 20 random rows; capping at the subset size avoids the ValueError
# that .sample(20) raises when fewer than 20 rows match.
false_negatives.sample(min(20, len(false_negatives)))


In [None]:
# Understand what's going on a bit more - refit so we can use proba
# LinearSVC does not provide predict_proba, so it is wrapped in
# CalibratedClassifierCV (20-fold) to obtain probability estimates.
# NOTE(review): unlike the first pipeline, CountVectorizer here omits
# binary=True - confirm whether that difference is intentional.

clf = Pipeline([
    ('tidy', TidySymbols()),
    ('vect', CountVectorizer(ngram_range = (1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', CalibratedClassifierCV(LinearSVC(C=0.784), cv=20))
])
clf = clf.fit(x_train, y_train)

# Evaluate the refitted model on its own predictions. Reusing the earlier
# y_pred would only re-report the previous (uncalibrated) pipeline. A new name
# is used so y_pred stays consistent with the data frame built from it above.
y_pred_calibrated = clf.predict(x_test)
print(metrics.classification_report(y_test, y_pred_calibrated, digits=4))

conf_matrix = metrics.confusion_matrix(y_test, y_pred_calibrated)
tn, fp, fn, tp = conf_matrix.ravel()

# Label rows with the true class ("y") and columns with the predicted class ("pred").
cm = pd.DataFrame(conf_matrix)
i1 = pd.MultiIndex.from_tuples([("y", 0),("y", 1)])
i2 = pd.MultiIndex.from_tuples([("pred", 0),("pred", 1)])
cm = cm.set_index(i1)
cm.columns = i2
cm

In [None]:
# Class-probability estimates from the calibrated pipeline for one sample title.
sample_listing = ["Yellow Dutch Bitcoin 200mg XTC Pills"]
clf.predict_proba(sample_listing)

In [None]:
# Hand-picked listing titles used to spot-check the fitted pipeline.
newx = [
    "25 x Purple DOMINO (2nd press) XTCâ€¦",
    "MDMA/ECSTASY/MOLLY- 0.5 Half Gram",
    "ECSTASY PILLS 5000 PIECES ==WARNERBROS",
    "5x 200 - 220MG Orange Tesla's",
    "50x Beige Instagram 200mg MDMA",
    "10 Dutch Mill XTC Red Godness A++ QUALITY 140mg+ mdma",
]
clf.predict(newx)

In [None]:
# Show what the first pipeline step ('tidy') produces for the sample titles.
# The already-fitted step is applied with transform() only: wrapping it in a
# new Pipeline and calling fit_transform() would re-fit the very same step
# object that `clf` holds, this time on just these few strings.
tidy_step = clf.steps[0][1]
data = tidy_step.transform(newx)
df = pd.DataFrame(data)

df