In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

Example 1

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

df = pd.read_csv('http://cssbook.net/d/mediause.csv')

df['uses-internet'] = df['internet']>0
df.dropna(inplace=True)
print("How many people used online news at all?")
print(df['uses-internet'].value_counts())

X_train, X_test, y_train, y_test = train_test_split(df[['age', 'education', 'gender']], df['uses-internet'], test_size=0.2, random_state=42)

print('We have {} training and {} test cases.'.format(len(X_train), len(X_test)))

How many people used online news at all?
True     1262
False     803
Name: uses-internet, dtype: int64
We have 1652 training and 413 test cases.


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


myclassifier = RandomForestClassifier()

grid = {
    'n_estimators' : [10, 50, 100, 200], 
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}
search = GridSearchCV(estimator=myclassifier,
                     param_grid=grid,
                     scoring='f1',
                     cv=5)
search.fit(X_train, y_train)
print('Using these hyperparameters {}, we get the best performance:'.format(search.best_params_))
print(classification_report(y_test, search.predict(X_test)))

Using these hyperparameters {'n_estimators': 100, 'bootstrap': True, 'criterion': 'entropy'}, we get the best performance:
              precision    recall  f1-score   support

       False       0.43      0.38      0.40       161
        True       0.63      0.68      0.66       252

   micro avg       0.56      0.56      0.56       413
   macro avg       0.53      0.53      0.53       413
weighted avg       0.55      0.56      0.56       413



In [10]:

scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

myclassifier = SVC(gamma='scale')

grid = {
    'C' : [100, 1e4], 
    'kernel': ['linear','rbf', 'poly'],
    'degree': [3,4]
}

search = GridSearchCV(estimator=myclassifier,
                      param_grid=grid,
                      scoring='f1',
                      cv=5,
                      n_jobs=-1,  # use all cpus
                      verbose=10)
search.fit(X_train_scaled, y_train)
print('Using these hyperparameters {}, we get the best performance:'.format(search.best_params_))
print(classification_report(y_test, search.predict(X_test_scaled)))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  """
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  5.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  5.9min finished


Using these hyperparameters {'kernel': 'poly', 'C': 100, 'degree': 3}, we get the best performance:
              precision    recall  f1-score   support

       False       0.58      0.04      0.08       161
        True       0.62      0.98      0.76       252

   micro avg       0.62      0.62      0.62       413
   macro avg       0.60      0.51      0.42       413
weighted avg       0.60      0.62      0.49       413



Example 2

In [12]:
import os
from glob import glob

# unpack the dataset from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz and store the folder 'aclImdb' in the same folder as this script

def read_data(dataset):
    texts = []
    labels = []
    for label in ['pos', 'neg']:
        for file in glob(os.path.join('aclImdb',dataset,label,'*.txt')):
            with open(file) as f:
                texts.append(f.read())
                labels.append(label)
    return texts, labels

X_train_fulltext, y_train = read_data('train')
X_test_fulltext, y_test= read_data('test')

In [13]:
pipe = Pipeline(steps = [('vectorizer', TfidfVectorizer()), ('classifier', LogisticRegression(solver='liblinear'))])
grid = {
    'vectorizer__ngram_range' : [(1,1), (1,2)],
    'vectorizer__max_df': [0.5, 1.0],
    'vectorizer__min_df': [0, 5],
    'classifier__C': [0.01, 1, 100]
}

search = GridSearchCV(estimator=pipe,
                      param_grid=grid,
                      scoring='accuracy',   # all classes are balanced, let's just score on accuracy
                      cv=5,
                      n_jobs=-1,  # use all cpus
                      verbose=10)
search.fit(X_train_fulltext, y_train)
print('Using these hyperparameters {}, we get the best performance:'.format(search.best_params_))
print(classification_report(y_test, search.predict(X_test_fulltext)))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 23.6min finished


Using these hyperparameters {'vectorizer__min_df': 0, 'classifier__C': 100, 'vectorizer__ngram_range': (1, 2), 'vectorizer__max_df': 0.5}, we get the best performance:
              precision    recall  f1-score   support

         neg       0.90      0.90      0.90     12500
         pos       0.90      0.90      0.90     12500

   micro avg       0.90      0.90      0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000



In [14]:
print(classification_report(y_test, search.predict(X_test_fulltext)))

              precision    recall  f1-score   support

         neg       0.90      0.90      0.90     12500
         pos       0.90      0.90      0.90     12500

   micro avg       0.90      0.90      0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000



# Visualization

In [15]:
from sklearn.pipeline import make_pipeline
import eli5

pipe = make_pipeline(TfidfVectorizer(min_df=5, max_df=.5), LogisticRegression())
pipe.fit(X_train_fulltext, y_train)
y_pred = pipe.predict(X_test_fulltext)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

         neg       0.88      0.89      0.88     12500
         pos       0.89      0.88      0.88     12500

   micro avg       0.88      0.88      0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [16]:
eli5.show_weights(pipe, top = 10)

Weight?,Feature
+7.173,great
+6.101,excellent
+5.055,best
+4.791,perfect
… 13663 more positive …,… 13663 more positive …
… 13574 more negative …,… 13574 more negative …
-5.337,poor
-5.733,boring
-6.315,waste
-6.349,awful


In [17]:
myvec = TfidfVectorizer(min_df=5, max_df=.5)
X_train = myvec.fit_transform(X_train_fulltext)
myclf = LogisticRegression()
myclf.fit(X_train, y_train)
eli5.show_prediction(myclf, X_test_fulltext[0], vec=myvec)



Contribution?,Feature
2.074,Highlighted in text (sum)
0.013,<BIAS>
