# Rocchio classifier

In [18]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle

In [2]:
df = pd.read_csv("../dataset/saif_processed.csv")

In [3]:
df.head()

Unnamed: 0,issue,label,label_name
0,append new column neccesary bot append new col...,0,bug
1,student currentsemester would show sections st...,0,bug
2,duplicate articles toc causes weird behavior t...,0,bug
3,fix typo collection finder py summary describe...,0,bug
4,zimagi dbshell find psql moreover looks even p...,0,bug


In [4]:
X = df["issue"].apply(lambda x: np.str_(x))
y = df["label"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [6]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', NearestCentroid()),
                     ])

In [7]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', NearestCentroid())])

In [8]:
predicted = text_clf.predict(X_test)

In [9]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.51      0.58      0.54       441
           1       0.59      0.57      0.58       458
           2       0.79      0.62      0.69       873
           3       0.63      0.64      0.63      1289
           4       0.81      0.50      0.62       353
           5       0.42      0.64      0.50       491

    accuracy                           0.61      3905
   macro avg       0.62      0.59      0.60      3905
weighted avg       0.64      0.61      0.61      3905



# Applying Grid Search to find best model

In [11]:
parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_features': [5000, 10000, None]
}

In [12]:
grid = GridSearchCV(text_clf, parameters, verbose=2)

In [13]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=   0.9s
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=   0.7s
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=   0.7s
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=   0.7s
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time=   0.7s
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=   2.6s
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=   2.5s
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=   2.5s
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=   2.5s
[CV] END tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=   2.5s
[CV] END tfidf__max_features=10000, tfidf__ngram_range=(1, 1); total time=   0.7s
[CV] END tfidf__max_features=10000, tfidf__ngram

GridSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', NearestCentroid())]),
             param_grid={'tfidf__max_features': [5000, 10000, None],
                         'tfidf__ngram_range': [(1, 1), (1, 2)]},
             verbose=2)

In [14]:
predicted = grid.predict(X_test)

In [15]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.53      0.56      0.55       441
           1       0.59      0.61      0.60       458
           2       0.80      0.66      0.72       873
           3       0.66      0.65      0.65      1289
           4       0.83      0.51      0.63       353
           5       0.42      0.64      0.51       491

    accuracy                           0.62      3905
   macro avg       0.64      0.61      0.61      3905
weighted avg       0.65      0.62      0.63      3905



In [16]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tfidf__max_features,param_tfidf__ngram_range,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.60901,0.066365,0.126326,0.010793,5000.0,"(1, 1)","{'tfidf__max_features': 5000, 'tfidf__ngram_ra...",0.605634,0.597951,0.613956,0.588988,0.599872,0.60128,0.008292,5
1,2.268004,0.033509,0.225722,0.015769,5000.0,"(1, 2)","{'tfidf__max_features': 5000, 'tfidf__ngram_ra...",0.603713,0.599232,0.612676,0.589949,0.610435,0.603201,0.00817,3
2,0.559847,0.01107,0.127364,0.00917,10000.0,"(1, 1)","{'tfidf__max_features': 10000, 'tfidf__ngram_r...",0.606274,0.596351,0.617798,0.587388,0.606914,0.602945,0.010324,4
3,2.301815,0.069832,0.236847,0.017014,10000.0,"(1, 2)","{'tfidf__max_features': 10000, 'tfidf__ngram_r...",0.612996,0.608515,0.62516,0.59347,0.612996,0.610627,0.01021,2
4,0.579993,0.019191,0.131414,0.012673,,"(1, 1)","{'tfidf__max_features': None, 'tfidf__ngram_ra...",0.603073,0.59443,0.613956,0.586748,0.606274,0.600896,0.009447,6
5,2.296108,0.082377,0.28238,0.008862,,"(1, 2)","{'tfidf__max_features': None, 'tfidf__ngram_ra...",0.617157,0.613316,0.630922,0.605314,0.616517,0.616645,0.00829,1


In [17]:
grid.best_params_

{'tfidf__max_features': None, 'tfidf__ngram_range': (1, 2)}

In [19]:
pickle.dump(grid.best_estimator_, open("../models/rocchio.sav", 'wb'))