In [8]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
df = pd.read_csv("/content/drive/MyDrive/github-issue-bot/saif_processed.csv")

In [11]:
X = df["issue"].apply(lambda x: np.str_(x))
y = df["label"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# Bagging with KNN as base estimator

In [13]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                     ('clf', BaggingClassifier(KNeighborsClassifier())),
                     ])

In [14]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto',
                                                                       leaf_size=30,
                                                                       metric='minkowski',
                                     

In [15]:
predicted = text_clf.predict(X_test)

In [16]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.51      0.42      0.46       361
           1       0.57      0.56      0.56       320
           2       0.66      0.69      0.68       635
           3       0.57      0.66      0.61       961
           4       0.67      0.71      0.69       262
           5       0.47      0.31      0.37       390

    accuracy                           0.58      2929
   macro avg       0.57      0.56      0.56      2929
weighted avg       0.58      0.58      0.58      2929



# Bagging with Decision Tree Classifier

In [30]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                     ('clf', BaggingClassifier()),
                     ])

In [31]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 BaggingClassifier(base_estimator=None, bootstrap=True,
                                   bootstrap_features=False, max_features=1

In [32]:
predicted = text_clf.predict(X_test)

In [33]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.54      0.58      0.56       361
           1       0.59      0.48      0.53       320
           2       0.71      0.72      0.71       635
           3       0.61      0.72      0.66       961
           4       0.71      0.55      0.62       262
           5       0.55      0.41      0.47       390

    accuracy                           0.62      2929
   macro avg       0.62      0.58      0.59      2929
weighted avg       0.62      0.62      0.62      2929



In [34]:
with open("/content/drive/MyDrive/github-issue-bot/models/bagging.sav", 'wb') as f:
  pickle.dump(text_clf, f)