In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle
import xgboost

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_csv("/content/drive/MyDrive/github-issue-bot/saif_processed.csv")

In [4]:
X = df["issue"].apply(lambda x: np.str_(x))
y = df["label"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# Scikit-learn Boosting

In [6]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                     ('clf', GradientBoostingClassifier()),
                     ])

In [7]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
          

In [8]:
predicted = text_clf.predict(X_test)

In [9]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.64      0.49      0.56       321
           1       0.87      0.42      0.57       363
           2       0.81      0.72      0.76       635
           3       0.57      0.88      0.69       980
           4       0.85      0.63      0.73       240
           5       0.68      0.45      0.54       390

    accuracy                           0.67      2929
   macro avg       0.74      0.60      0.64      2929
weighted avg       0.71      0.67      0.66      2929



In [20]:
with open("/content/drive/MyDrive/github-issue-bot/models/sklearn_boosting.sav", 'wb') as f:
    pickle.dump(text_clf, f)

# XGBoost classifier

In [6]:
tfidf_vect = TfidfVectorizer(ngram_range=(1, 2))

In [7]:
tfidf_vect.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
with open("/content/drive/MyDrive/github-issue-bot/models/tfidf.pk", 'wb') as f:
    pickle.dump(tfidf_vect, f)

In [9]:
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [19]:
clf = xgboost.XGBClassifier(n_jobs=-1, learning_rate=0.1, n_estimators=100)

In [20]:
clf.fit(X_train_tfidf.tocsc(), y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [21]:
predictions = clf.predict(X_test_tfidf.tocsc())

In [22]:
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.65      0.43      0.52       365
           1       0.86      0.39      0.54       319
           2       0.81      0.69      0.74       646
           3       0.53      0.89      0.67       959
           4       0.87      0.52      0.65       261
           5       0.63      0.40      0.49       379

    accuracy                           0.64      2929
   macro avg       0.72      0.55      0.60      2929
weighted avg       0.69      0.64      0.63      2929



In [23]:
with open("/content/drive/MyDrive/github-issue-bot/models/xgboost.sav", 'wb') as f:
    pickle.dump(clf, f)