In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../dataset/saif_processed.csv")

In [4]:
X = df["issue"].apply(lambda x: np.str_(x))
y = df["label"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

# Bagging with KNN as base estimator

In [13]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BaggingClassifier(KNeighborsClassifier())),
                     ])

In [14]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 BaggingClassifier(base_estimator=KNeighborsClassifier()))])

In [15]:
predicted = text_clf.predict(X_test)

In [16]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.52      0.32      0.40       369
           1       0.60      0.56      0.58       322
           2       0.59      0.70      0.64       629
           3       0.55      0.67      0.61       963
           4       0.65      0.65      0.65       270
           5       0.51      0.27      0.36       376

    accuracy                           0.57      2929
   macro avg       0.57      0.53      0.54      2929
weighted avg       0.56      0.57      0.56      2929



# Bagging with Decision Tree Classifier

In [17]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', BaggingClassifier()),
                     ])

In [18]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', BaggingClassifier())])

In [19]:
predicted = text_clf.predict(X_test)

In [20]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.51      0.58      0.54       369
           1       0.65      0.45      0.53       322
           2       0.74      0.73      0.74       629
           3       0.61      0.76      0.68       963
           4       0.71      0.46      0.56       270
           5       0.53      0.41      0.46       376

    accuracy                           0.62      2929
   macro avg       0.63      0.56      0.58      2929
weighted avg       0.63      0.62      0.62      2929



# Scikit-learn Boosting

In [21]:
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', GradientBoostingClassifier(n_estimators=100)),
                     ])

In [22]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', GradientBoostingClassifier())])

In [23]:
predicted = text_clf.predict(X_test)

In [24]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.67      0.48      0.56       369
           1       0.78      0.43      0.55       322
           2       0.81      0.69      0.74       629
           3       0.55      0.87      0.68       963
           4       0.87      0.59      0.70       270
           5       0.68      0.45      0.55       376

    accuracy                           0.65      2929
   macro avg       0.73      0.58      0.63      2929
weighted avg       0.69      0.65      0.65      2929



# XGBoost classifier

In [28]:
import xgboost

In [25]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')

In [26]:
tfidf_vect.fit(X_train)

TfidfVectorizer(token_pattern='\\w{1,}')

In [27]:
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [29]:
clf = xgboost.XGBClassifier()

In [32]:
clf.fit(X_train_tfidf.tocsc(), y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [33]:
predictions = clf.predict(X_test_tfidf.tocsc())

In [34]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.67      0.48      0.56       369
           1       0.78      0.43      0.55       322
           2       0.81      0.69      0.74       629
           3       0.55      0.87      0.68       963
           4       0.87      0.59      0.70       270
           5       0.68      0.45      0.55       376

    accuracy                           0.65      2929
   macro avg       0.73      0.58      0.63      2929
weighted avg       0.69      0.65      0.65      2929

