In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, RepeatedKFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import make_scorer,matthews_corrcoef
from sklearn.model_selection import cross_val_score


from sklearn.pipeline import Pipeline

RANDOM_SEED = 42

In [2]:
df = pd.read_csv('sample1.csv')

In [3]:
df.head()

Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,issue_title,issue_body
0,https://api.github.com/repos/deepnight/ldtk/is...,bug,2021-03-10T01:39:16Z,CONTRIBUTOR,https://api.github.com/repos/deepnight/ldtk,__tileSrcRect is null in Entities.ldtk Sample ...,"In the Entities example, we there are some `__..."
1,https://api.github.com/repos/sef-global/sef-si...,bug,2021-01-30T13:51:30Z,COLLABORATOR,https://api.github.com/repos/sef-global/sef-site,Update the blog link in the SEF Site,**Describe the bug**\r\nUpdate the blog link i...
2,https://api.github.com/repos/cherry-script/che...,bug,2021-04-07T13:38:24Z,CONTRIBUTOR,https://api.github.com/repos/cherry-script/che...,🐛 Parser cannot properly distinguish between p...,Consider these two expressions:\r\n```\r\nf (g...
3,https://api.github.com/repos/IgniteUI/igniteui...,bug,2020-10-16T15:41:48Z,CONTRIBUTOR,https://api.github.com/repos/IgniteUI/igniteui...,"""Row added"" snackbar is not visible if the gri...",## Description \r\nWhen grid has no height an...
4,https://api.github.com/repos/OpenSIPS/opensips...,bug,2020-09-21T21:22:05Z,NONE,https://api.github.com/repos/OpenSIPS/opensips,[CRASH] _tcp_write_on_socket crashes when flu...,<!--\r\nThank you for reporting a crash in Ope...


In [4]:
print(len(df), 'instances')

72289 instances


In [5]:
df['full_text'] = df['issue_title'] + "_" + df['issue_body']

In [6]:
tmp = df.dropna().groupby('issue_label').apply(lambda x: x.sample(frac=.20)).copy().drop(columns=['issue_label'], axis=1).reset_index()

In [7]:
tmp.head()

Unnamed: 0,issue_label,level_1,issue_url,issue_created_at,issue_author_association,repository_url,issue_title,issue_body,full_text
0,bug,8659,https://api.github.com/repos/bonitasoft/bonita...,2021-01-22T16:07:29Z,CONTRIBUTOR,https://api.github.com/repos/bonitasoft/bonita...,404 error when displaying site in console log,"When we work on css switch to sass format, som...",404 error when displaying site in console log_...
1,bug,4519,https://api.github.com/repos/open-cogsci/rapun...,2021-02-22T09:26:08Z,COLLABORATOR,https://api.github.com/repos/open-cogsci/rapunzel,R-style code cells without comment are not par...,When pressing `F9` while the cursor is on the ...,R-style code cells without comment are not par...
2,bug,33701,https://api.github.com/repos/libsdl-org/SDL/is...,2021-02-11T01:14:22Z,COLLABORATOR,https://api.github.com/repos/libsdl-org/SDL,Support for VK_KHR_mir_surface should be remov...,\n# This bug report was migrated from our old ...,Support for VK_KHR_mir_surface should be remov...
3,bug,22058,https://api.github.com/repos/acrosman/Salesfor...,2021-05-11T15:03:17Z,OWNER,https://api.github.com/repos/acrosman/Salesfor...,Better feedback from database generation,**Summary**\r\nCreate a message to display aft...,Better feedback from database generation_**Sum...
4,bug,28653,https://api.github.com/repos/hashicorp/terrafo...,2020-06-04T19:41:26Z,NONE,https://api.github.com/repos/hashicorp/terrafo...,kubernetes.io annotations are not available as...,\r\n### Terraform Version and Provider Version...,kubernetes.io annotations are not available as...


In [8]:
X = tmp['full_text'].values
y = tmp['issue_label'].values

In [9]:
print(y)


['bug' 'bug' 'bug' ... 'question' 'question' 'question']


print (X)

In [10]:
mcc_scorer = make_scorer(matthews_corrcoef)

cv = StratifiedKFold(shuffle=True, random_state=RANDOM_SEED)

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", RandomForestClassifier()),
    ]
)

In [11]:
model = pipeline.fit(X,y)

In [12]:
import joblib
joblib.dump(model,'model1.sav')

['model1.sav']

In [13]:
mccs = cross_val_score(pipeline, X, y, scoring=mcc_scorer, cv=cv)

print('Average mcc:', np.mean(mccs))

Average mcc: 0.5450865715606013
