In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, RepeatedKFold
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline

RANDOM_SEED = 42

In [2]:
df = pd.read_csv('./data/sample1.csv.gz')

In [3]:
df.head()

Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,issue_title,issue_body
0,https://api.github.com/repos/deepnight/ldtk/is...,bug,2021-03-10T01:39:16Z,CONTRIBUTOR,https://api.github.com/repos/deepnight/ldtk,__tileSrcRect is null in Entities.ldtk Sample ...,"In the Entities example, we there are some `__..."
1,https://api.github.com/repos/sef-global/sef-si...,bug,2021-01-30T13:51:30Z,COLLABORATOR,https://api.github.com/repos/sef-global/sef-site,Update the blog link in the SEF Site,**Describe the bug**\r\nUpdate the blog link i...
2,https://api.github.com/repos/cherry-script/che...,bug,2021-04-07T13:38:24Z,CONTRIBUTOR,https://api.github.com/repos/cherry-script/che...,🐛 Parser cannot properly distinguish between p...,Consider these two expressions:\r\n```\r\nf (g...
3,https://api.github.com/repos/IgniteUI/igniteui...,bug,2020-10-16T15:41:48Z,CONTRIBUTOR,https://api.github.com/repos/IgniteUI/igniteui...,"""Row added"" snackbar is not visible if the gri...",## Description \r\nWhen grid has no height an...
4,https://api.github.com/repos/OpenSIPS/opensips...,bug,2020-09-21T21:22:05Z,NONE,https://api.github.com/repos/OpenSIPS/opensips,[CRASH] _tcp_write_on_socket crashes when flu...,<!--\r\nThank you for reporting a crash in Ope...


In [4]:
print(len(df), 'instances')

72289 instances


In [5]:
df['full_text'] = df['issue_title'] + "_" + df['issue_body']

In [6]:
tmp = df.dropna().groupby('issue_label').apply(lambda x: x.sample(frac=.20)).copy().drop(columns=['issue_label'], axis=1).reset_index()

In [7]:
tmp.head()

Unnamed: 0,issue_label,level_1,issue_url,issue_created_at,issue_author_association,repository_url,issue_title,issue_body,full_text
0,bug,23149,https://api.github.com/repos/libsdl-org/SDL/is...,2021-02-10T23:25:45Z,COLLABORATOR,https://api.github.com/repos/libsdl-org/SDL,"On OSX, an SDL app prevents system shutdown.",\n# This bug report was migrated from our old ...,"On OSX, an SDL app prevents system shutdown._\..."
1,bug,16717,https://api.github.com/repos/siyuan-note/siyua...,2020-12-14T15:55:25Z,NONE,https://api.github.com/repos/siyuan-note/siyuan,Heading as list item problem,### 描述问题 Describe the problem\r\n\r\n使用快捷键 `Ct...,Heading as list item problem_### 描述问题 Describe...
2,bug,10157,https://api.github.com/repos/elastic/beats/iss...,2019-02-02T22:34:08Z,CONTRIBUTOR,https://api.github.com/repos/elastic/beats,build script docker run --network=host does no...,The build script uses the following:\r\n\r\nlo...,build script docker run --network=host does no...
3,bug,27561,https://api.github.com/repos/CollaboraOnline/o...,2021-02-08T11:23:22Z,NONE,https://api.github.com/repos/CollaboraOnline/o...,"iOS App: Writer, Calc & Impress: comment funct...",**Describe the bug**\r\nIf you insert a commen...,"iOS App: Writer, Calc & Impress: comment funct..."
4,bug,7458,https://api.github.com/repos/ucfopen/canvasapi...,2021-01-03T18:39:05Z,NONE,https://api.github.com/repos/ucfopen/canvasapi,Favorite object removal does not handle parame...,# Describe the bug\r\n\r\n_Favorite_ objects c...,Favorite object removal does not handle parame...


In [8]:
X = tmp['full_text'].values
y = tmp['issue_label'].values

In [9]:
cv = StratifiedKFold(shuffle=True, random_state=RANDOM_SEED)

pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", RandomForestClassifier()),
    ]
)

In [10]:
model = pipeline.fit(X,y)

In [11]:
import joblib
joblib.dump(model,'model1.sav')

['model1.sav']

In [None]:
mccs = cross_val_score(pipeline, X, y, scoring='matthews_corrcoef', cv=cv)

print('Average mcc:', np.mean(mccs))