In [15]:
%matplotlib inline

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, RepeatedKFold
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline

RANDOM_SEED = 42

# Load the labelled issues and build one text field per issue.
# full_text is NaN whenever the title or the body is missing (str + NaN -> NaN).
# NOTE(review): "_" counts as a word character for CountVectorizer's default
# token pattern, so the last title word and the first body word end up fused
# into one token. A " " separator is probably what is wanted -- confirm before
# changing, since the displayed outputs were produced with "_".
df = pd.read_csv('sample1.csv.gz')
df['full_text'] = df['issue_title'] + "_" + df['issue_body']

# Stratified 20% sample: the same fraction is drawn from every label.
# - Only rows missing the text or the label are dropped; NaNs in unrelated
#   metadata columns no longer discard usable rows.
# - GroupBy.sample replaces groupby().apply(lambda ...) + drop + reset_index,
#   which relied on the grouping column being passed into apply.
# - The draw is seeded so the sample is the same on every run.
tmp = (
    df.dropna(subset=['full_text', 'issue_label'])
    .groupby('issue_label')
    .sample(frac=.20, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
X = tmp['full_text'].values
y = tmp['issue_label'].values
cv = StratifiedKFold(shuffle=True, random_state=RANDOM_SEED)

# Bag-of-words -> TF-IDF -> random forest.
# The forest is seeded for reproducible scores and uses all cores, since this
# cell is slow enough to have been interrupted.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1)),
    ]
)

# Evaluate first (cross_val_score fits clones, leaving `pipeline` untouched),
# so the score is reported before the final full-data fit.
mccs = cross_val_score(pipeline, X, y, scoring='matthews_corrcoef', cv=cv)

print('Average mcc:', np.mean(mccs))

# Fit on the whole sample and persist the fitted pipeline.
model = pipeline.fit(X, y)
joblib.dump(model, 'model1.sav');  # trailing ';' hides the returned file list

KeyboardInterrupt: 

In [16]:
# Show the frame; the rendered output below is truncated to its first and last rows.
df

Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,issue_title,issue_body,full_text
0,https://api.github.com/repos/deepnight/ldtk/is...,bug,2021-03-10T01:39:16Z,CONTRIBUTOR,https://api.github.com/repos/deepnight/ldtk,__tileSrcRect is null in Entities.ldtk Sample ...,"In the Entities example, we there are some `__...",__tileSrcRect is null in Entities.ldtk Sample ...
1,https://api.github.com/repos/sef-global/sef-si...,bug,2021-01-30T13:51:30Z,COLLABORATOR,https://api.github.com/repos/sef-global/sef-site,Update the blog link in the SEF Site,**Describe the bug**\r\nUpdate the blog link i...,Update the blog link in the SEF Site_**Describ...
2,https://api.github.com/repos/cherry-script/che...,bug,2021-04-07T13:38:24Z,CONTRIBUTOR,https://api.github.com/repos/cherry-script/che...,🐛 Parser cannot properly distinguish between p...,Consider these two expressions:\r\n```\r\nf (g...,🐛 Parser cannot properly distinguish between p...
3,https://api.github.com/repos/IgniteUI/igniteui...,bug,2020-10-16T15:41:48Z,CONTRIBUTOR,https://api.github.com/repos/IgniteUI/igniteui...,"""Row added"" snackbar is not visible if the gri...",## Description \r\nWhen grid has no height an...,"""Row added"" snackbar is not visible if the gri..."
4,https://api.github.com/repos/OpenSIPS/opensips...,bug,2020-09-21T21:22:05Z,NONE,https://api.github.com/repos/OpenSIPS/opensips,[CRASH] _tcp_write_on_socket crashes when flu...,<!--\r\nThank you for reporting a crash in Ope...,[CRASH] _tcp_write_on_socket crashes when flu...
...,...,...,...,...,...,...,...,...
72284,https://api.github.com/repos/rancher/k3d/issue...,question,2021-04-03T01:21:15Z,NONE,https://api.github.com/repos/rancher/k3d,[HELP] Volume Mount hostpath: Unable to mount ...,hi Team I am facing the below issue. Any solut...,[HELP] Volume Mount hostpath: Unable to mount ...
72285,https://api.github.com/repos/node-formidable/f...,question,2020-02-17T12:38:57Z,NONE,https://api.github.com/repos/node-formidable/f...,is form.progress valid for upload progress?,I thought form.progress would get fired every ...,is form.progress valid for upload progress?_I ...
72286,https://api.github.com/repos/ebaauw/homebridge...,question,2021-03-16T00:45:36Z,NONE,https://api.github.com/repos/ebaauw/homebridge...,"RPi used for door contact sensors, warning in log",Thank you for the most excellent plug-in!! \r\...,"RPi used for door contact sensors, warning in ..."
72287,https://api.github.com/repos/envoyproxy/envoy/...,question,2020-08-18T23:28:15Z,NONE,https://api.github.com/repos/envoyproxy/envoy,Tail latency of envoy proxy is bad if every re...,We have envoy running as the front door servic...,Tail latency of envoy proxy is bad if every re...


In [17]:
# Number of issues per label, most frequent first.
df['issue_label'].value_counts()

bug            36110
enhancement    29937
question        6242
Name: issue_label, dtype: int64

In [20]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,issue_title,issue_body,full_text
0,https://api.github.com/repos/deepnight/ldtk/is...,bug,2021-03-10T01:39:16Z,CONTRIBUTOR,https://api.github.com/repos/deepnight/ldtk,__tileSrcRect is null in Entities.ldtk Sample ...,"In the Entities example, we there are some `__...",__tileSrcRect is null in Entities.ldtk Sample ...
1,https://api.github.com/repos/sef-global/sef-si...,bug,2021-01-30T13:51:30Z,COLLABORATOR,https://api.github.com/repos/sef-global/sef-site,Update the blog link in the SEF Site,**Describe the bug**\r\nUpdate the blog link i...,Update the blog link in the SEF Site_**Describ...
2,https://api.github.com/repos/cherry-script/che...,bug,2021-04-07T13:38:24Z,CONTRIBUTOR,https://api.github.com/repos/cherry-script/che...,🐛 Parser cannot properly distinguish between p...,Consider these two expressions:\r\n```\r\nf (g...,🐛 Parser cannot properly distinguish between p...
3,https://api.github.com/repos/IgniteUI/igniteui...,bug,2020-10-16T15:41:48Z,CONTRIBUTOR,https://api.github.com/repos/IgniteUI/igniteui...,"""Row added"" snackbar is not visible if the gri...",## Description \r\nWhen grid has no height an...,"""Row added"" snackbar is not visible if the gri..."
4,https://api.github.com/repos/OpenSIPS/opensips...,bug,2020-09-21T21:22:05Z,NONE,https://api.github.com/repos/OpenSIPS/opensips,[CRASH] _tcp_write_on_socket crashes when flu...,<!--\r\nThank you for reporting a crash in Ope...,[CRASH] _tcp_write_on_socket crashes when flu...
