# Модель fasttext

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

import pandas as pd
import re
import fasttext
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set(style='whitegrid')

## Загрузка набора данных

In [3]:
# Load the preprocessed issue dataset: an integer-encoded `labels` column and a `text` column
data = pd.read_csv("data/preprocessed/issue_data_encoded_small_extra_prep.csv")
data.head()

Unnamed: 0,labels,text
0,3,encode issue great project test whether could ...
1,2,update feedback connor comment connor need upd...
2,2,productinventory description describe document...
3,1,docs iconstructorselector optimization problem...
4,3,new user options table support errors append r...


In [4]:
# Dataset size as (rows, columns)
data.shape

(80000, 2)

In [5]:
# Drop rows with missing values (the class counts shown afterwards total 79999 of the 80000 loaded rows)
data = data.dropna()

In [6]:
# Number of examples in each class
data.labels.value_counts()

2    20000
1    20000
0    20000
3    19999
Name: labels, dtype: int64

## Предобработка датасета для модели fasttext

In [7]:
# Convert the numeric class ids into the "__label__<name>" strings used as labels in the fastText files.
# NOTE: ids missing from the mapping become NaN, so re-running this cell on already-mapped labels wipes the column.
fasttext_label_by_id = {
    0: "__label__bug",
    1: "__label__enhancement",
    2: "__label__documentation",
    3: "__label__question",
}
data['labels'] = data['labels'].map(fasttext_label_by_id)

In [8]:
# Spot-check ten random rows after the label mapping
data.sample(10)

Unnamed: 0,labels,text
6269,__label__question,cant send button chat send message button skyp...
56080,__label__enhancement,hi make pyfastg available conda hi love parser...
19733,__label__enhancement,add dark theme dark need light default
58401,__label__documentation,dropconnectrate efficientnet document urls iss...
12708,__label__enhancement,todo rework command deployment currently comma...
53035,__label__enhancement,format namespaces problem introduce fmt additi...
27447,__label__enhancement,panelapp panel high confidence green lower con...
33645,__label__bug,server go sain school dry run dec 22 2021 need...
61466,__label__question,selection timeline understand correctly differ...
13199,__label__enhancement,configure service core microservice startup co...


In [9]:
# Strip punctuation and normalize whitespace
def preproccess(text):
    """Normalize one document so that it fits on a single fastText input line.

    Every character that is not a word character, whitespace or an apostrophe
    is replaced by a space. Each run of whitespace (spaces, tabs, line breaks)
    is then collapsed into one space, the ends are trimmed and the result is
    lower-cased.

    Args:
        text: Raw document text (str).

    Returns:
        The cleaned, lower-cased, single-line string.
    """
    text = re.sub(r"[^\w\s']", ' ', text)
    # Collapse *all* whitespace, not only literal spaces: a tab or line break
    # left in the text would spill one example over several lines of the
    # fastText input file and make the CSV writer quote the row.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

In [10]:
# Clean every document in the text column
data["text"] = data["text"].apply(preproccess)

In [11]:
# Build the line written to the fastText files for each row: "<label> <text>"
data = data.assign(label_text=data['labels'] + " " + data['text'])

data.head()

Unnamed: 0,labels,text,label_text
0,__label__question,encode issue great project test whether could ...,__label__question encode issue great project t...
1,__label__documentation,update feedback connor comment connor need upd...,__label__documentation update feedback connor ...
2,__label__documentation,productinventory description describe document...,__label__documentation productinventory descri...
3,__label__enhancement,docs iconstructorselector optimization problem...,__label__enhancement docs iconstructorselector...
4,__label__question,new user options table support errors append r...,__label__question new user options table suppo...


## Разделение на обучающую и тестовую выборки

In [12]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so the split is repeatable
train, test = train_test_split(data, test_size=0.2, random_state=7)

In [13]:
# Training set size
train.shape

(63999, 3)

In [14]:
# Test set size
test.shape

(16000, 3)

In [15]:
# Save both splits for fastText: only the combined label_text column, without header or index
for split_frame, split_path in (
    (train, "fasttext/issues.train"),
    (test, "fasttext/issues.test"),
):
    split_frame.to_csv(split_path, columns=["label_text"], index=False, header=False)

## Обучение модели

In [16]:
%%time
# Train a supervised fastText classifier on the saved training file (no hyperparameters are overridden)
model = fasttext.train_supervised(input="fasttext/issues.train")

CPU times: total: 9.02 s
Wall time: 2.67 s


In [17]:
%%time
# Evaluate on the saved test file; the tuple shown below is presumably
# (number of examples, precision@1, recall@1) — confirm against the fastText docs
model.test("fasttext/issues.test")

CPU times: total: 250 ms
Wall time: 247 ms


(16000, 0.7074375, 0.7074375)

In [18]:
# Top-1 predicted label for every test document. Line breaks are replaced by spaces before
# predicting (presumably because fastText's predict handles one line at a time — confirm).
pred_labels = [
    model.predict(document.replace("\n", " "))[0][0]
    for document in test["text"]
]

In [19]:
# Class distribution in the test split
test["labels"].value_counts()

__label__bug              4045
__label__question         4004
__label__documentation    4001
__label__enhancement      3950
Name: labels, dtype: int64

In [20]:
# Per-class precision / recall / F1 on the test split, with rows in a fixed label order
report_labels = [
    "__label__bug",
    "__label__enhancement",
    "__label__documentation",
    "__label__question",
]
print(classification_report(test["labels"], pred_labels, labels=report_labels))

                        precision    recall  f1-score   support

          __label__bug       0.70      0.74      0.72      4045
  __label__enhancement       0.68      0.71      0.70      3950
__label__documentation       0.80      0.73      0.76      4001
     __label__question       0.66      0.66      0.66      4004

              accuracy                           0.71     16000
             macro avg       0.71      0.71      0.71     16000
          weighted avg       0.71      0.71      0.71     16000



In [21]:
# Micro-averaged F1 over all classes
f1_score(test["labels"], pred_labels, average="micro")

0.7074375

In [22]:
# Per-class F1 scores, reported in the order of the `labels` argument
f1_score(test["labels"], pred_labels, average=None,
         labels=["__label__bug", "__label__enhancement", "__label__documentation", "__label__question"])

array([0.71848638, 0.69504605, 0.76153143, 0.65676856])

In [23]:
# Overall accuracy on the test split
accuracy_score(test["labels"], pred_labels)

0.7074375

In [24]:
# Collect the evaluation metrics into a single-row DataFrame
class_labels = ["__label__bug", "__label__enhancement", "__label__documentation", "__label__question"]
f1_for_classes = f1_score(test["labels"], pred_labels, average=None, labels=class_labels)

metrics_row = {
    "model": "fasttext",
    "accuracy": accuracy_score(test["labels"], pred_labels),
    # NOTE(review): micro-averaged F1 comes out identical to accuracy in the outputs of this notebook
    "f1_avg": f1_score(test["labels"], pred_labels, average="micro"),
}
# Per-class scores follow the order of class_labels: bug, enhancement, documentation, question
metrics_row.update(zip(["f1_bug", "f1_enh", "f1_doc", "f1_que"], f1_for_classes))
metrics_row["extra_processed"] = True

metrics_data = pd.DataFrame(data=metrics_row, index=[0])

In [25]:
# Display the metrics table
metrics_data

Unnamed: 0,model,accuracy,f1_avg,f1_bug,f1_enh,f1_doc,f1_que,extra_processed
0,fasttext,0.707438,0.707438,0.718486,0.695046,0.761531,0.656769,True


In [26]:
# Save the metrics row as a comma-separated file without header or index column
metrics_data.to_csv("data/metrics/fasttext_extra.csv", sep=",", index=False, header=False)