# Модель fasttext

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

import pandas as pd
import re
import fasttext
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set(style='whitegrid')

## Загрузка набора данных

In [2]:
data = pd.read_csv("data/preprocessed/issue_data_encoded_small.csv")
data.head()

Unnamed: 0,labels,text
0,3,"Encoding issue. Great project, I am testing ou..."
1,2,Update with feedback from Connor. All the comm...
2,2,ProductInventory. Description Describe what d...
3,1,Docs for `IConstructorSelector` optimization. ...
4,3,New user options table does not support (error...


In [3]:
# Размер набора данных
data.shape

(80000, 2)

In [4]:
# Количество экземляров каждого класса
data.labels.value_counts()

3    20000
2    20000
1    20000
0    20000
Name: labels, dtype: int64

## Предобработка датасета для модели fasttext

In [5]:
# Приведение меток к специальному формату
data['labels'] = data['labels'].map({
    0: "__label__bug",
    1: "__label__enhancement",
    2: "__label__documentation",
    3: "__label__question"
})

In [6]:
data.sample(10)

Unnamed: 0,labels,text
75417,__label__question,google.auth.exceptions.RefreshError: . Hi Team...
73432,__label__documentation,[UI] Web-Landing Page_V3. V2 landing page mockup
41414,__label__bug,Properly handle 404s from the history API on s...
62761,__label__documentation,Update Getting Started documentation. User Fee...
75744,__label__question,"[Support,Feature] How to revert from pyramid t..."
64936,__label__documentation,UI main wireframe. Should we use something sim...
45427,__label__bug,Step back time/position not saved in state. Te...
41754,__label__question,[Question] [URGENT]Telegram not responding to ...
49319,__label__bug,Icomoon field doesn’t work when use it in repe...
57879,__label__bug,Kirby Cog missing. Kirby not working It seems ...


In [7]:
# Удаление знаков препинания
def preproccess(text):
    text = re.sub(r'[^\w\s\']', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip().lower()

In [8]:
data.text = data.text.apply(preproccess)

In [9]:
# Объединение колонок
data['label_text'] = data['labels'] + " " + data['text']

data.head()

Unnamed: 0,labels,text,label_text
0,__label__question,encoding issue great project i am testing out ...,__label__question encoding issue great project...
1,__label__documentation,update with feedback from connor all the comme...,__label__documentation update with feedback fr...
2,__label__documentation,productinventory description describe what doc...,__label__documentation productinventory descri...
3,__label__enhancement,docs for iconstructorselector optimization pro...,__label__enhancement docs for iconstructorsele...
4,__label__question,new user options table does not support errors...,__label__question new user options table does ...


## Разделение на обучающую и тестовые выборки

In [10]:
train, test = train_test_split(data, test_size=0.2, random_state=7)

In [11]:
# Размер обучающей выборки
train.shape

(64000, 3)

In [12]:
# Размер тестовой выборки
test.shape

(16000, 3)

In [13]:
# Сохранение выборок
train.to_csv("fasttext/issues.train", columns=["label_text"], index=False, header=False)
test.to_csv("fasttext/issues.test", columns=["label_text"], index=False, header=False)

## Обучение модели

In [14]:
%%time
# Обучение модели
model = fasttext.train_supervised(input="fasttext/issues.train")

CPU times: total: 13.6 s
Wall time: 3.65 s


In [15]:
%%time
model.test("fasttext/issues.test")

CPU times: total: 406 ms
Wall time: 434 ms


(15996, 0.7327456864216054, 0.7327456864216054)

In [16]:
pred_labels = []

for text in test["text"]:
    text = text.replace("\n", " ")
    pred_labels.append(model.predict(text)[0][0])

In [17]:
test["labels"].value_counts()

__label__documentation    4018
__label__question         4007
__label__bug              3989
__label__enhancement      3986
Name: labels, dtype: int64

In [18]:
print(classification_report(test["labels"], pred_labels,labels=["__label__bug", "__label__enhancement", "__label__documentation", "__label__question"]))

                        precision    recall  f1-score   support

          __label__bug       0.71      0.75      0.73      3989
  __label__enhancement       0.71      0.75      0.73      3986
__label__documentation       0.81      0.73      0.77      4018
     __label__question       0.70      0.70      0.70      4007

              accuracy                           0.73     16000
             macro avg       0.73      0.73      0.73     16000
          weighted avg       0.74      0.73      0.73     16000



In [19]:
f1_score(test["labels"], pred_labels, average="micro")

0.7326874999999999

In [20]:
f1_score(test["labels"], pred_labels, average=None,
         labels=["__label__bug", "__label__enhancement", "__label__documentation", "__label__question"])

array([0.73152227, 0.72778389, 0.77114103, 0.70210105])

In [21]:
accuracy_score(test["labels"], pred_labels)

0.7326875

In [22]:
# Создание датафрейма с метриками
f1_for_classes = f1_score(test["labels"], pred_labels, average=None,
                          labels=["__label__bug", "__label__enhancement", "__label__documentation", "__label__question"])

metrics_data = pd.DataFrame(data={"model" :"fasttext",
                                  "accuracy": accuracy_score(test["labels"], pred_labels),
                                  "f1_avg" : f1_score(test["labels"], pred_labels, average="micro"),
                                  "f1_bug" : f1_for_classes[0],
                                  "f1_enh" : f1_for_classes[1],
                                  "f1_doc" : f1_for_classes[2],
                                  "f1_que" : f1_for_classes[3],
                                  "extra_processed": False},
                            index=[0]
                            )

In [23]:
# Сохранение датасета
metrics_data.to_csv("data/metrics/fasttext.csv", sep=",", index=False, header=False)