# РК2 по курсу "Технологии машинного обучения"
## Выполнил студент РТ5-61Б Пименов Георгий

В качестве датасета по варианту используется набор данных о персонажах комиксов Marvel (по данным Marvel Wikia).
Ссылка на датасет: https://www.kaggle.com/fivethirtyeight/fivethirtyeight-comic-characters-dataset

In [38]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# Load the Marvel Wikia character table into a DataFrame.
# NOTE(review): the path is an absolute Windows location, so the notebook only
# runs on a machine where the CSV sits in the root of drive C.
marvel = pd.read_csv('C:\\marvel-wikia-data.csv')

Взглянем на данные.

In [2]:
# Show the loaded table (a notebook renders the value of a cell's last expression).
marvel

Unnamed: 0,page_id,name,urlslug,ID,ALIGN,EYE,HAIR,SEX,GSM,ALIVE,APPEARANCES,FIRST APPEARANCE,Year
0,1678,Spider-Man (Peter Parker),\/Spider-Man_(Peter_Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,,Living Characters,4043.0,Aug-62,1962.0
1,7139,Captain America (Steven Rogers),\/Captain_America_(Steven_Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,,Living Characters,3360.0,Mar-41,1941.0
2,64786,"Wolverine (James \""Logan\"" Howlett)",\/Wolverine_(James_%22Logan%22_Howlett),Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,3061.0,Oct-74,1974.0
3,1868,"Iron Man (Anthony \""Tony\"" Stark)",\/Iron_Man_(Anthony_%22Tony%22_Stark),Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,,Living Characters,2961.0,Mar-63,1963.0
4,2460,Thor (Thor Odinson),\/Thor_(Thor_Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,,Living Characters,2258.0,Nov-50,1950.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16371,657508,Ru'ach (Earth-616),\/Ru%27ach_(Earth-616),No Dual Identity,Bad Characters,Green Eyes,No Hair,Male Characters,,Living Characters,,,
16372,665474,Thane (Thanos' son) (Earth-616),\/Thane_(Thanos%27_son)_(Earth-616),No Dual Identity,Good Characters,Blue Eyes,Bald,Male Characters,,Living Characters,,,
16373,695217,Tinkerer (Skrull) (Earth-616),\/Tinkerer_(Skrull)_(Earth-616),Secret Identity,Bad Characters,Black Eyes,Bald,Male Characters,,Living Characters,,,
16374,708811,TK421 (Spiderling) (Earth-616),\/TK421_(Spiderling)_(Earth-616),Secret Identity,Neutral Characters,,,Male Characters,,Living Characters,,,


## Предобработка данных

In [3]:
# Count the missing values in every column of the raw table.
marvel.isna().sum()

page_id                 0
name                    0
urlslug                 0
ID                   3770
ALIGN                2812
EYE                  9767
HAIR                 4264
SEX                   854
GSM                 16286
ALIVE                   3
APPEARANCES          1096
FIRST APPEARANCE      815
Year                  815
dtype: int64

In [4]:
# (number of rows, number of columns) of the raw table.
marvel.shape

(16376, 13)

Сразу можем сказать, что можно удалить колонки:
* page_id, name, urlslug - поскольку это уникальные значения
* EYE, GSM - поскольку там слишком много пропусков
* FIRST APPEARANCE - поскольку год первого появления уже содержится в колонке Year

In [5]:
# Discard the per-character identifier columns, the mostly empty EYE and GSM
# columns, and the textual FIRST APPEARANCE column.
columns_to_remove = ['page_id', 'name', 'urlslug', 'EYE', 'GSM', 'FIRST APPEARANCE']
marvel = marvel.drop(columns=columns_to_remove)

Будем предсказывать, является ли персонаж живым или умершим (признак ALIVE).

In [6]:
# List the distinct values of the target column.
marvel['ALIVE'].unique()

array(['Living Characters', 'Deceased Characters', nan], dtype=object)

Как видно из набора уникальных значений, задача будет сводиться к бинарной классификации.

Удалим те строчки, в которых пропущено значение ALIVE.

In [7]:
# Keep only the rows whose target value (ALIVE) is present.
marvel = marvel.dropna(subset=['ALIVE'])

### Заполнение пропущенных значений

In [8]:
# Missing values per remaining column, after the rows without a target were removed.
marvel.isna().sum()

ID             3767
ALIGN          2809
HAIR           4261
SEX             851
ALIVE             0
APPEARANCES    1093
Year            812
dtype: int64

In [9]:
# Column dtypes: they decide which imputation strategy each column receives.
marvel.dtypes

ID              object
ALIGN           object
HAIR            object
SEX             object
ALIVE           object
APPEARANCES    float64
Year           float64
dtype: object

In [10]:
# String form of the ID column's dtype.
str(marvel['ID'].dtype)

'object'

In [11]:
# Fill the gaps in the feature columns: the most frequent value for categorical
# columns and the column mean for numeric columns.
#
# Columns are grouped by kind and each imputer is fitted once on its whole
# group, so after this cell `mode_imputer` and `mean_imputer` hold the fill
# values of every column they handled (fitting inside a per-column loop left
# only the last column's statistics). Numeric columns are picked with
# `select_dtypes` instead of comparing the dtype's string form with 'object',
# which would send string columns stored under any other dtype to the mean
# imputer. The target column ALIVE is left untouched: its missing rows were
# already dropped, and a target should never be imputed.
#
# NOTE(review): the fill values are computed on the full dataset, before the
# train/test split further down, so test-set statistics leak into training.
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

feature_cols = marvel.columns.drop('ALIVE')
numeric_cols = list(marvel[feature_cols].select_dtypes(include='number').columns)
categorical_cols = [col for col in feature_cols if col not in numeric_cols]

marvel[categorical_cols] = mode_imputer.fit_transform(marvel[categorical_cols])
marvel[numeric_cols] = mean_imputer.fit_transform(marvel[numeric_cols])

In [12]:
# Verify that imputation left no missing values in any column.
marvel.isna().sum()

ID             0
ALIGN          0
HAIR           0
SEX            0
ALIVE          0
APPEARANCES    0
Year           0
dtype: int64

### Разделение датасета

In [13]:
# Separate the target column (ALIVE) from the feature matrix.
y = marvel['ALIVE']
X = marvel.drop(columns='ALIVE')

### Кодирование категориальных признаков

In [14]:
# One-hot encode the categorical features (ID, ALIGN, HAIR, SEX); the numeric
# columns APPEARANCES and Year pass through unchanged.
X = pd.get_dummies(X)

In [15]:
# Preview the first rows of the encoded feature matrix.
X.head()

Unnamed: 0,APPEARANCES,Year,ID_Known to Authorities Identity,ID_No Dual Identity,ID_Public Identity,ID_Secret Identity,ALIGN_Bad Characters,ALIGN_Good Characters,ALIGN_Neutral Characters,HAIR_Auburn Hair,...,HAIR_Reddish Blond Hair,HAIR_Silver Hair,HAIR_Strawberry Blond Hair,HAIR_Variable Hair,HAIR_White Hair,HAIR_Yellow Hair,SEX_Agender Characters,SEX_Female Characters,SEX_Genderfluid Characters,SEX_Male Characters
0,4043.0,1962.0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3360.0,1941.0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,3061.0,1974.0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,2961.0,1963.0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2258.0,1950.0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Turn the two string labels into integers. LabelEncoder numbers the classes in
# sorted order, so 'Deceased Characters' becomes 0 and 'Living Characters'
# becomes 1.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

### Масштабирование

Применим масштабирование на основе Z-оценки.

In [17]:
# Standardise the two numeric features to zero mean and unit variance (z-score).
#
# Both columns are fitted in one call. StandardScaler works column by column,
# so the scaled values are the same as when each column is fitted separately,
# but `standard_scaler` now keeps the mean and scale of BOTH features.
# Refitting the same instance on 'Year' used to discard the APPEARANCES
# statistics, leaving the scaler unusable for transforming new data or
# inverting the scaling.
#
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split, so test-set statistics leak into training.
numeric_features = ['APPEARANCES', 'Year']
standard_scaler = StandardScaler()
X[numeric_features] = standard_scaler.fit_transform(X[numeric_features])

In [18]:
# Preview the last rows of the scaled feature matrix.
X.tail()

Unnamed: 0,APPEARANCES,Year,ID_Known to Authorities Identity,ID_No Dual Identity,ID_Public Identity,ID_Secret Identity,ALIGN_Bad Characters,ALIGN_Good Characters,ALIGN_Neutral Characters,HAIR_Auburn Hair,...,HAIR_Reddish Blond Hair,HAIR_Silver Hair,HAIR_Strawberry Blond Hair,HAIR_Variable Hair,HAIR_White Hair,HAIR_Yellow Hair,SEX_Agender Characters,SEX_Female Characters,SEX_Genderfluid Characters,SEX_Male Characters
16371,3.8161170000000005e-17,0.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16372,3.8161170000000005e-17,0.0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
16373,3.8161170000000005e-17,0.0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16374,3.8161170000000005e-17,0.0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
16375,3.8161170000000005e-17,0.0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


#### Тестовая и обучающая выборка

In [19]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, so the class ratio of the target
# may differ between the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

## Обучение моделей

### Решающие деревья

In [20]:
# Fit a depth-limited decision tree (Gini criterion) on the training part,
# predict the held-out part and report precision for class 1.
tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    random_state=100,
).fit(X_train, y_train)
tree_predicted = tree.predict(X_test)
precision_score(y_true=y_test, y_pred=tree_predicted)

0.7885511426669625

In [45]:
# Recall of the decision tree on the held-out part (class 1).
recall_score(y_true=y_test, y_pred=tree_predicted)

0.9335434725505648

In [25]:
# F1 score of the decision tree on the held-out part (class 1).
f1_score(y_true=y_test, y_pred=tree_predicted)

0.8549434688477267

In [26]:
# Share of held-out rows the decision tree classified correctly.
accuracy_score(y_true=y_test, y_pred=tree_predicted)

0.7544788273615635

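Precision, recall и F1 посчитаны для класса 1 («Living Characters») и выглядят достаточно высокими. Однако классы несбалансированы: из приведённых метрик следует, что живых персонажей в тестовой выборке около 77 %, поэтому accuracy ≈ 0.75 не превышает точность тривиальной модели, всегда предсказывающей «Living Characters», и говорить о высоком качестве обученной модели можно лишь с оговорками.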

### Градиентный бустинг 

In [39]:
# Fit a gradient-boosting ensemble on the training part and predict the
# held-out part.
gb = GradientBoostingClassifier(
    learning_rate=0.2,
    n_estimators=200,
    random_state=100,
).fit(X_train, y_train)
gb_predicted = gb.predict(X_test)

In [43]:
# Share of held-out rows the gradient-boosting model classified correctly.
accuracy_score(y_true=y_test, y_pred=gb_predicted)

0.7677117263843648

In [46]:
# Precision of the gradient-boosting model on the held-out part (class 1).
precision_score(y_true=y_test, y_pred=gb_predicted)

0.7874083656748598

In [47]:
# Recall of the gradient-boosting model on the held-out part (class 1).
recall_score(y_true=y_test, y_pred=gb_predicted)

0.9592855266614132

## Итоги и сравнение показателей метрик

Итак, можно сказать, что одиночное решающее дерево и ансамблевая модель (градиентный бустинг) показали примерно одинаковые результаты: accuracy 0.754 против 0.768, precision 0.789 против 0.787, recall 0.934 против 0.959. (Правда, нужно учитывать, что использовалась довольно простая реализация градиентного бустинга из sklearn.)