# Лабораторная 1. Классификация с помощью PyTorch

## Импорт библиотек и модулей

In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report

## Предобработка данных

### Загрузка датасета

In [2]:
df = pd.read_csv("../data/csgo_task.csv")
df

Unnamed: 0,time_left,ct_score,t_score,map,bomb_planted,ct_health,t_health,ct_armor,t_armor,ct_money,t_money,ct_helmets,t_helmets,ct_defuse_kits,ct_players_alive,t_players_alive
0,175.00,0.0,0.0,de_dust2,False,500.0,500.0,0.0,0.0,4000.0,4000.0,0.0,0.0,0.0,5.0,5.0
1,156.03,0.0,0.0,de_dust2,False,500.0,500.0,400.0,300.0,600.0,650.0,0.0,0.0,1.0,5.0,5.0
2,96.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,500.0,0.0,0.0,1.0,4.0,4.0
3,76.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,500.0,0.0,0.0,1.0,4.0,4.0
4,174.97,1.0,0.0,de_dust2,False,500.0,500.0,192.0,0.0,18350.0,10750.0,0.0,0.0,1.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122405,15.41,11.0,14.0,de_train,True,200.0,242.0,195.0,359.0,100.0,5950.0,2.0,4.0,1.0,2.0,4.0
122406,174.93,11.0,15.0,de_train,False,500.0,500.0,95.0,175.0,11500.0,23900.0,1.0,2.0,1.0,5.0,5.0
122407,114.93,11.0,15.0,de_train,False,500.0,500.0,495.0,475.0,1200.0,6700.0,3.0,5.0,1.0,5.0,5.0
122408,94.93,11.0,15.0,de_train,False,500.0,500.0,495.0,475.0,1200.0,6700.0,3.0,5.0,1.0,5.0,5.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122410 entries, 0 to 122409
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   time_left         122410 non-null  float64
 1   ct_score          122410 non-null  float64
 2   t_score           121386 non-null  float64
 3   map               122199 non-null  object 
 4   bomb_planted      122410 non-null  bool   
 5   ct_health         122410 non-null  float64
 6   t_health          121627 non-null  float64
 7   ct_armor          122410 non-null  float64
 8   t_armor           122410 non-null  float64
 9   ct_money          122410 non-null  float64
 10  t_money           122410 non-null  float64
 11  ct_helmets        121766 non-null  float64
 12  t_helmets         121560 non-null  float64
 13  ct_defuse_kits    121766 non-null  float64
 14  ct_players_alive  122410 non-null  float64
 15  t_players_alive   122182 non-null  float64
dtypes: bool(1), float64(

### Пропущенные значения

In [4]:
df.isnull().sum()

time_left              0
ct_score               0
t_score             1024
map                  211
bomb_planted           0
ct_health              0
t_health             783
ct_armor               0
t_armor                0
ct_money               0
t_money                0
ct_helmets           644
t_helmets            850
ct_defuse_kits       644
ct_players_alive       0
t_players_alive      228
dtype: int64

### Изменения типов данных с float64 на int32

In [5]:
name_columns_type_float = ["t_score", "ct_score", "ct_health", "t_health", "ct_armor",
                           "t_armor", "ct_money", "t_money", "ct_defuse_kits", "ct_helmets",
                           "t_helmets", "ct_players_alive", "t_players_alive"]

for name in name_columns_type_float:
    df[name] = df[name].astype("Int32")

### Заполенение пропущенных значений

1. Если это целое число, то замена на median
2. Если это число с плавающей точкой, то замена на mean
3. Если это дискретное значение, то замена на mode

In [6]:
df['t_score'] = df['t_score'].fillna(6)
df['map'] = df['map'].fillna('de_inferno')
df['t_health'] = df['t_health'].fillna(500)
df['ct_helmets'] = df['ct_helmets'].fillna(2.0)
df['t_helmets'] = df['t_helmets'].fillna(3.0)
df['t_players_alive'] = df['t_players_alive'].fillna(5.0)
df['ct_defuse_kits'] = df['ct_defuse_kits'].fillna(0.0)

In [7]:
dict_pathch = {
    'de_dust2': 0,
    'de_mirage': 1,
    'de_inferno': 2,
    'de_nuke': 3,
    'de_overpass': 4,
    'de_vertigo': 5,
    'de_train': 6,
    'de_cache': 7
}
df['map'] = df['map'].map(dict_pathch)


In [8]:
dict_pathch = {
    False: 0,
    True: 1
}
df['bomb_planted'] = df['bomb_planted'].map(dict_pathch)

### Удаление дубликатов

In [9]:
df = df.drop_duplicates().reset_index(drop=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116693 entries, 0 to 116692
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   time_left         116693 non-null  float64
 1   ct_score          116693 non-null  Int32  
 2   t_score           116693 non-null  Int32  
 3   map               116693 non-null  int64  
 4   bomb_planted      116693 non-null  int64  
 5   ct_health         116693 non-null  Int32  
 6   t_health          116693 non-null  Int32  
 7   ct_armor          116693 non-null  Int32  
 8   t_armor           116693 non-null  Int32  
 9   ct_money          116693 non-null  Int32  
 10  t_money           116693 non-null  Int32  
 11  ct_helmets        116693 non-null  Int32  
 12  t_helmets         116693 non-null  Int32  
 13  ct_defuse_kits    116693 non-null  Int32  
 14  ct_players_alive  116693 non-null  Int32  
 15  t_players_alive   116693 non-null  Int32  
dtypes: Int32(13), float6

In [11]:
df.describe()

Unnamed: 0,time_left,ct_score,t_score,map,bomb_planted,ct_health,t_health,ct_armor,t_armor,ct_money,t_money,ct_helmets,t_helmets,ct_defuse_kits,ct_players_alive,t_players_alive
count,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0,116693.0
mean,94.148813,6.776122,6.830555,2.592169,0.117265,407.939337,398.666638,319.574088,304.309153,10014.031262,11515.060029,2.13419,2.887637,1.651264,4.23956,4.232062
std,53.020753,4.803559,4.813609,1.936556,0.321737,134.06513,141.387219,169.175948,173.166751,11269.167104,12211.614541,1.824536,1.962226,1.614035,1.22394,1.246542
min,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,54.9,3.0,3.0,1.0,0.0,333.0,309.0,196.0,181.0,1300.0,1650.0,0.0,1.0,0.0,4.0,4.0
50%,94.87,6.0,6.0,2.0,0.0,500.0,500.0,384.0,356.0,6000.0,7750.0,2.0,3.0,1.0,5.0,5.0
75%,114.96,10.0,10.0,4.0,0.0,500.0,500.0,487.0,471.0,15050.0,18350.0,4.0,5.0,3.0,5.0,5.0
max,175.0,32.0,33.0,7.0,1.0,500.0,600.0,500.0,500.0,80000.0,80000.0,5.0,5.0,5.0,5.0,6.0


## Выделение на обучающую и тестовую выборки

Так как целевой признак бинарный, то производится балансировка классов. Делается это приведением числа мажорного класса к минорному. Затем в выборке надо сохраняем пропорции.

In [12]:
X = StandardScaler().fit_transform(df.drop(['bomb_planted'], axis=1))
X = pd.DataFrame(X).values
Y = df['bomb_planted'].values
rus = RandomUnderSampler()

X_resampled, y_resampled = rus.fit_resample(X, Y)

y_resampled = y_resampled.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, train_size=0.3, random_state=42, stratify=y_resampled)

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8210, 15), (19158, 15), (8210, 1), (19158, 1))

In [14]:
from torch.utils.data import TensorDataset, DataLoader

## Преобразование датасета с использованием TensorDataSet и DataLoader

In [15]:
train_ds = TensorDataset(torch.from_numpy(X_train).type(torch.float32),
                         torch.from_numpy(y_train).type(torch.float32))
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

test_ds = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
test_dl = DataLoader(test_ds, batch_size=256, shuffle=True)

## Разбиение на батчи

In [16]:
x_c, y_c = next(iter(train_dl))
x_c.shape, y_c.shape

(torch.Size([256, 15]), torch.Size([256, 1]))

## Создание полносвязной нейронной сети с помощью PyTorch

In [17]:
import torch.nn as nn

In [18]:
class MyRegressionModel(nn.Module):
    # любая модель в PyTorch - это набор слоев
    # при этом, мы сами определяем порядок их выполнения
    # в конструкторе мы задаем набор слоев с указанием параметров
    def __init__(self):
        super(MyRegressionModel, self).__init__()
        # определяем первый линейный слой (y = wx + b)
        self.first_linear = nn.Linear(15, 120)
        # определяем первый слой ReLU
        self.relu = nn.ReLU()
        self.sigmoida = nn.Sigmoid()
        self.second_linear = nn.Linear(120, 240)
        self.third_linear = nn.Linear(240, 60)
        self.fourth_linear = nn.Linear(60, 20)
        self.fifth_linear = nn.Linear(20, 1)

    # в методе forward мы определяем, как слои будут связаны друг с другом
    def forward(self, x):
        # y - результат выполнения первого слоя
        y = self.first_linear(x)
        # в теперь продолжаем накидывать оставшиеся слои
        y = self.relu(y)
        y = self.second_linear(y)
        y = self.relu(y)
        y = self.third_linear(y)
        y = self.relu(y)
        y = self.fourth_linear(y)
        y = self.relu(y)
        y = self.fifth_linear(y)
        y = self.sigmoida(y)
        return y

In [19]:
model = MyRegressionModel()

In [20]:
print(model)

MyRegressionModel(
  (first_linear): Linear(in_features=15, out_features=120, bias=True)
  (relu): ReLU()
  (sigmoida): Sigmoid()
  (second_linear): Linear(in_features=120, out_features=240, bias=True)
  (third_linear): Linear(in_features=240, out_features=60, bias=True)
  (fourth_linear): Linear(in_features=60, out_features=20, bias=True)
  (fifth_linear): Linear(in_features=20, out_features=1, bias=True)
)


### Функция потерь

In [21]:
loss = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0025)

### Обучение

In [22]:
epochs = 50
for epoch in range(epochs):
    for x_b, y_b in train_dl:
        outputs = model(x_b)
        loss_value = loss(outputs, y_b)
        loss_value.backward()
        optimizer.step()
        optimizer.zero_grad()

    # в конце эпохи выводим значение функции потерь для последнего рассмотренного батча
    print(f'Эпоха {epoch + 1}, Значение функции потерь: {loss_value.item()}')

Эпоха 1, Значение функции потерь: 0.09122172743082047
Эпоха 2, Значение функции потерь: 0.09685184806585312
Эпоха 3, Значение функции потерь: 0.06952374428510666
Эпоха 4, Значение функции потерь: 0.006018286570906639
Эпоха 5, Значение функции потерь: 0.0020388267002999783
Эпоха 6, Значение функции потерь: 0.09074535965919495
Эпоха 7, Значение функции потерь: 0.13692475855350494
Эпоха 8, Значение функции потерь: 0.00478344177827239
Эпоха 9, Значение функции потерь: 0.07327857613563538
Эпоха 10, Значение функции потерь: 0.0022220718674361706
Эпоха 11, Значение функции потерь: 0.04111183062195778
Эпоха 12, Значение функции потерь: 0.09083342552185059
Эпоха 13, Значение функции потерь: 0.007106516044586897
Эпоха 14, Значение функции потерь: 0.09611594676971436
Эпоха 15, Значение функции потерь: 0.05016898363828659
Эпоха 16, Значение функции потерь: 0.13284900784492493
Эпоха 17, Значение функции потерь: 0.0022261354606598616
Эпоха 18, Значение функции потерь: 0.0012927378993481398
Эпоха 19,

### Проверка

In [23]:
y_pred = model(torch.from_numpy(X_test).type(torch.float32))
y_pred = torch.round(y_pred)
# для преобразования тензора в массив numpy используем функцию numpy()
# но поскольку y_pred у нас требует градиент (requires_grad), предварительно используем функцию detach()
# она удаляет градиент
print(classification_report(y_test, y_pred.detach().numpy()))

              precision    recall  f1-score   support

           0       0.97      0.91      0.94      9579
           1       0.91      0.97      0.94      9579

    accuracy                           0.94     19158
   macro avg       0.94      0.94      0.94     19158
weighted avg       0.94      0.94      0.94     19158

