# Лабораторная №1 по NLP. Создание полносвязных нейронных сетей для решения задач **регрессии** и **классификации**

## Иморты

In [33]:
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## Задача регрессии

In [35]:
data_reg = pd.read_csv('mumbai_houses_task_filtred.csv')
data_reg

Unnamed: 0,price,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building
0,22400000,629,19.032800,72.896357,2,2,0,0,1,0,0,0,0
1,35000000,974,19.032800,72.896357,3,2,0,0,1,0,0,0,0
2,31700000,968,19.085600,72.909277,3,3,0,0,1,0,0,0,0
3,18700000,629,19.155756,72.846862,2,2,2,1,1,2,0,2,0
4,13500000,1090,19.177555,72.849887,2,2,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6232,19500000,810,19.138320,72.810020,2,2,0,1,0,0,2,0,0
6233,22000000,1400,19.221920,72.854250,3,3,1,1,0,1,0,1,0
6234,20000000,750,19.144320,72.824111,2,2,0,1,0,0,0,0,0
6235,11000000,700,19.047201,72.872225,2,2,0,1,0,0,1,0,0


In [36]:
data_reg['price'].describe()

count    6.237000e+03
mean     2.686215e+07
std      2.792125e+07
min      1.500000e+06
25%      1.300000e+07
50%      1.900000e+07
75%      3.000000e+07
max      3.600000e+08
Name: price, dtype: float64

Датасет полностью предобработан, но в нем присутствуют идентичные друг другу признаки, поэтому они будут удалены из датасета.

In [37]:
mask_dup = data_reg.T.duplicated(keep='first')
drop_cols = data_reg.columns[mask_dup].tolist()
data_reg = data_reg.loc[:, ~mask_dup]

print("удалены:", drop_cols)
data_reg

удалены: ['parking', 'lift']


Unnamed: 0,price,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,furnished_status,type_of_building
0,22400000,629,19.032800,72.896357,2,2,0,0,1,0,0
1,35000000,974,19.032800,72.896357,3,2,0,0,1,0,0
2,31700000,968,19.085600,72.909277,3,3,0,0,1,0,0
3,18700000,629,19.155756,72.846862,2,2,2,1,1,0,0
4,13500000,1090,19.177555,72.849887,2,2,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6232,19500000,810,19.138320,72.810020,2,2,0,1,0,2,0
6233,22000000,1400,19.221920,72.854250,3,3,1,1,0,0,0
6234,20000000,750,19.144320,72.824111,2,2,0,1,0,0,0
6235,11000000,700,19.047201,72.872225,2,2,0,1,0,1,0


Также у целевого признака очень скошено распределение, поэтому применю логарифмирование

In [38]:
X_reg = data_reg.drop(columns=['price']).to_numpy()
y_reg = data_reg['price'].to_numpy()
y_reg = y_reg.reshape(-1, 1)
y_reg = np.log1p(y_reg)

In [39]:
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.05)
X_reg_train.shape, X_reg_test.shape, y_reg_train.shape, y_reg_test.shape

((5925, 10), (312, 10), (5925, 1), (312, 1))

In [40]:
scaler_X = StandardScaler()
X_reg_train = scaler_X.fit_transform(X_reg_train)
X_reg_test  = scaler_X.transform(X_reg_test)

In [41]:
train_reg_ds = TensorDataset(torch.from_numpy(X_reg_train).type(torch.float32), torch.from_numpy(y_reg_train).type(torch.float32)) 
train_reg_dl = DataLoader(train_reg_ds, batch_size=256, num_workers=4, shuffle=True)
test_reg_ds = TensorDataset(torch.from_numpy(X_reg_test), torch.from_numpy(y_reg_test)) 
test_reg_dl = DataLoader(test_reg_ds, batch_size=256, num_workers=4, shuffle=False)

### Создание модели

In [None]:
class MyRegressionModel(nn.Module): 
    def __init__(self, input_size: int): 
        super(MyRegressionModel, self).__init__() 
        self.first_reg_linear = nn.Linear(in_features=input_size, out_features=64) 
        self.first_reg_relu = nn.ReLU() 
        self.first_reg_dropout = nn.Dropout(p=0.2)
        
        self.second_reg_linear = nn.Linear(in_features=64, out_features=32) 
        self.second_reg_relu = nn.ReLU() 
        self.second_reg_dropout = nn.Dropout(p=0.3)
        
        self.third_reg_linear = nn.Linear(in_features=32, out_features=16) 
        self.third_reg_relu = nn.ReLU() 
        self.third_reg_dropout = nn.Dropout(p=0.2)
        
        self.fourth_reg_linear = nn.Linear(in_features=16, out_features=1)
    
    def forward(self, x): 
        y = self.first_reg_linear(x) 
        y = self.first_reg_relu(y) 
        y = self.first_reg_dropout(y)
        
        y = self.second_reg_linear(y) 
        y = self.second_reg_relu(y) 
        y = self.second_reg_dropout(y)
        
        y = self.third_reg_linear(y) 
        y = self.third_reg_relu(y) 
        y = self.third_reg_dropout(y)
        
        y = self.fourth_reg_linear(y)
        return y

In [44]:
model = MyRegressionModel(input_size=X_reg_train.shape[1])
model

MyRegressionModel(
  (first_reg_linear): Linear(in_features=10, out_features=32, bias=True)
  (first_reg_relu): ReLU()
  (second_reg_linear): Linear(in_features=32, out_features=64, bias=True)
  (second_reg_relu): ReLU()
  (third_reg_linear): Linear(in_features=64, out_features=32, bias=True)
  (third_reg_relu): ReLU()
  (fourth_reg_linear): Linear(in_features=32, out_features=1, bias=True)
)

In [45]:
loss = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0025)

In [46]:
epochs = 100
for epoch in range(epochs):
    for x_b, y_b in train_reg_dl:
        x_b, y_b = x_b.to(device), y_b.to(device)
        outputs = model(x_b)
        loss_value = loss(outputs, y_b)
        loss_value.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f'Эпоха {epoch + 1}, Значение функции потерь: {loss_value.item()}')

Эпоха 1, Значение функции потерь: 104.2607421875
Эпоха 2, Значение функции потерь: 11.959000587463379
Эпоха 3, Значение функции потерь: 6.064947128295898
Эпоха 4, Значение функции потерь: 4.042536735534668
Эпоха 5, Значение функции потерь: 1.6825402975082397
Эпоха 6, Значение функции потерь: 1.7742981910705566
Эпоха 7, Значение функции потерь: 1.5232436656951904
Эпоха 8, Значение функции потерь: 0.8085212111473083
Эпоха 9, Значение функции потерь: 0.5985943078994751
Эпоха 10, Значение функции потерь: 0.3802011013031006
Эпоха 11, Значение функции потерь: 0.4154699742794037
Эпоха 12, Значение функции потерь: 0.49691689014434814
Эпоха 13, Значение функции потерь: 0.3936218023300171
Эпоха 14, Значение функции потерь: 0.4123882055282593
Эпоха 15, Значение функции потерь: 0.518903911113739
Эпоха 16, Значение функции потерь: 0.5226092338562012
Эпоха 17, Значение функции потерь: 0.27432602643966675
Эпоха 18, Значение функции потерь: 0.15652994811534882
Эпоха 19, Значение функции потерь: 0.2359

In [50]:
y_pred = model(torch.from_numpy(X_reg_test).to(device).type(torch.float32)).cpu()
y_pred = y_pred.detach().numpy()
y_pred = np.expm1(y_pred)
y_true = np.expm1(y_reg_test)

from sklearn.metrics import r2_score
r2_score(y_true, y_pred)

0.6047986095010613