## Загрузим нужные библиотеки

In [157]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

In [125]:
# Read the training and test CSV files from the local ./content directory.
df = pd.read_csv("./content/train_dataset_train.csv")
dft = pd.read_csv("./content/test_dataset_test.csv")

In [137]:
#df.tail()

In [119]:
# Remember how many rows belong to the training set so the combined frame
# can be cut back into its two parts later.
len_df = len(df)
# Stack training and test rows into one frame with a fresh 0..n-1 index.
dfres = pd.concat([df, dft], ignore_index=True)
# Replace every missing value in the combined frame with 0.
dfres = dfres.fillna(0)

In [120]:
def convert_to_dict(df):
    """Assign a consecutive integer code to every distinct value.

    Codes start at 0 and follow the order in which the values are first
    encountered while iterating over ``df``.

    Args:
        df: Iterable of hashable values.

    Returns:
        dict mapping each distinct value to its integer code.
    """
    # dict.fromkeys keeps only the first occurrence of each key, in order.
    first_seen = dict.fromkeys(df)
    return {value: code for code, value in enumerate(first_seen)}

In [121]:
def one_hot_encoding(df, col_name):
    """Append one-hot indicator columns for ``df[col_name]``.

    Every distinct value of the column gets an integer code (see
    ``convert_to_dict``) and a new ``int8`` column named
    ``col_name + str(code)`` that is 1 in the rows holding that value and
    0 elsewhere. The source column itself is left in place.

    Args:
        df: DataFrame containing ``col_name``.
        col_name: Name of the column to encode.

    Returns:
        Tuple ``(encoded_df, codes)``: a new DataFrame made of the original
        columns followed by the indicator columns, and the value -> code
        mapping that was used.
    """
    codes = convert_to_dict(df[col_name])
    n_rows, n_codes = len(df), len(codes)

    # Integer code of every row, in row order.
    row_codes = np.fromiter(
        (codes[value] for value in df[col_name]), dtype=np.intp, count=n_rows
    )

    # Fill the whole indicator matrix in one vectorised assignment instead
    # of writing one cell per row through .iloc, which is very slow on a
    # frame with thousands of columns.
    indicators = np.zeros((n_rows, n_codes), dtype=np.int8)
    indicators[np.arange(n_rows), row_codes] = 1

    new_columns = pd.DataFrame(
        indicators,
        columns=[col_name + str(code) for code in range(n_codes)],
        index=df.index,
    )
    return pd.concat([df, new_columns], axis=1), codes

In [126]:
# One-hot encode the categorical columns one after another. Each call
# appends the indicator columns to dfres and returns the value -> code
# mapping that was used for that column.
dfres, codes_place = one_hot_encoding(dfres, 'Место')
dfres, codes_streets = one_hot_encoding(dfres, 'Улица')
dfres, codes_building = one_hot_encoding(dfres, 'Дом')
dfres, codes_road = one_hot_encoding(dfres, 'Дорога')
dfres, codes_kilometer = one_hot_encoding(dfres, 'Километр')
dfres, codes_meter = one_hot_encoding(dfres, 'Метр')
dfres, codes_vid_dtp = one_hot_encoding(dfres, 'Вид ДТП')
# Derive day, month and hour as fixed-position string slices of the date and
# time columns (assumes "DD?MM..." dates and "HH..." times -- TODO confirm
# against the CSV), then one-hot encode them as well.
dfres["День"] = dfres["Дата"].str[:2]
dfres["Месяц"] = dfres["Дата"].str[3:5]
dfres["Час"] = dfres["Время"].str[:2]
dfres, codes_day = one_hot_encoding(dfres, 'День')
dfres, codes_month = one_hot_encoding(dfres, 'Месяц')
dfres, codes_hour = one_hot_encoding(dfres, 'Час')

In [129]:
# Move the record id into the index so it is no longer a data column.
dfres = dfres.set_index('id')
# Drop the raw columns that have been replaced by one-hot indicator columns.
dfres = dfres.drop(['Дата','Время','Место', 'Улица', 'Дом', 'Дорога', 'Километр', 'Метр', 'Вид ДТП','День', 'Месяц','Час'], axis='columns')

In [130]:
# Print the full column listing (name and dtype) of the encoded frame.
dfres.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42559 entries, 490103984 to 490109680
Data columns (total 3737 columns):
 #     Column         Dtype  
---    ------         -----  
 0     Погибло        float64
 1     Погибло детей  float64
 2     Ранено         float64
 3     Ранено детей   float64
 4     Место0         int8   
 5     Место1         int8   
 6     Место2         int8   
 7     Место3         int8   
 8     Место4         int8   
 9     Место5         int8   
 10    Место6         int8   
 11    Место7         int8   
 12    Место8         int8   
 13    Место9         int8   
 14    Место10        int8   
 15    Место11        int8   
 16    Место12        int8   
 17    Место13        int8   
 18    Место14        int8   
 19    Место15        int8   
 20    Место16        int8   
 21    Место17        int8   
 22    Место18        int8   
 23    Место19        int8   
 24    Место20        int8   
 25    Место21        int8   
 26    Место22        int8   
 27    

In [131]:
# Split the combined frame back into its training and test parts.
def split_back(df, len_df):
    """Cut ``df`` row-wise at position ``len_df``.

    Args:
        df: DataFrame holding the training rows followed by the test rows.
        len_df: Number of leading rows that belong to the first part.

    Returns:
        Tuple ``(first_part, second_part)`` where ``first_part`` holds the
        first ``len_df`` rows and ``second_part`` holds the remaining rows;
        both keep all columns.
    """
    head = df.iloc[:len_df]
    tail = df.iloc[len_df:]
    return head, tail

In [133]:
# The first len_df rows came from the training CSV, the rest from the test CSV.
data_df, test_df = split_back(dfres, len_df)

In [138]:
# Split the data into training and validation parts (80% / 20%, fixed seed)
# so that model quality can be checked on held-out rows.
train_df, valid_df = train_test_split(data_df, test_size=0.2, random_state=43)

In [140]:
#data_df.tail() # 490079894

In [139]:
#train_df.shape, valid_df.shape

((28616, 3737), (7155, 3737))

In [141]:
def get_features_labels(df, n_labels=4):
    """Split a frame column-wise into features and labels.

    The first ``n_labels`` columns are treated as targets and every column
    after them as a feature.

    Args:
        df: DataFrame whose leading columns are the targets.
        n_labels: Number of leading target columns. Defaults to 4, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(features, labels)`` of DataFrames with the same rows as
        ``df``.
    """
    features = df.iloc[:, n_labels:]
    labels = df.iloc[:, :n_labels]
    return features, labels

In [142]:
# Separate the training frame into its feature columns and target columns.
x_train, y_train = get_features_labels(train_df)

(28616, 3733)

## Построим модель

In [158]:
class FloatDataset(Dataset):
    """Dataset yielding one feature vector and four scalar targets per row.

    Args:
        x: DataFrame of numeric features, one row per sample.
        y: DataFrame of numeric targets with the same number of rows; its
            first four columns are returned, in column order, as
            ``label1`` .. ``label4``.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y
        # Convert to NumPy once. Building two pandas Series from a wide
        # frame on every __getitem__ call dominated the per-sample cost.
        self._features = x.to_numpy()
        self._labels = y.to_numpy()

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        """Return the sample at position ``index`` as a dict of float32 tensors."""
        features = torch.tensor(self._features[index], dtype=torch.float32)
        # Index the label row by position; looking up integer keys on a
        # Series that has string column labels relies on a positional
        # fallback.
        row = self._labels[index]
        label1 = torch.tensor(float(row[0]), dtype=torch.float32)
        label2 = torch.tensor(float(row[1]), dtype=torch.float32)
        label3 = torch.tensor(float(row[2]), dtype=torch.float32)
        label4 = torch.tensor(float(row[3]), dtype=torch.float32)
        return {
            'features': features,
            'label1': label1,
            'label2': label2,
            'label3': label3,
            'label4': label4,
        }


In [170]:
class MultiHeadFloatModel(nn.Module):
    """Fully connected network with a shared trunk and four scalar heads.

    The trunk is four linear layers with ReLU activations
    (``in_features`` -> 4096 -> 1024 -> 256 -> 128). Each of the four
    heads is a single linear layer mapping the 128 trunk features to one
    value.

    Args:
        in_features: Width of the input feature vector. Defaults to 3733,
            the value that was previously hard-coded, so existing
            checkpoints and call sites keep working.
    """

    def __init__(self, in_features=3733):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 4096)
        self.fc2 = nn.Linear(4096, 1024)
        self.fc3 = nn.Linear(1024, 256)
        self.fc4 = nn.Linear(256, 128)

        # One output layer per target. Attribute names are kept as they
        # are because they define the state_dict keys of saved checkpoints.
        self.out1 = nn.Linear(128, 1)
        self.out2 = nn.Linear(128, 1)
        self.out3 = nn.Linear(128, 1)
        self.out4 = nn.Linear(128, 1)

    def forward(self, x):
        """Return a 4-tuple of head outputs, each with a trailing dimension of 1."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))

        out1 = self.out1(x)
        out2 = self.out2(x)
        out3 = self.out3(x)
        out4 = self.out4(x)

        return out1, out2, out3, out4

In [171]:
# Loss function
def mse_loss_fn(outputs, targets):
    """Return the mean of the four per-head mean-squared-error losses.

    Args:
        outputs: 4-tuple of head predictions; dimension 1 of each tensor is
            squeezed away before the comparison.
        targets: 4-tuple of target tensors, one per head, in the same order.

    Returns:
        Scalar tensor: the average of the four MSE values.
    """
    pred_a, pred_b, pred_c, pred_d = outputs
    true_a, true_b, true_c, true_d = targets
    criterion = nn.MSELoss()
    pairs = ((pred_a, true_a), (pred_b, true_b), (pred_c, true_c), (pred_d, true_d))
    per_head = [criterion(pred.squeeze(1), true) for pred, true in pairs]
    return (per_head[0] + per_head[1] + per_head[2] + per_head[3]) / 4

In [172]:
# Wrap the training frames in a dataset and iterate over them in random order.
train_dataset = FloatDataset(x_train, y_train)
# NOTE(review): batch_size=1 means one forward/backward pass per sample;
# a larger batch is presumably much faster -- confirm before changing.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)
model = MultiHeadFloatModel()

In [173]:
def train(model, dataloader, optimizer, loss_fn, train_dataset, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module returning a tuple of four head outputs.
        dataloader: Yields dicts with the keys ``'features'`` and
            ``'label1'`` .. ``'label4'``.
        optimizer: Optimizer updating ``model``'s parameters.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar
            tensor.
        train_dataset: No longer used; kept so existing call sites keep
            working.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the batch losses divided by the number of batches, or 0.0
        when the dataloader yields no batches.
    """
    model.train()
    batches_seen = 0
    train_running_loss = 0.0
    # len(dataloader) is the real number of batches, including a final
    # partial one; len(dataset) / batch_size truncated it.
    for data in tqdm(dataloader, total=len(dataloader)):
        batches_seen += 1

        # Move the features and the four targets to the training device.
        features = data['features'].to(device)
        targets = tuple(
            data[key].to(device)
            for key in ('label1', 'label2', 'label3', 'label4')
        )

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(features)
        loss = loss_fn(outputs, targets)
        train_running_loss += loss.item()

        # Backpropagate and update the parameters.
        loss.backward()
        optimizer.step()

    if batches_seen == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0
    return train_running_loss / batches_seen

In [174]:
# Adam optimizer over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 10
model.to(device)

MultiHeadFloatModel(
  (fc1): Linear(in_features=3733, out_features=4096, bias=True)
  (fc2): Linear(in_features=4096, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=256, bias=True)
  (fc4): Linear(in_features=256, out_features=128, bias=True)
  (out1): Linear(in_features=128, out_features=1, bias=True)
  (out2): Linear(in_features=128, out_features=1, bias=True)
  (out3): Linear(in_features=128, out_features=1, bias=True)
  (out4): Linear(in_features=128, out_features=1, bias=True)
)

In [246]:
# В случае дополнительной тренировки
#model = MultiHeadFloatModel()
#model.load_state_dict(torch.load('outputs/multi_head_float4.pth'))
#model.to(device)
#epochs = 6

In [247]:
import os

loss_fn = mse_loss_fn
checkpoint_path = 'outputs/multi_head_float.pth'
# Create the output directory before training starts, so a missing folder
# cannot make the save fail after hours of training.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

train_loss = []
for epoch in range(epochs):
    print(f"Эпоха {epoch+1} из {epochs}")
    train_epoch_loss = train(model, train_dataloader, optimizer, loss_fn, train_dataset, device)
    train_loss.append(train_epoch_loss)
    print(f"Train Loss: {train_epoch_loss:.4f}")
    # Checkpoint after every epoch so an interrupted run keeps the weights
    # of the epochs that did finish.
    torch.save(model.state_dict(), checkpoint_path)
# Final save, which also covers the case of zero epochs.
torch.save(model.state_dict(), checkpoint_path)

Эпоха 1 из 6


100%|██████████████████████████████████████████████████████████████████████████| 28616/28616 [1:43:24<00:00,  4.61it/s]


Train Loss: 0.0813
Эпоха 2 из 6


100%|██████████████████████████████████████████████████████████████████████████| 28616/28616 [1:43:16<00:00,  4.62it/s]


Train Loss: 0.0813
Эпоха 3 из 6


100%|██████████████████████████████████████████████████████████████████████████| 28616/28616 [1:43:29<00:00,  4.61it/s]


Train Loss: 0.0813
Эпоха 4 из 6


 38%|███████████████████████████▊                                              | 10779/28616 [38:56<1:04:25,  4.61it/s]


KeyboardInterrupt: 

In [248]:
# Write the current model weights to disk.
torch.save(model.state_dict(), 'outputs/multi_head_float.pth')

In [177]:
# Separate the validation frame into its feature columns and target columns.
x_valid, y_valid = get_features_labels(valid_df)

In [178]:
valid_dataset = FloatDataset(x_valid, y_valid)
# Unshuffled, one sample per batch.
valid_dataloader = DataLoader(valid_dataset, batch_size=1)

In [234]:
# Rebuild the network and restore the saved weights.
model = MultiHeadFloatModel()
# map_location remaps the stored tensors to the current device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('outputs/multi_head_float.pth', map_location=device))
model.to(device)
# Put the module in evaluation mode.
model.eval()

MultiHeadFloatModel(
  (fc1): Linear(in_features=3733, out_features=4096, bias=True)
  (fc2): Linear(in_features=4096, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=256, bias=True)
  (fc4): Linear(in_features=256, out_features=128, bias=True)
  (out1): Linear(in_features=128, out_features=1, bias=True)
  (out2): Linear(in_features=128, out_features=1, bias=True)
  (out3): Linear(in_features=128, out_features=1, bias=True)
  (out4): Linear(in_features=128, out_features=1, bias=True)
)

In [235]:
total_all_labels = []
total_all_targets = []

# Inference only: no_grad stops autograd from recording a graph for every
# forward pass, which saves memory and time.
with torch.no_grad():
    for i, test_sample in enumerate(valid_dataloader):
        if i < 30:
            print(f"SAMPLE {i}")
        # Move the features and the four targets to the model's device.
        features = test_sample['features'].to(device)
        target1 = test_sample['label1'].to(device)
        target2 = test_sample['label2'].to(device)
        target3 = test_sample['label3'].to(device)
        target4 = test_sample['label4'].to(device)

        outputs = model(features)

        # Round each head's output to the nearest integer. Only the first
        # sample of the batch is read, which matches batch_size=1.
        all_labels = []
        for out in outputs:
            rout = out.squeeze(0).tolist()[0]
            all_labels.append(int(round(rout)))

        total_all_labels.append(all_labels)

        targets = (target1, target2, target3, target4)

        # Convert the target tensors to plain Python ints.
        all_targets = []
        for target in targets:
            all_targets.append(int(target.squeeze(0).detach().cpu()))

        total_all_targets.append(all_targets)

        # Show predictions next to ground truth for the first 30 samples.
        if i < 30:
            print(f"ALL PREDICTIONS: {all_labels}")
            print(f"GROUND TRUTHS: {all_targets}")


SAMPLE 0
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 1
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 2
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 3
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 4
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 2, 0]
SAMPLE 5
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 1, 0]
SAMPLE 6
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 7
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 8
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 9
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 10
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 11
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 12
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 0, 0]
SAMPLE 13
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0, 0, 2, 0]
SAMPLE 14
ALL PREDICTIONS: [0, 0, 0, 0]
GROUND TRUTHS: [0,

In [236]:
# Convert the per-sample lists into 2-D arrays (one row per sample, one
# column per target) so single target columns can be sliced out.
total_all_labels = np.array(total_all_labels)
total_all_targets = np.array(total_all_targets)

In [237]:
from sklearn.metrics import recall_score

# Add up the macro-averaged recall of each of the four target columns.
score = sum(
    recall_score(total_all_targets[:, head], total_all_labels[:, head], average='macro')
    for head in range(4)
)

In [238]:
# Report the mean of the four per-target recall values.
print("Recall score:", score/4)

Recall score: 0.306547619047619


In [239]:
# Display the encoded test frame.
test_df

Unnamed: 0_level_0,Погибло,Погибло детей,Ранено,Ранено детей,Место0,Место1,Место2,Место3,Место4,Место5,...,Час14,Час15,Час16,Час17,Час18,Час19,Час20,Час21,Час22,Час23
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
490078911,0.0,0.0,0.0,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
490055448,0.0,0.0,0.0,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
490054440,0.0,0.0,0.0,0.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
490037847,0.0,0.0,0.0,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
490037049,0.0,0.0,0.0,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490014641,0.0,0.0,0.0,0.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
490033115,0.0,0.0,0.0,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
490008221,0.0,0.0,0.0,0.0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
490119585,0.0,0.0,0.0,0.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [240]:
# Keep only the feature columns of the test frame; the label part is discarded.
x_test, _ = get_features_labels(test_df)
x_test.shape

(6788, 3733)

In [241]:
# Display the test feature frame.
x_test

Unnamed: 0_level_0,Место0,Место1,Место2,Место3,Место4,Место5,Место6,Место7,Место8,Место9,...,Час14,Час15,Час16,Час17,Час18,Час19,Час20,Час21,Час22,Час23
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
490078911,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
490055448,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
490054440,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
490037847,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
490037049,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490014641,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
490033115,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
490008221,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
490119585,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [242]:
class TestFloatDataset(Dataset):
    """Dataset yielding one float32 feature vector per row, without labels.

    Args:
        x: DataFrame of numeric features, one row per sample.
    """

    def __init__(self, x):
        self.x = x
        # Convert to NumPy once instead of building a pandas Series from a
        # wide frame on every __getitem__ call.
        self._features = x.to_numpy()

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        """Return ``{'features': tensor}`` for the row at position ``index``."""
        features = torch.tensor(self._features[index], dtype=torch.float32)
        return {
            'features': features,
        }

In [243]:
test_dataset = TestFloatDataset(x_test)
# The prediction loop reads only the first sample of each batch, so keep
# one sample per batch.
test_dataloader = DataLoader(test_dataset, batch_size=1)
# MultiHeadFloatModel is the network defined in this notebook; the
# previously referenced MultiHeadBinaryModel is not defined here.
model = MultiHeadFloatModel()
# NOTE(review): this loads 'multi_head_float10.pth', while the training cell
# writes 'multi_head_float.pth' -- confirm which checkpoint is intended.
# map_location lets a checkpoint written on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load('outputs/multi_head_float10.pth', map_location=device))
model.to(device)
model.eval()
predicts = []

In [244]:
# Inference only: no_grad stops autograd from recording a graph for every
# forward pass, which saves memory and time.
with torch.no_grad():
    for i, test_sample in enumerate(test_dataloader):
        features = test_sample['features'].to(device)

        outputs = model(features)

        # Round each head's output to the nearest integer. Only the first
        # sample of the batch is read, which matches batch_size=1.
        all_labels = []
        for out in outputs:
            rout = out.squeeze(0).tolist()[0]
            all_labels.append(int(round(rout)))
        predicts.append(all_labels)

In [245]:
# Display the list of per-sample predictions (four integers each).
predicts

[[0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0,

In [228]:
# The index of the test frame holds the record ids.
test_names = test_df.index

In [229]:
# Display the test record ids.
test_names

Int64Index([490078911, 490055448, 490054440, 490037847, 490037049, 490042616,
            490084051, 490009858, 490050700, 490095753,
            ...
            490022205, 490066756, 490123920, 490060727, 490119246, 490014641,
            490033115, 490008221, 490119585, 490109680],
           dtype='int64', name='id', length=6788)

In [230]:
# Build the submission table: one row per test id followed by its four
# predicted values, in the order the predictions were produced.
submit_df = pd.DataFrame(
    data=[
        [sample_id, counts[0], counts[1], counts[2], counts[3]]
        for sample_id, counts in zip(test_names, predicts)
    ],
    columns=['id', 'Погибло', 'Погибло детей', 'Ранено', 'Ранено детей'],
)

In [231]:
# Write the submission file without the DataFrame's row index.
submit_df.to_csv("RSF_submit.csv", index=False)