In [1]:
# Задание на повторение материала предыдущего семестра

In [24]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR, SVC
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error, f1_score, silhouette_score

In [25]:
# Генерируем уникальный seed
my_code = "Sokolov"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

In [26]:
# Данные загружены отсюда: https://www.kaggle.com/dwdkills/russian-demography
# Читаем данные из файла
example_data = pd.read_csv("datasets/russian_demography.csv")

In [27]:
# "year" - год (1990-2017)
# "region" - название региона
# "npg" - естественный прирост населения на 1000 человек
# "birth_rate" - количество рождений на 1000 человек
# "death_rate" - количество смертей на 1000 человек
# "gdw" - коэффициент демографической нагрузки на 100 человек (Отношение числа нетрудоспособных к числу трудоспособных).
# "urbanization" - процент городского населения

example_data.head()

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
0,1990,Republic of Adygea,1.9,14.2,12.3,84.66,52.42
1,1990,Altai Krai,1.8,12.9,11.1,80.24,58.07
2,1990,Amur Oblast,7.6,16.2,8.6,69.55,68.37
3,1990,Arkhangelsk Oblast,3.7,13.5,9.8,73.26,73.63
4,1990,Astrakhan Oblast,4.7,15.1,10.4,77.05,68.01


In [28]:
# Так как список регионов меняется от года к году, в данных есть строки без значений. Удалим их
example_data.dropna(inplace=True)

In [29]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(example_data))
print(val_test_size)

463


In [30]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(example_data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

1389 463 463


In [31]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
columns_to_scale = ['year', 'npg', 'birth_rate', 'death_rate', 'gdw', 'urbanization']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['year', 'npg', 'birth_rate', 'death_rate',
                                  'gdw', 'urbanization'])])

In [32]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [33]:
# Устанавливаем названия столбцов
column_names = columns_to_scale + ['region']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [34]:
sc_train

Unnamed: 0,year,npg,birth_rate,death_rate,gdw,urbanization,region
0,0.481481,0.226368,0.130802,0.59447,0.490868,0.675527,Volgograd Oblast
1,0.814815,0.422886,0.367089,0.488479,0.428248,0.628126,Omsk Oblast
2,0.481481,0.196517,0.105485,0.62212,0.379022,1.0,Saint Petersburg
3,0.074074,0.276119,0.130802,0.502304,0.804488,0.591463,Smolensk Oblast
4,0.777778,0.323383,0.223629,0.516129,0.440077,0.742045,Samara Oblast
...,...,...,...,...,...,...,...
1384,0.62963,0.310945,0.236287,0.552995,0.295356,0.511588,Mari El Republic
1385,0.925926,0.848259,0.71308,0.078341,0.707949,0.144952,Chechen Republic
1386,0.148148,0.50995,0.362869,0.322581,0.770395,0.181223,Republic of Kalmykia
1387,1.0,0.236318,0.122363,0.56682,0.688642,0.634673,Smolensk Oblast


In [35]:
n = 4
labels = random.sample(columns_to_scale, n)

y_label = labels[0]
x_labels = labels[1:]

print(x_labels)
print(y_label)

['birth_rate', 'npg', 'gdw']
urbanization


In [36]:
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_label]
y_test = sc_test[y_label]
y_val = sc_val[y_label]

In [38]:
x_train

Unnamed: 0,birth_rate,npg,gdw
0,0.130802,0.226368,0.490868
1,0.367089,0.422886,0.428248
2,0.105485,0.196517,0.379022
3,0.130802,0.276119,0.804488
4,0.223629,0.323383,0.440077
...,...,...,...
1384,0.236287,0.310945,0.295356
1385,0.71308,0.848259,0.707949
1386,0.362869,0.50995,0.770395
1387,0.122363,0.236318,0.688642


In [37]:
score_list = []
model_list = [
              SVR( kernel='linear', C=0.8),
              SVR(kernel='poly', degree=3, C=1.0),
              SVR(kernel='rbf', C=1.0),
              GradientBoostingClassifier(**grid.best_params_)
              ]
for i in range(len(model_list)):
    model_list[i].fit(x_train, y_train)
    score_list.append(model_list[i].score(x_val, y_val))
    print(score_list[i])

0.2799842469826075
0.2575827960274488
0.34362484887797395


ValueError: Unknown label type: 'unknown'