In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Load the macro table; the first CSV column is parsed as datetimes.
macro_df = pd.read_csv(
    'data/macro.csv',
    parse_dates=[0],
)

In [3]:
# Display the freshly loaded frame; the recorded output is a truncated
# preview (first and last rows, with elided columns).
macro_df

Unnamed: 0,timestamp,oil_urals,gdp_quart,gdp_quart_growth,cpi,ppi,gdp_deflator,balance_trade,balance_trade_growth,usdrub,...,provision_retail_space_modern_sqm,turnover_catering_per_cap,theaters_viewers_per_1000_cap,seats_theather_rfmin_per_100000_cap,museum_visitis_per_100_cap,bandwidth_sports,population_reg_sports_share,students_reg_sports_share,apartment_build,apartment_fund_sqm
0,2010-01-01,76.1000,,,,,,,,,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
1,2010-01-02,76.1000,,,,,,,,,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
2,2010-01-03,76.1000,,,,,,,,,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
3,2010-01-04,76.1000,,,,,,,,29.9050,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
4,2010-01-05,76.1000,,,,,,,,29.8360,...,690.0,6221.0,527.0,0.41,993.0,,,63.03,22825.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,2016-10-15,44.3677,19979.4,-0.6,531.0,601.9,133.16,5.823,2.6,62.9573,...,,,,,,,,,,
2480,2016-10-16,44.3677,19979.4,-0.6,531.0,601.9,133.16,5.823,2.6,62.9573,...,,,,,,,,,,
2481,2016-10-17,44.3677,19979.4,-0.6,531.0,601.9,133.16,5.823,2.6,63.0856,...,,,,,,,,,,
2482,2016-10-18,44.3677,19979.4,-0.6,531.0,601.9,133.16,5.823,2.6,62.9512,...,,,,,,,,,,


In [4]:
# Expand the datetime column into numeric year / month / day columns,
# then remove the original column from the frame (in place).
# Note: this cell is not re-runnable on the same frame, because
# 'timestamp' no longer exists after the first execution.
for _part in ('year', 'month', 'day'):
    macro_df[f'timestamp_{_part}'] = getattr(macro_df['timestamp'].dt, _part)
macro_df.drop(columns='timestamp', inplace=True)

In [5]:
# Preview the columns pandas still stores as `object`, i.e. the ones
# that were not parsed as numbers when the CSV was read.
macro_df.select_dtypes(include='object').head(n=5)

Unnamed: 0,child_on_acc_pre_school,modern_education_share,old_education_build_share
0,45713,,
1,45713,,
2,45713,,
3,45713,,
4,45713,,


In [6]:
# List the distinct raw values to see how this column is encoded.
macro_df.loc[:, 'child_on_acc_pre_school'].unique()

array(['45,713', '#!', '7,311', '3,013', '16,765', nan], dtype=object)

In [7]:
# Convert the string-encoded counts (e.g. '45,713') to floats.
#
# The comma is a thousands separator here, so it is stripped before parsing.
# Anything that is not a number after stripping (such as the '#!' marker, or
# a missing value) becomes NaN via errors='coerce'.
#
# Why not a hand-written lookup table:
#   * `np.NaN` was removed in NumPy 2.0, so a dict keyed on it fails there.
#   * a lookup silently turns any value that is not listed into NaN.
#   * re-running the cell on the already-converted column would wipe it.
# Going through `astype(str)` keeps this cell idempotent: numeric values
# survive a second run unchanged.
preschool_text = macro_df['child_on_acc_pre_school'].astype(str)
macro_df['child_on_acc_pre_school'] = pd.to_numeric(
    preschool_text.str.replace(',', '', regex=False),
    errors='coerce',
)
macro_df['child_on_acc_pre_school'].unique()

array([45713.,    nan,  7311.,  3013., 16765.])

In [8]:
# Re-check which columns are still stored as `object` after the conversion.
macro_df.select_dtypes(include='object').head(n=5)

Unnamed: 0,modern_education_share,old_education_build_share
0,,
1,,
2,,
3,,
4,,


In [9]:
# List the distinct raw values to see how this column is encoded.
macro_df.loc[:, 'modern_education_share'].unique()

array([nan, '90,92', '93,08', '95,4918'], dtype=object)

In [10]:
# Convert the string-encoded shares (e.g. '90,92') to floats.
#
# The comma is a decimal separator here, so it is replaced by a dot before
# parsing. Missing or unparseable entries become NaN via errors='coerce'.
#
# This replaces a hand-written lookup table that relied on `np.NaN`
# (removed in NumPy 2.0), silently mapped any unlisted value to NaN, and
# wiped the column if the cell was run a second time. Going through
# `astype(str)` keeps the cell idempotent on an already-numeric column.
modern_share_text = macro_df['modern_education_share'].astype(str)
macro_df['modern_education_share'] = pd.to_numeric(
    modern_share_text.str.replace(',', '.', regex=False),
    errors='coerce',
)
macro_df['modern_education_share'].unique()

array([    nan, 90.92  , 93.08  , 95.4918])

In [11]:
# Re-check which columns are still stored as `object` after the conversion.
macro_df.select_dtypes(include='object').head(n=5)

Unnamed: 0,old_education_build_share
0,
1,
2,
3,
4,


In [12]:
# List the distinct raw values to see how this column is encoded.
macro_df.loc[:, 'old_education_build_share'].unique()

array([nan, '23,14', '25,47', '8,2517'], dtype=object)

In [13]:
# Convert the string-encoded shares (e.g. '23,14') to floats.
#
# The comma is a decimal separator here, so it is replaced by a dot before
# parsing. Missing or unparseable entries become NaN via errors='coerce'.
#
# This replaces a hand-written lookup table that relied on `np.NaN`
# (removed in NumPy 2.0), silently mapped any unlisted value to NaN, and
# wiped the column if the cell was run a second time. Going through
# `astype(str)` keeps the cell idempotent on an already-numeric column.
old_share_text = macro_df['old_education_build_share'].astype(str)
macro_df['old_education_build_share'] = pd.to_numeric(
    old_share_text.str.replace(',', '.', regex=False),
    errors='coerce',
)
macro_df['old_education_build_share'].unique()

array([    nan, 23.14  , 25.47  ,  8.2517])

In [18]:
# Two-step preprocessing pipeline: standardise every column, then fill the
# remaining gaps with a k-nearest-neighbours imputer (default settings).
# Scaling comes first so the imputer works on the standardised columns.
model = make_pipeline(StandardScaler(), KNNImputer())

# Fit both steps on the full macro frame; the fitted pipeline is the cell output.
model.fit(macro_df)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('knnimputer', KNNImputer())])

In [19]:
# Build the scaled + imputed frame directly from the transformed array,
# reusing the original row index and column labels.
#
# Writing the float array into a copy with `frame[:] = array` depends on
# silently upcasting the integer columns (timestamp_year / month / day),
# which pandas >= 2.1 deprecates. Constructing a new frame avoids any
# dtype coercion during assignment.
macro_df_new = pd.DataFrame(
    model.transform(macro_df),
    index=macro_df.index,
    columns=macro_df.columns,
)

In [23]:
# Standardise every column (no imputation) into a new frame that keeps the
# original row index and column labels.
#
# The frame is constructed from the transformed array rather than written
# into a copy with `frame[:] = array`: that pattern depends on silently
# upcasting the integer columns (timestamp_year / month / day), which
# pandas >= 2.1 deprecates.
scaler = StandardScaler()
macro_df_scaled = pd.DataFrame(
    scaler.fit_transform(macro_df),
    index=macro_df.index,
    columns=macro_df.columns,
)

In [4]:
# Count missing values per column and show them as a {column: count} dict.
# NOTE(review): the recorded output still lists 'timestamp', so this cell
# appears to have been executed before that column was dropped; re-run the
# notebook top to bottom to refresh the output.
dict(macro_df.isna().sum(axis=0))

{'timestamp': 0,
 'oil_urals': 0,
 'gdp_quart': 90,
 'gdp_quart_growth': 90,
 'cpi': 31,
 'ppi': 31,
 'gdp_deflator': 365,
 'balance_trade': 31,
 'balance_trade_growth': 90,
 'usdrub': 3,
 'eurrub': 3,
 'brent': 3,
 'net_capital_export': 396,
 'gdp_annual': 0,
 'gdp_annual_growth': 0,
 'average_provision_of_build_contract': 0,
 'average_provision_of_build_contract_moscow': 365,
 'rts': 10,
 'micex': 10,
 'micex_rgbi_tr': 10,
 'micex_cbi_tr': 1,
 'deposits_value': 0,
 'deposits_growth': 31,
 'deposits_rate': 414,
 'mortgage_value': 0,
 'mortgage_growth': 365,
 'mortgage_rate': 0,
 'grp': 658,
 'grp_growth': 1023,
 'income_per_cap': 293,
 'real_dispos_income_per_cap_growth': 658,
 'salary': 293,
 'salary_growth': 658,
 'fixed_basket': 0,
 'retail_trade_turnover': 293,
 'retail_trade_turnover_per_cap': 293,
 'retail_trade_turnover_growth': 293,
 'labor_force': 293,
 'unemployment': 293,
 'employment': 293,
 'invest_fixed_capital_per_cap': 293,
 'invest_fixed_assets': 293,
 'profitable_ent