## Работа с тестовой выборкой

In [1]:
import numpy as np
import pandas as pd
import pickle 

from sklearn.model_selection import train_test_split,KFold,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import r2_score as r2,mean_absolute_error as mae,mean_squared_error as mse

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [55]:
# Load the test sample from test_norm.csv (path is relative to the notebook's working directory).
df = pd.read_csv('test_norm.csv')

In [56]:
# Show column dtypes and non-null counts for the freshly loaded frame.
# NOTE(review): the saved output of this cell reports 10000 rows and a Price
# column, while the later cells report 5000 rows; the execution counts are
# also out of order. The outputs look like they come from different runs --
# re-run with Restart Kernel -> Run All before relying on them.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
Id             10000 non-null int64
DistrictId     10000 non-null float64
Rooms          10000 non-null float64
Square         10000 non-null float64
LifeSquare     10000 non-null float64
Floor          10000 non-null float64
HouseFloor     10000 non-null float64
HouseYear      10000 non-null float64
Ecology_1      10000 non-null float64
Social_1       10000 non-null float64
Social_2       10000 non-null float64
Social_3       10000 non-null float64
Helthcare_2    10000 non-null float64
Shops_1        10000 non-null float64
Ecology_2      10000 non-null int64
Ecology_3      10000 non-null int64
Shops_2        10000 non-null int64
Price          10000 non-null float64
dtypes: float64(14), int64(4)
memory usage: 1.4 MB


In [39]:
# Cast these columns to float64 so that the dtype-based selection of columns
# to standardise (select_dtypes(include='float64') further down) picks them up.
columns_to_float = [
    'Floor',
    'DistrictId',
    'HouseYear',
    'Social_1',
    'Social_2',
    'Social_3',
    'Helthcare_2',
    'Shops_1',
]
for column in columns_to_float:
    df[column] = df[column].astype('float64')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 17 columns):
Id             5000 non-null int64
DistrictId     5000 non-null float64
Rooms          5000 non-null float64
Square         5000 non-null float64
LifeSquare     5000 non-null float64
Floor          5000 non-null float64
HouseFloor     5000 non-null float64
HouseYear      5000 non-null float64
Ecology_1      5000 non-null float64
Social_1       5000 non-null float64
Social_2       5000 non-null float64
Social_3       5000 non-null float64
Helthcare_2    5000 non-null float64
Shops_1        5000 non-null float64
Ecology_3      5000 non-null int64
Ecology_2      5000 non-null int64
Shops_2        5000 non-null int64
dtypes: float64(13), int64(4)
memory usage: 664.2 KB


In [40]:
# Convert the Id column to strings; the dtype shown below becomes object.
df['Id'] = df['Id'].astype(str)
df['Id'].dtype

dtype('O')

In [41]:
# Columns passed to the model, in the order they are selected from df.
feature_names = [
    'Id',
    'DistrictId',
    'Rooms',
    'Square',
    'LifeSquare',
    'Floor',
    'HouseFloor',
    'HouseYear',
    'Ecology_1',
    'Social_1',
    'Social_2',
    'Social_3',
    'Helthcare_2',
    'Shops_1',
    'Ecology_2',
    'Ecology_3',
    'Shops_2',
]

In [42]:
# Columns to standardise: every float64 feature, kept in feature_names order.
feature_names_for_stand = (
    df.loc[:, feature_names]
    .select_dtypes(include=['float64'])
    .columns
    .tolist()
)
feature_names_for_stand

['DistrictId',
 'Rooms',
 'Square',
 'LifeSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1']

In [45]:
# Restore the scaler object stored in scaler.pkl
# (presumably the scaler fitted on the training data -- confirm).
# SECURITY: pickle.load can execute arbitrary code; only open files you trust.
with open('scaler.pkl', mode='rb') as scaler_file:
    scaler_load = pickle.load(scaler_file)

In [46]:
# Apply the loaded scaler to the selected columns (transform only; nothing is fitted here).
stand_features = scaler_load.transform(df.loc[:, feature_names_for_stand])

In [47]:
# Write the standardised values back over the original columns.
# Assigning a DataFrame to columns aligns rows on the index, so the new frame
# is given df's own index explicitly. Without it the frame would get a fresh
# 0..n-1 index, and any df whose index is not exactly 0..n-1 (after filtering,
# sorting, etc.) would silently receive NaNs here.
df[feature_names_for_stand] = pd.DataFrame(
    stand_features,
    columns=feature_names_for_stand,
    index=df.index,
)

In [48]:
# Preview the first rows after the scaled columns were written back.
df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Ecology_3,Ecology_2,Shops_2
0,725,0.174352,0.139393,-0.309633,-0.050293,-0.482113,0.092604,-0.010066,1.607652,-0.780698,-0.649967,-0.295384,-0.88348,-0.880402,1,1,1
1,15856,0.541447,0.139393,0.614072,0.075356,-0.482113,-1.220226,-0.010041,-0.361948,-1.065895,-0.977177,-0.211458,-0.88348,-0.464264,1,1,1
2,5480,3.202888,-1.090904,-1.42596,-0.277766,-1.245343,-1.38433,-0.01038,-0.998641,0.30305,0.545561,3.313409,2.464303,0.159943,1,1,1
3,15664,-0.078026,0.139393,0.794396,0.190495,2.570806,1.405433,-0.009892,-0.142708,-0.096225,-0.191973,-0.211458,1.12519,-0.256195,1,1,1
4,14275,-0.536895,-1.090904,-0.421902,0.079218,1.616769,0.584915,-0.009842,-0.392373,-1.294053,-1.178844,-0.295384,-0.88348,-0.880402,1,1,0


In [14]:
# Restore the model object stored in model.pkl.
# SECURITY: pickle.load can execute arbitrary code; only open files you trust.
with open('model.pkl', mode='rb') as model_file:
    model_load = pickle.load(model_file)

In [50]:
# Re-check dtypes before prediction.
# NOTE(review): the saved output of this cell already lists a Price column,
# although Price is only assigned in the prediction cell further down -- the
# output appears to come from an earlier run; re-run from a fresh kernel.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 18 columns):
Id             5000 non-null object
DistrictId     5000 non-null float64
Rooms          5000 non-null float64
Square         5000 non-null float64
LifeSquare     5000 non-null float64
Floor          5000 non-null float64
HouseFloor     5000 non-null float64
HouseYear      5000 non-null float64
Ecology_1      5000 non-null float64
Social_1       5000 non-null float64
Social_2       5000 non-null float64
Social_3       5000 non-null float64
Helthcare_2    5000 non-null float64
Shops_1        5000 non-null float64
Ecology_3      5000 non-null int64
Ecology_2      5000 non-null int64
Shops_2        5000 non-null int64
Price          5000 non-null float64
dtypes: float64(14), int64(3), object(1)
memory usage: 703.2+ KB


In [52]:
# Predict with the loaded model and store the result in a Price column.
# NOTE(review): feature_names includes 'Id', which was converted to str in an
# earlier cell -- confirm the model was trained on the same column set and
# that using the identifier as a feature is intended.
df['Price'] = model_load.predict(df.loc[:, feature_names])

In [54]:
# Write only the Id and Price columns to the predictions file, without the row index.
df.to_csv('SBolsh_predictions.csv', columns=['Id', 'Price'], index=False)