In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
warnings.simplefilter("ignore")
%matplotlib inline

sns.set(style="darkgrid")
    
from preprocess import Preprocessing
from sklearn.model_selection import train_test_split
df = pd.read_excel("Data.xlsx")

In [99]:
target_scaler = pickle.load(open('./utils/target_scaler.pkl','rb'))
feature_scaler = pickle.load(open('./utils/feature_scaler.pkl','rb'))
one_hot_enc = pickle.load(open('./utils/one_hot_enc.pkl','rb'))

year_median_floor = pickle.load(open('./utils/year_median_floor.pkl','rb'))
area_median_year = pickle.load(open('./utils/area_median_year.pkl','rb'))

In [212]:
almaty_areas = ['Ауэзовский', 'Бостандыкский', 'Алмалинский', 'Алатауский', 'Медеуский', 'Наурызбайский', 'Турксибский',
                'Жетысуский']

def streets_to_area(curr_area):
   
    if curr_area.item() not in almaty_areas:
        return 'district9'
    return curr_area.item()

def year_to_int(df):
    try:
        curr_year = int(df['year'])
    except:
        curr_year = area_median_year[df['area'].item()]
    return curr_year

def one_hot(df):
    res = one_hot_enc.transform(df['area'][:, None]).toarray()
    cols_name = [f'area{i}' for i in range(1, 9)]
    df_train_one_hot = pd.DataFrame(res, columns=cols_name, dtype=int)
    df = pd.concat([df, df_train_one_hot], axis=1)
    df.drop(columns=['area'], inplace=True)
    return df

def scale_data(df):
    numeric_features = ['rooms', 'sq_m', 'floor', 'floors_all', 'year']
    df[numeric_features] = feature_scaler.transform(df[numeric_features])
    


def preprocess(curr_dict):
    df = pd.DataFrame(curr_dict, index=[0])
    df['area'] = streets_to_area(df['area'])
    df['year'] = year_to_int(df)
    scale_data(df)
    df = one_hot(df)
    
    return df
    
    

In [213]:
df = pd.read_excel("Data.xlsx")
input_dict = df.iloc[0].to_dict()


del input_dict['urls']
del input_dict['price']
input_dict

{'rooms': 2,
 'sq_m': 76.0,
 'floor': 8.0,
 'floors_all': 19.0,
 'area': 'Алмалинский',
 'year': '2012'}

In [214]:
ddf = preprocess(input_dict)

In [216]:
ddf

Unnamed: 0,rooms,sq_m,floor,floors_all,year,area1,area2,area3,area4,area5,area6,area7,area8
0,-0.180533,0.353239,1.065378,2.655437,0.742261,0,1,0,0,0,0,0,0


In [215]:
input_dict.pop('area')

'Алмалинский'

In [119]:
dict_you_want = {key: input_dict[key] for key in input_dict.keys() if key != 'area'}

In [134]:
feature_scaler.transform(pd.DataFrame(dict_you_want, index=[0]))

array([[-0.18053339,  0.35323875,  1.06537755,  2.65543726,  0.74226084]])

In [123]:
dict_you_want['year'] = int(dict_you_want['year'])

In [135]:
pd.DataFrame(input_dict, index=[0])

Unnamed: 0,rooms,sq_m,floor,floors_all,area,year
0,2,76.0,8.0,19.0,Алмалинский,2012


In [117]:
for key in input_dict.keys():
    break

In [118]:
key

'rooms'

In [106]:
input_dict['year'] = ''

In [107]:
curr_dict = preprocess(input_dict)

In [109]:
year_median_floor

{'(1990 - 2010]': 9.0, '[ - 1990]': 5.0, '[2010 - )': 10.0}

In [100]:
area_median_year

{'district9': 2013,
 'Алатауский': 2015,
 'Алмалинский': 1984,
 'Ауэзовский': 1986,
 'Бостандыкский': 1991,
 'Жетысуский': 1990,
 'Медеуский': 2007,
 'Наурызбайский': 2019,
 'Турксибский': 1991}

In [97]:
df = pd.read_csv(f'./data/train.csv')

In [98]:
df

Unnamed: 0,rooms,sq_m,floor,floors_all,year,price,area1,area2,area3,area4,area5,area6,area7,area8
0,2,80.0,20.0,21.0,2007,65000000,0,0,0,1,0,0,0,0
1,2,64.0,4.0,6.0,2008,29000000,0,0,0,0,1,0,0,0
2,1,44.0,7.0,13.0,2015,25300000,1,0,0,0,0,0,0,0
3,1,32.0,2.0,4.0,1963,22500000,0,0,0,1,0,0,0,0
4,4,210.0,3.0,9.0,2008,130000000,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,1,43.0,7.0,9.0,2017,19500000,1,0,0,0,0,0,0,0
8553,2,66.4,8.0,16.0,2006,60000000,0,0,0,1,0,0,0,0
8554,2,65.0,6.0,9.0,2006,80000000,0,0,0,0,0,1,0,0
8555,2,52.0,1.0,1.0,1960,25000000,1,0,0,0,0,0,0,0


In [49]:
\

In [50]:
input_dict

{'rooms': 2,
 'sq_m': 76.0,
 'floor': 8.0,
 'floors_all': 19.0,
 'area': 'Алмалинский',
 'year': '2012'}

In [59]:
pd.DataFrame.from_dict(input_dict.items(), columns=['rooms', 'sq_m', 'floor', 'floors_all', 'area', 'year'])

ValueError: cannot use columns parameter with orient='columns'

In [53]:
input_dict.keys()

dict_keys(['rooms', 'sq_m', 'floor', 'floors_all', 'area', 'year'])

In [26]:
cols = [a for a in df.columns if a not in ['price', 'urls']]
df[cols]

Unnamed: 0,rooms,sq_m,floor,floors_all,area,year
0,2,76.0,8.0,19.0,Алмалинский,2012
1,2,40.0,2.0,2.0,Ташкентская28,1959
2,3,100.8,1.0,3.0,Медеуский,2018
3,2,70.0,14.0,17.0,Ауэзовский,2014
4,2,45.0,3.0,4.0,Бостандыкский,1969
...,...,...,...,...,...,...
11901,2,69.0,6.0,9.0,Алатауский,2011
11902,2,51.3,5.0,5.0,Ауэзовский,1995
11903,2,56.0,1.0,2.0,Алатауский,1992
11904,3,61.8,1.0,5.0,Бостандыкский,1969


In [45]:
one_hot_enc.transform(pd.Series(df['area'].iloc[0])[:, None]).toarray()

array([[0., 1., 0., 0., 0., 0., 0., 0.]])

In [41]:
type(df['area'])

pandas.core.series.Series

In [42]:
pd.Series(df['area'].iloc[0])

0    Алмалинский
dtype: object