##### About the required outputs

In [1]:
import pickle
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from preprocessing_ml.scripts import make_training_dataframe

# Load the preprocessed training table; the 'Unnamed: 0' column becomes the index.
data = pd.read_csv('../input/preprocess_data/to_train.csv', index_col=['Unnamed: 0'])
data = data.drop(columns=['Tasa hipotecaria Trimestral'])

# Adjustments carried over from the model-selection work:
#   * every floor value above 10 is collapsed into the single value 11
#   * rows at or above the 99th percentile of 'Superficie ' are discarded
# Both steps select by index label, exactly as computed from the masks below.
high_floor_mask = data['Piso de ubicación'] > 10
to_replace_11 = data.loc[high_floor_mask].index
data.loc[to_replace_11, 'Piso de ubicación'] = 11

surface_p99 = data['Superficie '].quantile(.99)
to_drop = data.loc[data['Superficie '] >= surface_p99].index
data = data.drop(index=to_drop)

# One-hot encode the two categorical columns, dropping the first level of each.
data = pd.get_dummies(
    data,
    columns=['antiguedad_cat', 'piso_cat'],
    drop_first=True,
)

# Encoders saved earlier in the project; used later to turn district codes back
# into names. pickle.load can execute arbitrary code, so only load trusted files.
with open('../models/objects/dict_encoders.pickle', mode='rb') as encoders_file:
    dict_encoders = pickle.load(encoders_file)

### Inference

In [111]:
# Window size handed to make_training_dataframe. Per the original author's note
# it is chosen so that the last 5 periods are available for train and test.
w_s = 74
# Global filter: silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')
# Pass the constant defined above instead of repeating the literal, so that
# changing `w_s` actually changes the split.
train_data, test_data, testing_dates = make_training_dataframe(data, window_size=w_s)

# Ordinary least squares on every column except 'target'.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2; this call only works on older releases. Kept as-is to preserve the
# published results -- confirm the pinned scikit-learn version before upgrading.
lr = LinearRegression(fit_intercept=True, normalize=True)
lr.fit(train_data.drop('target',axis=1), train_data['target'])

predictions = lr.predict(test_data.drop('target',axis=1))

# Assemble the output table. Predictions and LAG_1 are passed through np.exp,
# so both are presumably stored on a log scale -- TODO confirm against
# make_training_dataframe. District codes are decoded back to names with the
# 'Distrito' encoder.
to_summit = pd.DataFrame({'District': dict_encoders['Distrito'].inverse_transform(test_data['Distrito'].astype(int)),
                          'Predicted_price_m2':np.exp(predictions),
                          'PCA_comp':test_data['PCA_comp'],
                          'Last_price_m2':np.exp(test_data['LAG_1'])})

In [120]:
# Extra table: mean predicted vs. last observed price per district, plus the
# percentage change between the two, shown in ascending order of that change.
print('For {}'.format(testing_dates))

extra_table = (
    to_summit
    .groupby('District')[['Predicted_price_m2', 'Last_price_m2']]
    .mean()
    .assign(
        pct_change=lambda prices: (
            (prices['Predicted_price_m2'] - prices['Last_price_m2'])
            .mul(100)
            .div(prices['Last_price_m2'])
        )
    )
)
extra_table.sort_values(by='pct_change')

For 2019-01-01


Unnamed: 0_level_0,Predicted_price_m2,Last_price_m2,pct_change
District,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
barranco,4994.235866,5363.235806,-6.880174
san isidro,5631.476701,5999.528048,-6.134672
miraflores,5576.229188,5738.893843,-2.834425
lince,4180.363838,4271.554832,-2.134843
pueblo libre,3763.211912,3818.904044,-1.458328
surco,4328.64881,4354.479117,-0.593189
la molina,3766.630618,3769.152675,-0.066913
magdalena,4139.923701,4114.178464,0.625769
jesús maría,4113.294865,4081.882565,0.769554
san borja,4699.471956,4558.224461,3.098739


##### Saving model and output data

In [129]:
import os

# Persist the fitted regression under ../models/; joblib.dump returns the list
# of file names it wrote.
joblib.dump(lr, os.path.join("../models/", "Time_series_lr.bin"))

['../models/Time_series_lr.bin']

In [131]:
# Write the forecast table to disk; the row index is left out of the file.
to_summit.to_csv(path_or_buf='../outputs/forecasting-2019-01.csv', index=False)

In [134]:
# Write the per-district summary to disk; the index is kept and becomes the
# first column of the file.
extra_table.to_csv(path_or_buf='../outputs/extra_table-2019-01.csv')