This notebook will open preprocessed data got from **_Data_Preprocessing**, split it on training and validation set, save into pickle file.
After use it to train the models

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import adam
from keras.models import Sequential
from keras import initializers
from keras import optimizers

from sklearn.neural_network import MLPRegressor
from keras.callbacks import ReduceLROnPlateau


Using TensorFlow backend.


In [2]:
dbfile = open('pd_x_pd_y_pdCodebook', 'rb') 
Data = pickle.load(dbfile)                      
dbfile.close()
pd_x = Data['pd_x']
pd_y = Data['pd_y']
pdCodebook = Data['pdCodebook']
x = pd_x.to_numpy()
y = pd_y.to_numpy()
Feature_names=pd_x.columns.to_list()
Label_names=pd_y.columns.to_list()

#Standartization and Splitting
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
y = np.nan_to_num(y) #replaces NaN to 0, which is not ideal
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

Training_set = {'X_train':X_train, 'y_train':y_train, 'Feature_names':Feature_names, 'Label_names':Label_names}
Test_set = {'X_test':X_test, 'y_test':y_test, 'Feature_names':Feature_names, 'Label_names':Label_names}

m_train, n = X_train.shape
m_test,n = X_test.shape



In [3]:
#Save training and validation sets separately
dbfile = open('Training_set', 'wb') 
pickle.dump(Training_set, dbfile)                      
dbfile.close() 
dbfile = open('Test_set', 'wb') 
pickle.dump(Test_set, dbfile)                      
dbfile.close() 

In [4]:
y_test.shape

(9694, 108)

It's important to know, what parameters are most important. Rough estimation of parameter importance is possible to get from coefficients (weights) of Lenear Gerression models. Function **Model_Train_n_Save** trains two models - **Lenear Regression and Multi-layer Perceptron** and saves MLP model into Models folder for further use in GUI. It returns Dataframe **Report** with:
- Label Name
- LR_mae_% - mean absolute error in percents of Lenear Regression, based on validation
- MLP_mae_% - mean absolute error in percents of MLP, based on validation
- Coefs_in_Order - Coeficients of Lenear regression for estimation of parameter importance, in descending order. They can be positive and negative, so moset importan ones are on the beginning and the end, and less important are in the middle. Note: this can be relevant only for models with accepteble validation error.

In [5]:
#function plots and saves Loss curve
def plot_loss(losses, Label_name):
    #make folder for Lass curves
    if (os.path.isdir('Loss_curves')==0):
        os.mkdir('Loss_curves')
    path = os.path.join('Loss_curves',Label_name+'.png')
    print (path)
    epochs = range(len(losses))
    
    plt.plot(epochs, losses, 'b', label='Loss')
    plt.legend(loc=0)
    plt.grid()
    plt.savefig(path)

In [7]:
#training the Linear Regression and MLP model and saving model
def Model_Train_n_Save (X_train, y_train, X_test, y_test, Feature_names, Label_name):
    
    Report = pd.DataFrame(columns = [ 'Label_name','LR_mae_%','MLP_mae_%','Coefs_in_Order'])
   
    ## Linear Regression ##
    model = LinearRegression() #that's a model
    model.fit(X_train, y_train) #training the model

    
    ## Validation for LR ##   
    Report.loc[0, 'LR_mae_%'] = mean_absolute_error(y_test, model.predict(X_test))/ np.mean(y_test) *100
    pdCoefs = pd.DataFrame(data = model.coef_.T, index=Feature_names).sort_values(by=0, ascending=False)
    Report.loc[0, 'Coefs_in_Order'] = pdCoefs.index.to_list()
  
   
    ## Multi-layer Perceptron regressor ##  
    model = MLPRegressor(hidden_layer_sizes=(200,120,80,50), max_iter=400) #that's a model
    model.fit(X_train, y_train) #training the model
    plot_loss(model.loss_curve_, Label_to_predict) #plotting loss curve
    
    Report.loc[0, 'MLP_mae_%'] = mean_absolute_error(y_test, model.predict(X_test)) / np.mean(y_test) *100
    
    Report.loc[0, 'Label_name'] = Label_name
    
    pic_file = open('Models/'+Label_name+'_MLP.pickle', 'wb')
    pickle.dump(model, pic_file)
    pic_file.close()
    
    return Report
    

In [None]:
#For one Label
Label_to_predict = 'TOTALBTUPL'
y_indx = Label_names.index(Label_to_predict)
Model_Report = Model_Train_n_Save (X_train, y_train[:,y_indx].reshape(m_train,1)
                                  , X_test, y_test[:,y_indx].reshape(m_test,1)
                                  , Feature_names, Label_names[y_indx])
Model_Report

  y = column_or_1d(y, warn=True)


In [24]:
########################STOP################################


This code runs model fitting for all possible labels, so it takes long time to run. Don't run it till you are sure that's what you want.

In [13]:
Report = pd.DataFrame(columns = [ 'Label_name','LR_mae_%','MLP_mae_%','Coefs_in_Order'])
y_indx=0
for y_indx in range(len(Label_names)):
    elem = Label_names[y_indx]    
    Model_Report = Model_Train_n_Save (X_train, y_train[:,y_indx].reshape(m_train,1)
                                , X_test, y_test[:,y_indx].reshape(m_test,1)
                                , Feature_names, Label_names[y_indx])           
    Report = pd.concat([Report,Model_Report], ignore_index=True)
Report    

  
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Label_name,LR_mae_%,MLP_mae_%,Coefs_in_Order
0,GALLONLPCDR,,inf,"[FOOTHER, ATTCCOOL, PROTHERM, NUMBERAC, GARGCO..."
1,CUFEETNG,29.8968,10.229,"[WDOTHER, NUMADULT, NUMCHILD, ENERGYASST, CELL..."
2,CUFEETNGCDR,80.7444,29.889,"[DRYER, USELP, LPWATER, NOHEATBULK, USEWOOD, W..."
3,CUFEETNGCOK,46.9661,10.9881,"[USEWOOD, WDWARM, NUMCHILD, NUMADULT, UGCOOK, ..."
4,CUFEETNGSPH,39.7829,4.92215,"[WDOTHER, ENERGYASST, NUMADULT, NUMCHILD, FOPA..."
...,...,...,...,...
103,TOTALDOLHTB,85.4676,12.7079,"[SWIMPOOL, H2OHEATAPT, UGWATER, MONTUB, USEFO,..."
104,TOTALDOLNEC,76.2838,13.269,"[WDOTHER, SWIMPOOL, SOLWATER, NOACEL, H2OHEATA..."
105,TOTALDOLPL,67.343,80.2515,"[POOL, MONPOOL, HEATHOME, FOOTHER, SOLAR, NOAC..."
106,TOTALDOLSPH,37.9682,2.31372,"[USEFO, WDOTHER, ENERGYASST, NUMADULT, NUMCHIL..."


In [14]:
#Save result to pickle
dbfile = open('Report', 'wb') 
pickle.dump(Report, dbfile)                      
dbfile.close() 

In [102]:
#We need just good predictions, where 'MLP_mae_%'<30%
Report = Report.replace([np.inf, -np.inf], np.nan)
Report = Report.dropna()
Rep_shortened = Report[Report['MLP_mae_%']<30].copy()
Good_predict = Rep_shortened['Label_name'].to_list() #list of good predictions

Need to create new **codebook_for_Labels.xlsx** for GUI, so GUI won't run bad predictions, only good ones.

In [103]:
pdCdbkLabels = pd.read_excel('codebook_for_Labels_all.xlsx', delimiter=',', header=0)

pdNewLabelCodebook = pd.DataFrame(np.zeros((len(Good_predict), 4)), columns = ['SAS Variable Name', 'Variable Description', 'Final Response Set', 'Error'])
for ind in range(len(Good_predict)):
    pdNewLabelCodebook.loc[ind,'SAS Variable Name']=Good_predict[ind]
    ind_pdCdbkLabels = pdCdbkLabels[pdCdbkLabels['SAS Variable Name']==Good_predict[ind]].index.values[0]
    pdNewLabelCodebook.loc[ind,'Variable Description']=pdCdbkLabels.loc[ind_pdCdbkLabels, 'Variable Description']
    pdNewLabelCodebook.loc[ind,'Final Response Set']=pdCdbkLabels.loc[ind_pdCdbkLabels, 'Final Response Set']
    ind_Report = Rep_shortened[Rep_shortened['Label_name']==Good_predict[ind]].index.values[0]
    pdNewLabelCodebook.loc[ind,'Error']= Rep_shortened.loc[ind_Report, 'MLP_mae_%']

In [105]:
pdNewLabelCodebook.to_excel('codebook_for_Labels.xlsx')