# Dashboard Model

In [1]:
import pickle 
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split

### Load and Split Data

In [2]:
data = pd.read_csv('HADAR - encoded.csv')

In [3]:
features = data.drop(columns=['SalePrice','MSSubClass','GrLivArea'])
target = data.SalePrice

In [4]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=.2, random_state=0)

In [5]:
X_train.shape

(2064, 200)

### Feature Selection

In [6]:
clv = Lasso(random_state=0).fit(X_train, y_train)
search = GridSearchCV(Lasso(random_state=0, tol=.1), 
                      {'alpha': np.arange(0.01,1000,1)},
                     cv=5,
        scoring='neg_mean_squared_error'
                     )
search.fit(X_train, y_train)
search.best_params_

pickle.dump(search, open('flask_fsgs.sav', 'wb'))
search = pickle.load(open('flask_fsgs.sav', 'rb'))
search.best_params_

coefficients = search.best_estimator_.coef_
importance = np.abs(coefficients)
features = X_train.columns
keep_feat = np.array(features)[importance > 0]
X_train = X_train[keep_feat]
X_test = X_test[keep_feat]

df_imp = pd.DataFrame(zip(keep_feat,importance)).sort_values(1, ascending = False)
df_imp = df_imp.loc[df_imp[1]>0]
df_imp[:20]

Unnamed: 0,0,1
18,GarageCars,17741.066462
1,OverallQual,10702.687545
17,GarageYrBlt,8172.486518
49,Neighborhood_Mitchel,6854.990902
2,OverallCond,6387.882746
20,WoodDeckSF,5320.875147
62,Exterior1st_Plywood,4573.435082
22,EnclosedPorch,3785.780533
48,Neighborhood_Gilbert,3751.396328
34,BsmtFinType2_Enc,2918.671188


In [7]:
X = X_train[keep_feat]

In [8]:
las = Lasso(random_state=0)
las.fit(X,y_train)

Lasso(random_state=0)

In [9]:
check_pred = las.predict(X_test[keep_feat])
r2_score(y_test, check_pred)

0.9110554622952276

In [10]:
# save 10 feature set
dummies = ['Neighborhood',
           'BldgType', 
           'HouseStyle',
           'Exterior1st',
          'Exterior2nd',
          'MasVnrType',
          'Foundation',
          'Electrical',
          'GarageType',
          'SaleType',
          'SaleCondition']
# cols = [c for c in keep_feat if c[] != 'test']
# df.loc[:,~df.columns.str.contains('^test', case=False)] 

In [11]:
strings = ["one", "two", "three"]
substring = "wo"

strings_with_substring = [string for string in strings if substring in string]
strings_with_substring

['two']

In [17]:
######### EXCLUDE DUMMIFIED VARIABLES DUE TO TIME CRUNCH ############

# for loop fail
# keep_feat_new = []
# for value in keep_feat:
#     for item in dummies:
#         keep_feat_new = [value for value in keep_feat if item not in value]
# keep_feat.size
keep_feat_new = ['LotArea',
                 'OverallQual',
                 'YearBuilt',
                 'YearRemodAdd',
                 'MasVnrArea',
                 'BsmtFinSF1',
                 'BsmtFinSF2',
                 'BsmtUnfSF',
                 'TotalBsmtSF',
                 '1stFlrSF',
                 '2ndFlrSF',
                 'LowQualFinSF',
                 'BedroomAbvGr',
                 'TotRmsAbvGrd',
                 'Fireplaces',
                 'GarageYrBlt',
                 'GarageCars',
                 'GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MoSold','YrSold','ExterQual_Enc','BsmtExposure_Enc',
 'BsmtFinType1_Enc',
 'BsmtFinType2_Enc',
 'HeatingQC_Enc',
 'KitchenQual_Enc',
 'Functional_Enc',
 'FireplaceQu_Enc',
 'GarageFinish_Enc',
 'GarageQual_Enc',
 'PavedDrive_Enc',
 'PoolQC_Enc',
 'LandContour_HLS']

In [18]:
X2 = X[keep_feat_new]
X2

Unnamed: 0,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,BsmtFinType2_Enc,HeatingQC_Enc,KitchenQual_Enc,Functional_Enc,FireplaceQu_Enc,GarageFinish_Enc,GarageQual_Enc,PavedDrive_Enc,PoolQC_Enc,LandContour_HLS
2467,14559,5,1951,2000,70.0,650.0,180.0,178.0,1008.0,1363,...,3,5,1,1,1,1,1,2,0,0
220,11952,7,1977,1977,0.0,0.0,0.0,808.0,808.0,1161,...,1,1,1,0,4,2,1,2,0,0
1101,8546,4,2003,2004,0.0,0.0,0.0,1121.0,1121.0,1121,...,1,5,1,0,0,2,1,2,0,0
17,8248,3,1914,1950,0.0,41.0,0.0,823.0,864.0,864,...,1,1,1,0,0,0,0,0,0,0
58,10542,7,1993,1994,651.0,1173.0,0.0,138.0,1311.0,1325,...,1,5,4,0,1,2,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,9084,4,1940,1950,0.0,0.0,0.0,755.0,755.0,755,...,1,1,1,0,4,1,3,1,0,0
1731,6390,6,1954,1954,0.0,0.0,0.0,936.0,936.0,984,...,1,1,1,0,0,1,1,2,0,0
763,8944,5,1967,1967,0.0,0.0,0.0,1584.0,1584.0,1584,...,1,1,1,3,0,1,1,2,0,0
835,7506,5,1925,1950,0.0,0.0,0.0,747.0,747.0,747,...,1,1,1,0,0,1,3,0,0,0


In [19]:
### lasso round 2 with no dummy data
clv = Lasso(random_state=0).fit(X2, y_train)
search = GridSearchCV(Lasso(random_state=0, tol=.01), 
                      {'alpha': np.arange(0.01,1000,1)},
                     cv=5,
        scoring='neg_mean_squared_error'
                     )
search.fit(X2, y_train)
search.best_params_

pickle.dump(search, open('flask_fsgs2.sav', 'wb'))
search = pickle.load(open('flask_fsgs2.sav', 'rb'))
search.best_params_

coefficients = search.best_estimator_.coef_
importance = np.abs(coefficients)
features = X2.columns
keep_feat = np.array(features)[importance > 0]

df_imp = pd.DataFrame(zip(keep_feat,importance)).sort_values(1, ascending = False)
df_imp = df_imp.loc[df_imp[1]>1000]
df_imp

Unnamed: 0,0,1
37,PoolQC_Enc,20292.198437
38,LandContour_HLS,13914.346771
1,OverallQual,11935.167128
12,BedroomAbvGr,6213.31189
32,Functional_Enc,4918.9588
14,Fireplaces,3334.873689
31,KitchenQual_Enc,3027.979339
26,ExterQual_Enc,2681.583742
27,BsmtExposure_Enc,2321.289157
13,TotRmsAbvGrd,2264.436508


In [20]:
######## 16 feature test --> 76.754% best accuracy  #############

# X3_train = X_train[df_imp[0]]
# X3_test = X_test[df_imp[0]]

In [21]:
########## KEEP 47 FEATURES LEFT BY LASSO ############

X3_train = X_train[keep_feat]
X3_test = X_test[keep_feat]

### Train Model

In [22]:
las2 = Lasso(random_state=0)
las2.fit(X3_train, y_train)

Lasso(random_state=0)

In [23]:
y_pred = las2.predict(X3_test)
r2_score(y_test, y_pred)

0.8898291126292244

In [24]:
######## SCALER ##########
# mms = MinMaxScaler()
# mms.fit(X3_train)
# X3_train = mms.transform(X3_train)
# pickle.dump(mms, open('MinMaxScaler.pkl', 'wb'))

In [25]:
# pipe = make_pipeline(
#             MinMaxScaler(),
#             Lasso()
# )
# pipe.fit(X3_train,y_train)
# print('pipe score',pipe.score(X3_test,y_test))     

In [26]:
######### GRID SEARCH CV ##########

# Grid Search Cross Validation
# params = {'alpha': np.arange(-1000,1000,10),
#           'selection': ['cyclic','random']}
# clf = las.fit(X3_train, y_train)
# gs = GridSearchCV(clf, param_grid=params, cv=3)
# gs.fit(X3_train,y_train)
# print('best CV score:', gs.best_score_)
# print('best hyperparameter combination:', gs.best_params_)

In [27]:
# y_pred = gs.best_estimator_.predict(X3_test)
# r2_score(y_test, y_pred)

### Pickle for Flask

In [28]:
# no grid search / no scaler lasso performed the best 
# --> not enough time to keep experimenting
model = las2
pickle.dump(model, open('dash_model.pkl','wb'))

### Dashboard Pipeline
- ~~dummify~~
- ~~scale~~
- predict

In [29]:
model = pickle.load(open('dash_model.pkl','rb'))
sample_input = X3_test.sample(1, random_state=1)
prediction = model.predict(sample_input)

In [None]:
pd.DataFrame(sample_input).to_csv('sample input.csv')

In [86]:
y_test[634]

107400

In [84]:
data.sample(1, random_state=2)

Unnamed: 0,GrLivArea,SalePrice,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_VWD,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1475,1869,124900,50,8820,5,6,1890,1996,0.0,1088.0,...,0,0,0,0,1,0,0,0,1,0


In [31]:
prediction

array([105171.329933])

In [81]:
# check if works with array vs df for app
sample_array = sample_input.to_numpy()
model.predict(sample_array)

array([105171.329933])

In [32]:
# calculate mean absolute error to provide prediction range in app 
y_pred = model.predict(X3_test)
mae = mean_absolute_error(y_test, y_pred)
mae

18118.90217577298

#### References for creating app

In [92]:
data['GarageFinish_Enc'].max()

3

In [89]:
sample_input.columns

Index(['LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'BedroomAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MoSold', 'YrSold', 'ExterQual_Enc', 'BsmtExposure_Enc',
       'BsmtFinType1_Enc', 'BsmtFinType2_Enc', 'HeatingQC_Enc',
       'KitchenQual_Enc', 'Functional_Enc', 'FireplaceQu_Enc',
       'GarageFinish_Enc', 'GarageQual_Enc', 'PavedDrive_Enc', 'PoolQC_Enc',
       'LandContour_HLS'],
      dtype='object')

In [98]:
sample_input.columns

Index(['LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'BedroomAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MoSold', 'YrSold', 'ExterQual_Enc', 'BsmtExposure_Enc',
       'BsmtFinType1_Enc', 'BsmtFinType2_Enc', 'HeatingQC_Enc',
       'KitchenQual_Enc', 'Functional_Enc', 'FireplaceQu_Enc',
       'GarageFinish_Enc', 'GarageQual_Enc', 'PavedDrive_Enc', 'PoolQC_Enc',
       'LandContour_HLS'],
      dtype='object')

In [None]:
#### for translating dummified data input.... not enough time

# sample_raw = {}

# index_dict = dict(zip(df.columns,range(df.shape[1])))

# new_vector = np.zeroes(297)
# try:
#     new_vector[index_dict[origin]] = 1
# except:
#     pass
# try:
#     new_vector[index_dict[destination]] = 1
# except:
#     pass
# try:
#     new_vector[index_dict[carrier]] = 1
# except:
#     pass