In [23]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns
from scipy import stats
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures


pd.options.display.max_columns = 999  # Allos us to display all columns
pd.options.display.max_rows = 999  # Allos us to display all columns

df_train = pd.read_csv('./datasets/cleaned_train4.csv',keep_default_na=False)
df_test = pd.read_csv('./datasets/cleaned_test4.csv',keep_default_na=False)

df_train.columns = df_train.columns.str.lower() # column names to lower case
df_test.columns = df_test.columns.str.lower() # column names to lower case

In [24]:
df_train['base']=df_train['saleprice'].mean()

In [25]:
df_train['base']=df_train['saleprice'].mean()
MRSE = metrics.mean_squared_error(y_true=df_train['saleprice'], y_pred=df_train['base'] )**.5
print(f'Mean Root Square Error w/ all cols: {round(MRSE,2)}.')

Mean Root Square Error w/ all cols: 79239.34.


In [55]:
features = [
 'overall qual',
 'exter qual',
 'gr liv area',
 'kitchen qual',
 'garage area',
 'garage cars',
 'total bsmt sf',
 '1st flr sf',
 'bsmt qual',
 'year built',
 'garage finish',
 'year remod/add',
 'fireplace qu',
 'full bath',
 'foundation_pconc',
 'totrms abvgrd',
 'mas vnr area',
 'fireplaces',
 'heating qc',
 'neighborhood_nridght',
 'bsmt exposure',
 'bsmtfin sf 1',
 'exteriors',
 'exterior 1st',
 'mas vnr type_0',
 'exterior 2nd',
 'garage type_1',
 'sale type_new',
 'garage type_5',
 'foundation_cblock',
 'bsmtfin type 1',
 'open porch sf',
 'wood deck sf',
 'lot frontage',
 'mas vnr type_1',
 'lot area',
 'lot shape',
 'paved drive',
 'garage qual',
 'bsmt full bath',
 'half bath',
 'ms zoning_rm',
 'central air',
 'roof style_hip',
 'garage cond',
 'neighborhood_noridge',
 'garage yr blt',
 'mas vnr type_3',
 'neighborhood_stonebr',
 'pid',
 'roof style_gable',
 '2nd flr sf',
 'electrical',
 'ms zoning_rl',
 'garage type_0',
 'foundation_brktil',
 'bsmt cond',
 'garage type_3',
 'conditions',
 'sale type_wd ',
 'land contour_hls',
 'neighborhood_oldtown',
 'condition 1',
 'house style_2story',
 'house style_1.5fin',
 'bsmt unf sf',
 'neighborhood_names',
 'neighborhood_idotrr',
 'neighborhood_edwards',
 'lot config_culdsac',
 'alley_2',
 'fence',
 'neighborhood_somerst',
 'bedroom abvgr',
 'enclosed porch',
 'alley_0',
 'neighborhood_brkside',
 'screen porch',
 'neighborhood_sawyer',
 'condition 2',
 'ms zoning_c (all)',
 'kitchen abvgr',
 'foundation_slab',
 'neighborhood_timber',
 'functional',
 'bldg type_twnhs',
 'neighborhood_meadowv',
 'bldg type_2fmcon',
 'sale type_cod',
 'ms zoning_fv',
 'bldg type_1fam',
 'bldg type_duplex',
 'roof matl_wdshngl',
 'land contour_bnk',
 'overall cond',
 'neighborhood_brdale',
 'heating_gasa',
 'land contour_lvl',
 'ms subclass',
 'neighborhood_veenker',
 'neighborhood_collgcr',
 'lot config_inside',
 'neighborhood_swisu',
 'heating_grav',
 'garage type_2',
 ## 'heating_wall',
 'street',
 'roof matl_compshg',
 'house style_1.5unf']
 # 'house style_sfoyer']
 # 'bldg type_twnhse',
 # 'sale type_conld']
 # 'neighborhood_crawfor',
 # 'land slope',
 # 'garage type_4',
##  'ms zoning_a (agr)',
#  'neighborhood_clearcr',
#  'id',
#  '3ssn porch',
#  'misc feature_0',
#  'misc feature_2',
#  'neighborhood_npkvill',
#  'roof matl_wdshake',
#  'bsmt half bath',
#  'house style_2.5fin',
#  'house style_slvl',
#  'roof style_gambrel',
#  'low qual fin sf',
# ##  'neighborhood_grnhill',
#  'heating_gasw',
#  'mas vnr type_4',
#  'exter cond',
#  'neighborhood_mitchel',
#  'sale type_conlw',
#  'neighborhood_nwames',
#  'ms zoning_i (all)',
#  'ms zoning_rh',
#  'sale type_con',
#  'land contour_low',
#  'mo sold',
#  'sale type_oth',
#  'garage type_6',
#  'pool qc',
#  'utilities',
#  'neighborhood_blueste',
#  'neighborhood_blmngtn',
#  'neighborhood_gilbert',
#  'foundation_stone',
#  'pool area',
# ##  'heating_othw',
#  'house style_1story',
#  'roof style_mansard',
#  'sale type_conli',
#  'misc feature_4',
# ##  'misc feature_1',
# ##  'roof matl_membran',
#  'neighborhood_sawyerw',
#  'bsmtfin sf 2',
#  'yr sold',
#  'lot config_fr2',
#  'alley_1',
#  'lot config_fr3',
#  'bsmtfin type 2',
#  'roof style_shed',
# ##  'neighborhood_landmrk',
#  'house style_2.5unf',
#  'roof style_flat',
#  'misc feature_3',
#  'misc val',
#  'roof matl_tar&grv',
# ##  'roof matl_clytile',
#  'lot config_corner',
#  'sale type_cwd',
#  'neighborhood_greens',
#  'foundation_wood']
# ##  'misc feature_5']


X = df_train[features]
y=df_train['saleprice']



In [56]:

model = LinearRegression()
model.fit(X,y)
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42) 

print(f'LR training R^2 score: {round(model.score(X_train, y_train),4)}')
print(f'LR test R^2 score: {round(model.score(X_test, y_test),4)}')
print(f'LR CrossVal score: {round(cross_val_score(model, X_train, y_train).mean(),8)}')

MRSE = metrics.mean_squared_error(y_true=df_train['saleprice'], y_pred=model.predict(df_train[features] ))**.5
print(f'Mean Root Square Error w/ all cols: {round(MRSE,2)}.')




pred = model.predict(df_test[features])   # make predictions
df_test['saleprice'] = pred               # add predictions to the test data

LR training R^2 score: 0.8846
LR test R^2 score: 0.9111
LR CrossVal score: 0.84017379
Mean Root Square Error w/ all cols: 26150.7.


In [57]:
# LR training R^2 score: 0.8853
# LR test R^2 score: 0.9121
# LR CrossVal score: 0.8298518
# Mean Root Square Error w/ all cols: 26059.57.

In [58]:
### PREDICTIONS BEFORE LOG

# pred = model.predict(df_test[features])   # make predictions
# df_test['saleprice'] = pred               # add predictions to the test data

In [59]:
## to do the log transformation....

y_train_log = y_train.map(np.log)
y_test_log = y_test.map(np.log)

cross_val_score(model, X_train, y_train_log).mean()
model.fit(X_train, y_train_log)

print(f'Log train score {model.score(X_train, y_train_log)}')
print(f'Log test score {model.score(X_test, y_test_log)}')

MRSE = metrics.mean_squared_error(y_true=df_train['saleprice'], y_pred=model.predict(df_train[features] ))**.5
print(f'Mean Root Square Error w/ all cols: {round(MRSE,2)}.')


pred_test = model.predict(df_test[features])
df_test['saleprice'] = np.exp(pred_test)


Log train score 0.9094909629215004
Log test score 0.8828179456155364
Mean Root Square Error w/ all cols: 198004.3.


In [44]:
# Log train score 0.9128293650786863
# Log test score 0.8745333564488338
# Mean Root Square Error w/ all cols: 198004.3.

In [45]:
### PREDICTIONS AFTER LOG


pred_test = model.predict(df_test[features])
df_test['saleprice'] = np.exp(pred_test)


In [46]:
# ss = StandardScaler()
# ss.fit(X)
# X_scaled = ss.transform(X)
# #X_scaled[:1, :]
# Y_scaled = ss.transform(y)
# #Y_scaled[:1, :]

In [47]:
# #X = df_train[features]

# poly = PolynomialFeatures(include_bias=False)
# X_poly = poly.fit_transform(X_train)
# pd.DataFrame(X_poly, columns=poly.get_feature_names(features))

# cross_val_score(model, X_poly, y_train_log, cv=5).mean()
# #model.fit(X_train, y_train_log)

In [48]:
submission = df_test[['id','saleprice']]

In [49]:
submission.set_index('id', inplace = True)

In [50]:
submission.to_csv('./datasets/submissions/submission22.csv')