In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression

In [3]:
df = pd.read_csv("../datasets/train_1st_clean.csv")
df.shape

(2051, 82)

In [4]:
kaggle_data = pd.read_csv('../datasets/test.csv')
kaggle_data.shape

(878, 80)

In [5]:
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [6]:
kaggle_data.columns = [col.replace(" ", "_").lower() for col in kaggle_data.columns]
kaggle_data.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [9]:
from sklearn.preprocessing import PolynomialFeatures

In [10]:
new_features = ['year_built', 'total_bsmt_sf', 'garage_cars', 'garage_area', 'gr_liv_area', 'overall_qual']

In [11]:
X = df[new_features]
y = df['saleprice']

In [12]:
poly = PolynomialFeatures(include_bias=False)
poly.fit(X)

PolynomialFeatures(include_bias=False)

In [13]:
X_poly = poly.transform(X)
X_poly

array([[1.976000e+03, 7.250000e+02, 2.000000e+00, ..., 2.187441e+06,
        8.874000e+03, 3.600000e+01],
       [1.996000e+03, 9.130000e+02, 2.000000e+00, ..., 4.502884e+06,
        1.485400e+04, 4.900000e+01],
       [1.953000e+03, 1.057000e+03, 1.000000e+00, ..., 1.117249e+06,
        5.285000e+03, 2.500000e+01],
       ...,
       [1.928000e+03, 8.960000e+02, 2.000000e+00, ..., 3.659569e+06,
        1.147800e+04, 3.600000e+01],
       [1.956000e+03, 1.200000e+03, 1.000000e+00, ..., 1.440000e+06,
        4.800000e+03, 1.600000e+01],
       [1.999000e+03, 9.940000e+02, 2.000000e+00, ..., 3.254416e+06,
        1.262800e+04, 4.900000e+01]])

In [15]:
poly.get_feature_names(new_features)

['year_built',
 'total_bsmt_sf',
 'garage_cars',
 'garage_area',
 'gr_liv_area',
 'overall_qual',
 'year_built^2',
 'year_built total_bsmt_sf',
 'year_built garage_cars',
 'year_built garage_area',
 'year_built gr_liv_area',
 'year_built overall_qual',
 'total_bsmt_sf^2',
 'total_bsmt_sf garage_cars',
 'total_bsmt_sf garage_area',
 'total_bsmt_sf gr_liv_area',
 'total_bsmt_sf overall_qual',
 'garage_cars^2',
 'garage_cars garage_area',
 'garage_cars gr_liv_area',
 'garage_cars overall_qual',
 'garage_area^2',
 'garage_area gr_liv_area',
 'garage_area overall_qual',
 'gr_liv_area^2',
 'gr_liv_area overall_qual',
 'overall_qual^2']

In [20]:
df_shortened_poly = pd.DataFrame(X_poly, columns=poly.get_feature_names(new_features))

### Need to add the id column and put first so can merge with dummy df later

In [21]:
df_shortened_poly['id'] = df['id']
cols = df_shortened_poly.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_shortened_poly = df_shortened_poly[cols].copy() #  .copy() gets rid of the data error problem when making the saleprice column
df_shortened_poly.head()

Unnamed: 0,id,year_built,total_bsmt_sf,garage_cars,garage_area,gr_liv_area,overall_qual,year_built^2,year_built total_bsmt_sf,year_built garage_cars,...,garage_cars^2,garage_cars garage_area,garage_cars gr_liv_area,garage_cars overall_qual,garage_area^2,garage_area gr_liv_area,garage_area overall_qual,gr_liv_area^2,gr_liv_area overall_qual,overall_qual^2
0,109,1976.0,725.0,2.0,475.0,1479.0,6.0,3904576.0,1432600.0,3952.0,...,4.0,950.0,2958.0,12.0,225625.0,702525.0,2850.0,2187441.0,8874.0,36.0
1,544,1996.0,913.0,2.0,559.0,2122.0,7.0,3984016.0,1822348.0,3992.0,...,4.0,1118.0,4244.0,14.0,312481.0,1186198.0,3913.0,4502884.0,14854.0,49.0
2,153,1953.0,1057.0,1.0,246.0,1057.0,5.0,3814209.0,2064321.0,1953.0,...,1.0,246.0,1057.0,5.0,60516.0,260022.0,1230.0,1117249.0,5285.0,25.0
3,318,2006.0,384.0,2.0,400.0,1444.0,5.0,4024036.0,770304.0,4012.0,...,4.0,800.0,2888.0,10.0,160000.0,577600.0,2000.0,2085136.0,7220.0,25.0
4,255,1900.0,676.0,2.0,484.0,1445.0,6.0,3610000.0,1284400.0,3800.0,...,4.0,968.0,2890.0,12.0,234256.0,699380.0,2904.0,2088025.0,8670.0,36.0


# Poly finished, need to get dummy columns now

In [22]:
df_dummies = pd.get_dummies(df, columns=['house_style', 'lot_shape', 'ms_zoning'])

In [24]:
new_df = pd.merge(df_dummies, df_shortened_poly, on='id')

In [28]:
new_df.shape

(2051, 124)

In [27]:
new_df.columns.value_counts().sum() # no duplicate columns only unique

124

In [34]:
pd.set_option('display.max_columns', 124)

In [35]:
new_df.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,overall_qual_x,overall_cond,year_built_x,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf_x,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area_x,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars_x,garage_area_x,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice,house_style_1.5Fin,house_style_1.5Unf,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,lot_shape_IR1,lot_shape_IR2,lot_shape_IR3,lot_shape_Reg,ms_zoning_A (agr),ms_zoning_C (all),ms_zoning_FV,ms_zoning_I (all),ms_zoning_RH,ms_zoning_RL,ms_zoning_RM,year_built_y,total_bsmt_sf_y,garage_cars_y,garage_area_y,gr_liv_area_y,overall_qual_y,year_built^2,year_built total_bsmt_sf,year_built garage_cars,year_built garage_area,year_built gr_liv_area,year_built overall_qual,total_bsmt_sf^2,total_bsmt_sf garage_cars,total_bsmt_sf garage_area,total_bsmt_sf gr_liv_area,total_bsmt_sf overall_qual,garage_cars^2,garage_cars garage_area,garage_cars gr_liv_area,garage_cars overall_qual,garage_area^2,garage_area gr_liv_area,garage_area overall_qual,gr_liv_area^2,gr_liv_area overall_qual,overall_qual^2
0,109,533352170,60,,13517,Pave,,Lvl,AllPub,CulDSac,Gtl,Sawyer,RRAe,Norm,1Fam,6,8,1976,2005,Gable,CompShg,HdBoard,Plywood,BrkFace,289.0,Gd,TA,CBlock,TA,TA,No,GLQ,533.0,Unf,0.0,192.0,725.0,GasA,Ex,Y,SBrkr,725,754,0,1479,0.0,0.0,2,1,3,1,Gd,6,Typ,0,,Attchd,1976.0,RFn,2.0,475.0,TA,TA,Y,0,44,0,0,0,0,,,,0,3,2010,WD,130500,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1976.0,725.0,2.0,475.0,1479.0,6.0,3904576.0,1432600.0,3952.0,938600.0,2922504.0,11856.0,525625.0,1450.0,344375.0,1072275.0,4350.0,4.0,950.0,2958.0,12.0,225625.0,702525.0,2850.0,2187441.0,8874.0,36.0
1,544,531379050,60,43.0,11492,Pave,,Lvl,AllPub,CulDSac,Gtl,SawyerW,Norm,Norm,1Fam,7,5,1996,1997,Gable,CompShg,VinylSd,VinylSd,BrkFace,132.0,Gd,TA,PConc,Gd,TA,No,GLQ,637.0,Unf,0.0,276.0,913.0,GasA,Ex,Y,SBrkr,913,1209,0,2122,1.0,0.0,2,1,4,1,Gd,8,Typ,1,TA,Attchd,1997.0,RFn,2.0,559.0,TA,TA,Y,0,74,0,0,0,0,,,,0,4,2009,WD,220000,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1996.0,913.0,2.0,559.0,2122.0,7.0,3984016.0,1822348.0,3992.0,1115764.0,4235512.0,13972.0,833569.0,1826.0,510367.0,1937386.0,6391.0,4.0,1118.0,4244.0,14.0,312481.0,1186198.0,3913.0,4502884.0,14854.0,49.0
2,153,535304180,20,68.0,7922,Pave,,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,5,7,1953,2007,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,Gd,CBlock,TA,TA,No,GLQ,731.0,Unf,0.0,326.0,1057.0,GasA,TA,Y,SBrkr,1057,0,0,1057,1.0,0.0,1,0,3,1,Gd,5,Typ,0,,Detchd,1953.0,Unf,1.0,246.0,TA,TA,Y,0,52,0,0,0,0,,,,0,1,2010,WD,109000,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1953.0,1057.0,1.0,246.0,1057.0,5.0,3814209.0,2064321.0,1953.0,480438.0,2064321.0,9765.0,1117249.0,1057.0,260022.0,1117249.0,5285.0,1.0,246.0,1057.0,5.0,60516.0,260022.0,1230.0,1117249.0,5285.0,25.0
3,318,916386060,60,73.0,9802,Pave,,Lvl,AllPub,Inside,Gtl,Timber,Norm,Norm,1Fam,5,5,2006,2007,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0.0,Unf,0.0,384.0,384.0,GasA,Gd,Y,SBrkr,744,700,0,1444,0.0,0.0,2,1,3,1,TA,7,Typ,0,,BuiltIn,2007.0,Fin,2.0,400.0,TA,TA,Y,100,0,0,0,0,0,,,,0,4,2010,WD,174000,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,2006.0,384.0,2.0,400.0,1444.0,5.0,4024036.0,770304.0,4012.0,802400.0,2896664.0,10030.0,147456.0,768.0,153600.0,554496.0,1920.0,4.0,800.0,2888.0,10.0,160000.0,577600.0,2000.0,2085136.0,7220.0,25.0
4,255,906425045,50,82.0,14235,Pave,,Lvl,AllPub,Inside,Gtl,SawyerW,Norm,Norm,1Fam,6,8,1900,1993,Gable,CompShg,Wd Sdng,Plywood,,0.0,TA,TA,PConc,Fa,Gd,No,Unf,0.0,Unf,0.0,676.0,676.0,GasA,TA,Y,SBrkr,831,614,0,1445,0.0,0.0,2,0,3,1,TA,6,Typ,0,,Detchd,1957.0,Unf,2.0,484.0,TA,TA,N,0,59,0,0,0,0,,,,0,3,2010,WD,138500,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1900.0,676.0,2.0,484.0,1445.0,6.0,3610000.0,1284400.0,3800.0,919600.0,2745500.0,11400.0,456976.0,1352.0,327184.0,976820.0,4056.0,4.0,968.0,2890.0,12.0,234256.0,699380.0,2904.0,2088025.0,8670.0,36.0


# Initial Lin Reg on New_DF

In [29]:
lr = LinearRegression()

In [None]:
new_df.get_feature_names(features)

In [38]:
combined_features = ['year_built_y', # copy and pasted so format looks weird, taking out "['overall_qual^2house_style_1.5Fin'] not in index" error
 'total_bsmt_sf_y',
 'garage_cars_y',
 'garage_area_y',
 'gr_liv_area_y',
 'overall_qual_y',
 'year_built^2',
 'year_built total_bsmt_sf',
 'year_built garage_cars',
 'year_built garage_area',
 'year_built gr_liv_area',
 'year_built overall_qual',
 'total_bsmt_sf^2',
 'total_bsmt_sf garage_cars',
 'total_bsmt_sf garage_area',
 'total_bsmt_sf gr_liv_area',
 'total_bsmt_sf overall_qual',
 'garage_cars^2',
 'garage_cars garage_area',
 'garage_cars gr_liv_area',
 'garage_cars overall_qual',
 'garage_area^2',
 'garage_area gr_liv_area',
 'garage_area overall_qual',
 'gr_liv_area^2',
 'gr_liv_area overall_qual',
 #'overall_qual^2''house_style_1.5Fin', 
                     'house_style_1.5Unf', 'house_style_1Story', 'house_style_2.5Fin', 
                     'house_style_2.5Unf', 'house_style_2Story', 'house_style_SFoyer', 
                     'house_style_SLvl', 'lot_shape_IR1', 'lot_shape_IR2', 'lot_shape_IR3', 
                     'lot_shape_Reg', 'ms_zoning_C (all)', 'ms_zoning_FV', 'ms_zoning_I (all)', 
                     'ms_zoning_RH', 'ms_zoning_RL', 'ms_zoning_RM']

In [39]:
new_X = new_df[combined_features]
new_X

Unnamed: 0,year_built_y,total_bsmt_sf_y,garage_cars_y,garage_area_y,gr_liv_area_y,overall_qual_y,year_built^2,year_built total_bsmt_sf,year_built garage_cars,year_built garage_area,year_built gr_liv_area,year_built overall_qual,total_bsmt_sf^2,total_bsmt_sf garage_cars,total_bsmt_sf garage_area,total_bsmt_sf gr_liv_area,total_bsmt_sf overall_qual,garage_cars^2,garage_cars garage_area,garage_cars gr_liv_area,garage_cars overall_qual,garage_area^2,garage_area gr_liv_area,garage_area overall_qual,gr_liv_area^2,gr_liv_area overall_qual,house_style_1.5Unf,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,lot_shape_IR1,lot_shape_IR2,lot_shape_IR3,lot_shape_Reg,ms_zoning_C (all),ms_zoning_FV,ms_zoning_I (all),ms_zoning_RH,ms_zoning_RL,ms_zoning_RM
0,1976.0,725.0,2.0,475.0,1479.0,6.0,3904576.0,1432600.0,3952.0,938600.0,2922504.0,11856.0,525625.0,1450.0,344375.0,1072275.0,4350.0,4.0,950.0,2958.0,12.0,225625.0,702525.0,2850.0,2187441.0,8874.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
1,1996.0,913.0,2.0,559.0,2122.0,7.0,3984016.0,1822348.0,3992.0,1115764.0,4235512.0,13972.0,833569.0,1826.0,510367.0,1937386.0,6391.0,4.0,1118.0,4244.0,14.0,312481.0,1186198.0,3913.0,4502884.0,14854.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
2,1953.0,1057.0,1.0,246.0,1057.0,5.0,3814209.0,2064321.0,1953.0,480438.0,2064321.0,9765.0,1117249.0,1057.0,260022.0,1117249.0,5285.0,1.0,246.0,1057.0,5.0,60516.0,260022.0,1230.0,1117249.0,5285.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,2006.0,384.0,2.0,400.0,1444.0,5.0,4024036.0,770304.0,4012.0,802400.0,2896664.0,10030.0,147456.0,768.0,153600.0,554496.0,1920.0,4.0,800.0,2888.0,10.0,160000.0,577600.0,2000.0,2085136.0,7220.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
4,1900.0,676.0,2.0,484.0,1445.0,6.0,3610000.0,1284400.0,3800.0,919600.0,2745500.0,11400.0,456976.0,1352.0,327184.0,976820.0,4056.0,4.0,968.0,2890.0,12.0,234256.0,699380.0,2904.0,2088025.0,8670.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,2007.0,1884.0,2.0,520.0,1728.0,8.0,4028049.0,3781188.0,4014.0,1043640.0,3468096.0,16056.0,3549456.0,3768.0,979680.0,3255552.0,15072.0,4.0,1040.0,3456.0,16.0,270400.0,898560.0,4160.0,2985984.0,13824.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
2047,1940.0,861.0,2.0,539.0,861.0,4.0,3763600.0,1670340.0,3880.0,1045660.0,1670340.0,7760.0,741321.0,1722.0,464079.0,741321.0,3444.0,4.0,1078.0,1722.0,8.0,290521.0,464079.0,2156.0,741321.0,3444.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
2048,1928.0,896.0,2.0,342.0,1913.0,6.0,3717184.0,1727488.0,3856.0,659376.0,3688264.0,11568.0,802816.0,1792.0,306432.0,1714048.0,5376.0,4.0,684.0,3826.0,12.0,116964.0,654246.0,2052.0,3659569.0,11478.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2049,1956.0,1200.0,1.0,294.0,1200.0,4.0,3825936.0,2347200.0,1956.0,575064.0,2347200.0,7824.0,1440000.0,1200.0,352800.0,1440000.0,4800.0,1.0,294.0,1200.0,4.0,86436.0,352800.0,1176.0,1440000.0,4800.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [43]:
cross_val_score(lr, new_X, y).mean()

0.8570847886494102

In [45]:
X_train, X_test, y_train, y_test = train_test_split(new_X, y, random_state = 42)

In [46]:
lr.fit(X_train, y_train)

LinearRegression()

In [47]:
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.8728197394503288, 0.8701135390164657)

# Need to do exact same process to test set

In [78]:
kaggle_X = kaggle_data[new_features]
#kaggle_X

In [50]:
poly1 = PolynomialFeatures(include_bias=False)
poly1.fit(kaggle_X)

PolynomialFeatures(include_bias=False)

In [52]:
Kaggle_X_poly = poly.transform(kaggle_X)
Kaggle_X_poly

array([[1.910000e+03, 1.020000e+03, 1.000000e+00, ..., 3.717184e+06,
        1.156800e+04, 3.600000e+01],
       [1.977000e+03, 1.967000e+03, 2.000000e+00, ..., 3.869089e+06,
        9.835000e+03, 2.500000e+01],
       [2.006000e+03, 6.540000e+02, 2.000000e+00, ..., 2.238016e+06,
        1.047200e+04, 4.900000e+01],
       ...,
       [1.968000e+03, 9.520000e+02, 1.000000e+00, ..., 1.466521e+06,
        6.055000e+03, 2.500000e+01],
       [1.971000e+03, 8.640000e+02, 2.000000e+00, ..., 7.464960e+05,
        3.456000e+03, 1.600000e+01],
       [1.955000e+03, 9.230000e+02, 1.000000e+00, ..., 8.556250e+05,
        4.625000e+03, 2.500000e+01]])

In [73]:
 kaggle_poly_cols = poly1.get_feature_names(new_features)

In [54]:
kaggle_shortened_poly = pd.DataFrame(Kaggle_X_poly, columns=poly1.get_feature_names(new_features))

In [56]:
kaggle_shortened_poly.head(1)

Unnamed: 0,year_built,total_bsmt_sf,garage_cars,garage_area,gr_liv_area,overall_qual,year_built^2,year_built total_bsmt_sf,year_built garage_cars,year_built garage_area,year_built gr_liv_area,year_built overall_qual,total_bsmt_sf^2,total_bsmt_sf garage_cars,total_bsmt_sf garage_area,total_bsmt_sf gr_liv_area,total_bsmt_sf overall_qual,garage_cars^2,garage_cars garage_area,garage_cars gr_liv_area,garage_cars overall_qual,garage_area^2,garage_area gr_liv_area,garage_area overall_qual,gr_liv_area^2,gr_liv_area overall_qual,overall_qual^2
0,1910.0,1020.0,1.0,440.0,1928.0,6.0,3648100.0,1948200.0,1910.0,840400.0,3682480.0,11460.0,1040400.0,1020.0,448800.0,1966560.0,6120.0,1.0,440.0,1928.0,6.0,193600.0,848320.0,2640.0,3717184.0,11568.0,36.0


In [57]:
kaggle_shortened_poly['id'] = kaggle_data['id']
cols = kaggle_shortened_poly.columns.tolist()
cols = cols[-1:] + cols[:-1]
kaggle_shortened_poly = kaggle_shortened_poly[cols].copy() #  .copy() gets rid of the data error problem when making the saleprice column
kaggle_shortened_poly.head()

Unnamed: 0,id,year_built,total_bsmt_sf,garage_cars,garage_area,gr_liv_area,overall_qual,year_built^2,year_built total_bsmt_sf,year_built garage_cars,year_built garage_area,year_built gr_liv_area,year_built overall_qual,total_bsmt_sf^2,total_bsmt_sf garage_cars,total_bsmt_sf garage_area,total_bsmt_sf gr_liv_area,total_bsmt_sf overall_qual,garage_cars^2,garage_cars garage_area,garage_cars gr_liv_area,garage_cars overall_qual,garage_area^2,garage_area gr_liv_area,garage_area overall_qual,gr_liv_area^2,gr_liv_area overall_qual,overall_qual^2
0,2658,1910.0,1020.0,1.0,440.0,1928.0,6.0,3648100.0,1948200.0,1910.0,840400.0,3682480.0,11460.0,1040400.0,1020.0,448800.0,1966560.0,6120.0,1.0,440.0,1928.0,6.0,193600.0,848320.0,2640.0,3717184.0,11568.0,36.0
1,2718,1977.0,1967.0,2.0,580.0,1967.0,5.0,3908529.0,3888759.0,3954.0,1146660.0,3888759.0,9885.0,3869089.0,3934.0,1140860.0,3869089.0,9835.0,4.0,1160.0,3934.0,10.0,336400.0,1140860.0,2900.0,3869089.0,9835.0,25.0
2,2414,2006.0,654.0,2.0,426.0,1496.0,7.0,4024036.0,1311924.0,4012.0,854556.0,3000976.0,14042.0,427716.0,1308.0,278604.0,978384.0,4578.0,4.0,852.0,2992.0,14.0,181476.0,637296.0,2982.0,2238016.0,10472.0,49.0
3,1989,1923.0,968.0,2.0,480.0,968.0,5.0,3697929.0,1861464.0,3846.0,923040.0,1861464.0,9615.0,937024.0,1936.0,464640.0,937024.0,4840.0,4.0,960.0,1936.0,10.0,230400.0,464640.0,2400.0,937024.0,4840.0,25.0
4,625,1963.0,1394.0,2.0,514.0,1394.0,6.0,3853369.0,2736422.0,3926.0,1008982.0,2736422.0,11778.0,1943236.0,2788.0,716516.0,1943236.0,8364.0,4.0,1028.0,2788.0,12.0,264196.0,716516.0,3084.0,1943236.0,8364.0,36.0


## Kaggle Poly finished, need to do Kaggle Dummies now

In [58]:
kaggle_dummies = pd.get_dummies(kaggle_data, columns=['house_style', 'lot_shape', 'ms_zoning'])

In [59]:
new_kaggle_data = pd.merge(kaggle_dummies, kaggle_shortened_poly, on='id')

In [61]:
new_kaggle_data.shape

(878, 122)

In [62]:
new_kaggle_data.columns.value_counts().sum() # no duplicate columns only unique

122

In [79]:
new_kaggle_data.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,overall_qual_x,overall_cond,year_built_x,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf_x,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area_x,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars_x,garage_area_x,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,house_style_1.5Fin,house_style_1.5Unf,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,lot_shape_IR1,lot_shape_IR2,lot_shape_IR3,lot_shape_Reg,ms_zoning_C (all),ms_zoning_FV,ms_zoning_I (all),ms_zoning_RH,ms_zoning_RL,ms_zoning_RM,year_built_y,total_bsmt_sf_y,garage_cars_y,garage_area_y,gr_liv_area_y,overall_qual_y,year_built^2,year_built total_bsmt_sf,year_built garage_cars,year_built garage_area,year_built gr_liv_area,year_built overall_qual,total_bsmt_sf^2,total_bsmt_sf garage_cars,total_bsmt_sf garage_area,total_bsmt_sf gr_liv_area,total_bsmt_sf overall_qual,garage_cars^2,garage_cars garage_area,garage_cars gr_liv_area,garage_cars overall_qual,garage_area^2,garage_area gr_liv_area,garage_area overall_qual,gr_liv_area^2,gr_liv_area overall_qual,overall_qual^2
0,2658,902301120,190,69.0,9142,Pave,Grvl,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,6,8,1910,1950,Gable,CompShg,AsbShng,AsbShng,,0.0,TA,Fa,Stone,Fa,TA,No,Unf,0,Unf,0,1020,1020,GasA,Gd,N,FuseP,908,1020,0,1928,0,0,2,0,4,2,Fa,9,Typ,0,,Detchd,1910.0,Unf,1,440,Po,Po,Y,0,60,112,0,0,0,,,,0,4,2006,WD,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1910.0,1020.0,1.0,440.0,1928.0,6.0,3648100.0,1948200.0,1910.0,840400.0,3682480.0,11460.0,1040400.0,1020.0,448800.0,1966560.0,6120.0,1.0,440.0,1928.0,6.0,193600.0,848320.0,2640.0,3717184.0,11568.0,36.0
1,2718,905108090,90,,9662,Pave,,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,Duplex,5,4,1977,1977,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,No,Unf,0,Unf,0,1967,1967,GasA,TA,Y,SBrkr,1967,0,0,1967,0,0,2,0,6,2,TA,10,Typ,0,,Attchd,1977.0,Fin,2,580,TA,TA,Y,170,0,0,0,0,0,,,,0,8,2006,WD,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1977.0,1967.0,2.0,580.0,1967.0,5.0,3908529.0,3888759.0,3954.0,1146660.0,3888759.0,9885.0,3869089.0,3934.0,1140860.0,3869089.0,9835.0,4.0,1160.0,3934.0,10.0,336400.0,1140860.0,2900.0,3869089.0,9835.0,25.0
2,2414,528218130,60,58.0,17104,Pave,,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,7,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Gd,Gd,Av,GLQ,554,Unf,0,100,654,GasA,Ex,Y,SBrkr,664,832,0,1496,1,0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,2006.0,RFn,2,426,TA,TA,Y,100,24,0,0,0,0,,,,0,9,2006,New,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,2006.0,654.0,2.0,426.0,1496.0,7.0,4024036.0,1311924.0,4012.0,854556.0,3000976.0,14042.0,427716.0,1308.0,278604.0,978384.0,4578.0,4.0,852.0,2992.0,14.0,181476.0,637296.0,2982.0,2238016.0,10472.0,49.0
3,1989,902207150,30,60.0,8520,Pave,,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,5,6,1923,2006,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,CBlock,TA,TA,No,Unf,0,Unf,0,968,968,GasA,TA,Y,SBrkr,968,0,0,968,0,0,1,0,2,1,TA,5,Typ,0,,Detchd,1935.0,Unf,2,480,Fa,TA,N,0,0,184,0,0,0,,,,0,7,2007,WD,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1923.0,968.0,2.0,480.0,968.0,5.0,3697929.0,1861464.0,3846.0,923040.0,1861464.0,9615.0,937024.0,1936.0,464640.0,937024.0,4840.0,4.0,960.0,1936.0,10.0,230400.0,464640.0,2400.0,937024.0,4840.0,25.0
4,625,535105100,20,,9500,Pave,,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,6,5,1963,1963,Gable,CompShg,Plywood,Plywood,BrkFace,247.0,TA,TA,CBlock,Gd,TA,No,BLQ,609,Unf,0,785,1394,GasA,Gd,Y,SBrkr,1394,0,0,1394,1,0,1,1,3,1,TA,6,Typ,2,Gd,Attchd,1963.0,RFn,2,514,TA,TA,Y,0,76,0,0,185,0,,,,0,7,2009,WD,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1963.0,1394.0,2.0,514.0,1394.0,6.0,3853369.0,2736422.0,3926.0,1008982.0,2736422.0,11778.0,1943236.0,2788.0,716516.0,1943236.0,8364.0,4.0,1028.0,2788.0,12.0,264196.0,716516.0,3084.0,1943236.0,8364.0,36.0


# Making predictions from train set onto test set

In [64]:
pd.set_option('display.max_columns', 125)

In [80]:
kaggle_predictions = lr.predict(new_kaggle_data[combined_features])

In [81]:
new_kaggle_data['saleprice'] = kaggle_predictions

In [82]:
new_kaggle_data[['id', 'saleprice']]

Unnamed: 0,id,saleprice
0,2658,146879.723861
1,2718,169603.083087
2,2414,173867.766572
3,1989,112514.949861
4,625,180968.669625
...,...,...
873,1662,193121.138901
874,1234,196426.123062
875,1373,133390.570081
876,1672,109574.217296


In [83]:
final_kaggle_df = new_kaggle_data[['id', 'saleprice']]

In [84]:
final_kaggle_df.to_csv('../datasets/Kaggle_mlr1_poly_and_locations.csv', index=False)