In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
import cpi
from sklearn import tree
import warnings
from IPython.display import Image
import pydotplus
import graphviz
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn import utils

In [14]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
house_prices_df = pd.read_sql_query('select * from houseprices',con=engine)

# Can close the connection since we're only doing one query
engine.dispose()

print(house_prices_df.head())

   id  mssubclass mszoning  lotfrontage  lotarea street alley lotshape  \
0   1          60       RL         65.0     8450   Pave  None      Reg   
1   2          20       RL         80.0     9600   Pave  None      Reg   
2   3          60       RL         68.0    11250   Pave  None      IR1   
3   4          70       RL         60.0     9550   Pave  None      IR1   
4   5          60       RL         84.0    14260   Pave  None      IR1   

  landcontour utilities  ... poolarea poolqc fence miscfeature miscval mosold  \
0         Lvl    AllPub  ...        0   None  None        None       0      2   
1         Lvl    AllPub  ...        0   None  None        None       0      5   
2         Lvl    AllPub  ...        0   None  None        None       0      9   
3         Lvl    AllPub  ...        0   None  None        None       0      2   
4         Lvl    AllPub  ...        0   None  None        None       0     12   

  yrsold  saletype  salecondition  saleprice  
0   2008        WD   

In [15]:

# Here I'm introducing an inflation adjusted sale price using the cpi module
house_prices_df['inf_adj_saleprice'] = house_prices_df.apply(lambda x: cpi.inflate(x.saleprice,x.yrsold), axis=1)

# Now I'm adding the CPI for both the years the house was built and sold
house_prices_df['cpi_yr_built'] = house_prices_df['yearbuilt'].apply(lambda x: cpi.get(x) if x > 1912 else 0)
house_prices_df['cpi_yr_sold'] = house_prices_df['yrsold'].apply(lambda x: cpi.get(x) if x > 1912 else 0)

# I'll combine the basement, first floor, and second floor square-footage features into one
# Then I'll multiply the total square-footage feathre by the overal qualitry rating to get a combined feature
house_prices_df['totalsf'] = house_prices_df['totalbsmtsf'] + house_prices_df['firstflrsf'] + house_prices_df['secondflrsf']
house_prices_df['int_over_sf'] = house_prices_df['totalsf'] * house_prices_df['overallqual']

print(house_prices_df.head(10))

# Here I'm splitting the db into numerical, numerical (that I consider categorical), and non-numerical categorical

df_num = house_prices_df.select_dtypes(include=['int64', 'float64'])
dropc = ['id', 'yrsold', 'overallqual', 'overallcond', 'bsmtfullbath', 'bsmthalfbath', 'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd', 'fireplaces', 'garagecars', 'mosold']
df3 = house_prices_df.loc[:, dropc]
df_obj = house_prices_df.select_dtypes(include=['object'])

# Here I'm dropping any int/float fields I consider to be categorical and not fit for a scatterplot. I'll us a bar plot for those
df2 = df_num.drop(dropc, axis = 1)
df3 = df3.drop('id', axis = 1)

df_num.info()
df3.info()
# print(df3.head())

   id  mssubclass mszoning  lotfrontage  lotarea street alley lotshape  \
0   1          60       RL         65.0     8450   Pave  None      Reg   
1   2          20       RL         80.0     9600   Pave  None      Reg   
2   3          60       RL         68.0    11250   Pave  None      IR1   
3   4          70       RL         60.0     9550   Pave  None      IR1   
4   5          60       RL         84.0    14260   Pave  None      IR1   
5   6          50       RL         85.0    14115   Pave  None      IR1   
6   7          20       RL         75.0    10084   Pave  None      Reg   
7   8          60       RL          NaN    10382   Pave  None      IR1   
8   9          50       RM         51.0     6120   Pave  None      Reg   
9  10         190       RL         50.0     7420   Pave  None      Reg   

  landcontour utilities  ... mosold yrsold saletype salecondition saleprice  \
0         Lvl    AllPub  ...      2   2008       WD        Normal    208500   
1         Lvl    AllPub  ..

In [16]:
col = ['int_over_sf', 'overallqual', 'totalsf', 'grlivarea', 'garagecars']

house_prices_df[col].corr()

Unnamed: 0,int_over_sf,overallqual,totalsf,grlivarea,garagecars
int_over_sf,1.0,0.84304,0.938579,0.819653,0.587337
overallqual,0.84304,1.0,0.668155,0.593007,0.600671
totalsf,0.938579,0.668155,1.0,0.874373,0.536413
grlivarea,0.819653,0.593007,0.874373,1.0,0.467247
garagecars,0.587337,0.600671,0.536413,0.467247,1.0


In [20]:
for c in col:
    print(type(house_prices_df[c]))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [8]:
house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df.mszoning, prefix="mszoning", drop_first=True)], axis=1)
house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df.street, prefix="street", drop_first=True)], axis=1)
dummy_column_names = list(pd.get_dummies(house_prices_df.mszoning, prefix="mszoning", drop_first=True).columns)
dummy_column_names = dummy_column_names + list(pd.get_dummies(house_prices_df.street, prefix="street", drop_first=True).columns)

In [9]:
house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df['mszoning'], prefix="mszoning", drop_first=True)], axis=1)
house_prices_df = pd.concat([house_prices_df,pd.get_dummies(house_prices_df.street, prefix="street", drop_first=True)], axis=1)

dummy_column_names = list(pd.get_dummies(house_prices_df['garagecars', prefix="mszoning", drop_first=True).columns)
dummy_column_names = dummy_column_names + list(pd.get_dummies(house_prices_df.street, prefix="street", drop_first=True).columns)

SyntaxError: invalid syntax (<ipython-input-9-8c9e2ba7091f>, line 4)

In [None]:
lab_enc = preprocessing.LabelEncoder()

for i in col:
    house_prices_df[i] = lab_enc.fit_transform(house_prices_df[i])
    
house_prices_df['saleprice'] = lab_enc.fit_transform(house_prices_df['saleprice'])

In [17]:
# Y: Target Varialbe
Y = np.log1p(house_prices_df['saleprice'])
# X: features
X = house_prices_df[col]

In [18]:
for i in col:
    print(i)
    print(utils.multiclass.type_of_target(house_prices_df[i]))
    print('')

int_over_sf
multiclass

overallqual
multiclass

totalsf
multiclass

grlivarea
multiclass

garagecars
multiclass



In [12]:

decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=4,
    random_state = 4047
)
decision_tree.fit(X, Y)

# Render our tree.
dot_data = tree.export_graphviz(
    decision_tree, out_file=None,
    feature_names=customers.columns,
    class_names=['Not Returning', 'Returning'],
    filled=True
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

ValueError: Unknown label type: 'continuous'

In [22]:
rfc = ensemble.RandomForestRegressor()


cross_val_score(rfc, X, Y, cv=10)

array([0.7360737 , 0.83819637, 0.77548712, 0.74763281, 0.80784488,
       0.81435116, 0.81002374, 0.78926464, 0.77446588, 0.77436476])