## Baseline_model

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# NOTE(review): SCORERS is not used in the visible cells and is no longer
# available in recent scikit-learn releases - confirm before upgrading.
from sklearn.metrics import SCORERS

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Directory holding the project CSV files. It defaults to the original local
# path; set the AUSTIN_HOUSE_DATA_DIR environment variable to run elsewhere.
DATA_DIR = Path(os.environ.get(
    'AUSTIN_HOUSE_DATA_DIR',
    '/home/sergei/Desktop/LighthouseLabs/Re_enter_bootcamp/Final_Project/Austin_House_data',
))

# index_col=None: every CSV column is loaded as a regular column and pandas
# builds a fresh default index.
df_baseline = pd.read_csv(DATA_DIR / 'baseline_dataset.csv', index_col=None)

In [3]:
# Show every column when a wide DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [4]:
# Peek at the first five rows of the raw baseline data.
df_baseline.head(n=5)

Unnamed: 0.1,Unnamed: 0,new_price,latitude,longitude,propertyTaxRate,garageSpaces,hasAssociation,hasCooling,hasGarage,hasHeating,hasSpa,hasView,homeType,parkingSpaces,yearBuilt,numPriceChanges,numOfAccessibilityFeatures,numOfAppliances,numOfParkingFeatures,numOfPatioAndPorchFeatures,numOfSecurityFeatures,numOfWaterfrontFeatures,numOfWindowFeatures,numOfCommunityFeatures,lotSizeSqFt,livingAreaSqFt,numOfPrimarySchools,numOfElementarySchools,numOfMiddleSchools,numOfHighSchools,avgSchoolDistance,avgSchoolRating,avgSchoolSize,MedianStudentsPerTeacher,numOfBathrooms,numOfBedrooms,numOfStories
0,0,757493.0,30.486408,-97.794724,2.21,0,True,True,False,True,False,False,Single Family,0,2014,13,0,4,2,0,0,0,0,0,10672.0,4564.0,1,0,1,1,3.266667,7.666667,1259,14,6.0,5,2
1,1,417108.0,30.494375,-97.796516,2.21,2,True,True,True,True,False,True,Single Family,2,2007,4,0,6,3,0,0,0,0,0,9060.0,3233.0,1,0,1,1,1.933333,8.333333,1481,16,4.0,5,2
2,2,201441.0,30.255707,-97.576958,1.98,2,False,True,True,True,False,False,Single Family,2,2006,3,1,2,2,1,0,0,0,0,7448.0,1511.0,1,0,1,1,2.433333,2.666667,1478,13,2.0,3,1
3,3,458883.0,30.495638,-97.797874,2.21,0,True,True,False,True,False,False,Single Family,0,2008,2,0,3,2,0,0,0,0,0,7666.0,2228.0,1,0,1,1,1.9,8.333333,1481,16,2.0,3,1
4,4,664216.0,30.488775,-97.794899,2.21,2,True,True,True,True,False,False,Single Family,2,2013,2,0,4,3,0,0,0,0,0,8494.0,3494.0,1,0,1,1,3.3,7.666667,1259,14,5.0,4,2


In [5]:
# Remove the redundant 'Unnamed: 0' column (presumably an index saved into the
# CSV - TODO confirm). Reassigning instead of mutating in place, together with
# errors='ignore', keeps this cell safe to re-run after the column is gone.
df_baseline = df_baseline.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Confirm the column layout after dropping 'Unnamed: 0'.
df_baseline.head(n=1)

Unnamed: 0,new_price,latitude,longitude,propertyTaxRate,garageSpaces,hasAssociation,hasCooling,hasGarage,hasHeating,hasSpa,hasView,homeType,parkingSpaces,yearBuilt,numPriceChanges,numOfAccessibilityFeatures,numOfAppliances,numOfParkingFeatures,numOfPatioAndPorchFeatures,numOfSecurityFeatures,numOfWaterfrontFeatures,numOfWindowFeatures,numOfCommunityFeatures,lotSizeSqFt,livingAreaSqFt,numOfPrimarySchools,numOfElementarySchools,numOfMiddleSchools,numOfHighSchools,avgSchoolDistance,avgSchoolRating,avgSchoolSize,MedianStudentsPerTeacher,numOfBathrooms,numOfBedrooms,numOfStories
0,757493.0,30.486408,-97.794724,2.21,0,True,True,False,True,False,False,Single Family,0,2014,13,0,4,2,0,0,0,0,0,10672.0,4564.0,1,0,1,1,3.266667,7.666667,1259,14,6.0,5,2


In [7]:
# Feature matrix: every column except the target `new_price`.
X = df_baseline.drop(columns='new_price')

In [8]:
# Target in its original units, plus its natural-log transform.
y = df_baseline['new_price']
y_log = np.log(y)

In [9]:
# Split the feature columns by dtype: int64/float64 columns are treated as
# numerical, object/bool columns as categorical.
numerical_ix = X.select_dtypes(['int64', 'float64']).columns
categorical_ix = X.select_dtypes(['object', 'bool']).columns

In [10]:
# define the data preparation for the columns:
#   - 'cat': one-hot encode the categorical columns. handle_unknown='ignore'
#     encodes a category that was absent at fit time as all zeros instead of
#     raising at transform time (a rare category can be missing from a
#     training fold or training split).
#   - 'num': min-max scale the numerical columns.
# Columns in neither list are dropped (ColumnTransformer's default remainder).
t = [
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix),
    ('num', MinMaxScaler(), numerical_ix),
]
col_transform = ColumnTransformer(transformers=t)

In [11]:
# Seed shared by the train/test split and the K-fold shuffling below.
randomstate = 30

In [12]:
# Hold out 25% of the rows for testing. The target passed in is the
# log-transformed price, so y_train / y_test from this cell are in log space.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.25,
    random_state=randomstate,
)

In [13]:
# define the model: ordinary least-squares linear regression with default
# settings. The same `model` object is reused by the pipelines built later.
model = LinearRegression()

In [14]:
# Chain column preprocessing ('prep') and the regressor ('m') into one estimator.
pipeline = Pipeline([
    ('prep', col_transform),
    ('m', model),
])

In [15]:
# Cross-validation scheme: 10 shuffled folds with a fixed seed, so the fold
# assignment is reproducible.
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=randomstate,
)

In [16]:
# Evaluate the pipeline with cross-validation on the raw (not log-transformed)
# target `y`, so MAE and RMSE are in the same units as `new_price`.
# A single cross_validate call fits each fold once and scores all three
# metrics, instead of refitting the pipeline separately for every metric.
# Because `cv` uses a fixed random_state, the folds match those of separate calls.
cv_results = cross_validate(
    pipeline,
    X,
    y,
    scoring=('r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'),
    cv=cv,
    n_jobs=-1,
)
# Per-fold score arrays (the error metrics are negated: higher is better).
score_R2 = cv_results['test_r2']
score_NMAE = cv_results['test_neg_mean_absolute_error']
score_NRMSE = cv_results['test_neg_root_mean_squared_error']

In [17]:
# Fold-averaged R2 and negated MAE from the cross-validation above.
print(np.mean(score_R2), np.mean(score_NMAE))

0.520778989165867 -124097.07935256604


In [18]:
# Train on the log-transformed target (less skewed distribution), then report
# the MAE in $ by inverting the log transform on both truth and predictions.

# y_train / y_test here are already in log space (they come from splitting y_log).
y_predict = pipeline.fit(X_train, y_train).predict(X_test)

# R2 is measured in log space; MAE is measured after exponentiating back.
print(r2_score(y_true=y_test, y_pred=y_predict))
print(mean_absolute_error(y_true=np.exp(y_test), y_pred=np.exp(y_predict)))

0.5680580727583858
113825.82641362768


In [None]:
# Not bad for a baseline model. I assume this good result comes from the thorough outlier handling we did during EDA.

## Modelling after the Feature engineering

In [19]:
# loading datasets
# Directory holding the project CSV files. It defaults to the original local
# path; set the AUSTIN_HOUSE_DATA_DIR environment variable to run elsewhere.
DATA_DIR = Path(os.environ.get(
    'AUSTIN_HOUSE_DATA_DIR',
    '/home/sergei/Desktop/LighthouseLabs/Re_enter_bootcamp/Final_Project/Austin_House_data',
))

# index_col=0: the first CSV column is used as the row index.
train = pd.read_csv(DATA_DIR / 'train1_dataset.csv', index_col=0)
test = pd.read_csv(DATA_DIR / 'test1_dataset.csv', index_col=0)

In [20]:
# First row of the feature-engineered training set.
train.head(n=1)

Unnamed: 0,new_price,garageSpaces,hasSpa,hasView,homeType,lotSizeSqFt,livingAreaSqFt,avgSchoolRating,numOfBathrooms,numOfBedrooms,numOfStories,numberOfSchools,rank,lat_long
0,421143.0,0.0,0.0,0.0,Single Family,8494.0,1665.0,2.333333,2.0,3.0,1.0,3.0,19,1381.0


In [21]:
# First row of the feature-engineered test set (same columns as train).
test.head(n=1)

Unnamed: 0,new_price,garageSpaces,hasSpa,hasView,homeType,lotSizeSqFt,livingAreaSqFt,avgSchoolRating,numOfBathrooms,numOfBedrooms,numOfStories,numberOfSchools,rank,lat_long
0,447608.0,2.0,0.0,0.0,Single Family,8973.0,2057.0,6.0,2.0,3.0,1.0,3.0,17,1393.0


In [22]:
# Column dtypes; these drive the numerical/categorical split further down.
train.dtypes

new_price          float64
garageSpaces       float64
hasSpa             float64
hasView            float64
homeType            object
lotSizeSqFt        float64
livingAreaSqFt     float64
avgSchoolRating    float64
numOfBathrooms     float64
numOfBedrooms      float64
numOfStories       float64
numberOfSchools    float64
rank                 int64
lat_long           float64
dtype: object

In [23]:
# Same preview of the training set as shown earlier in this section.
train.head(n=1)

Unnamed: 0,new_price,garageSpaces,hasSpa,hasView,homeType,lotSizeSqFt,livingAreaSqFt,avgSchoolRating,numOfBathrooms,numOfBedrooms,numOfStories,numberOfSchools,rank,lat_long
0,421143.0,0.0,0.0,0.0,Single Family,8494.0,1665.0,2.333333,2.0,3.0,1.0,3.0,19,1381.0


In [24]:
# NOTE: at this point X_train is still the baseline split created by the
# earlier train_test_split; it is only reassigned in the following cell.
X_train.head(n=1)

Unnamed: 0,latitude,longitude,propertyTaxRate,garageSpaces,hasAssociation,hasCooling,hasGarage,hasHeating,hasSpa,hasView,homeType,parkingSpaces,yearBuilt,numPriceChanges,numOfAccessibilityFeatures,numOfAppliances,numOfParkingFeatures,numOfPatioAndPorchFeatures,numOfSecurityFeatures,numOfWaterfrontFeatures,numOfWindowFeatures,numOfCommunityFeatures,lotSizeSqFt,livingAreaSqFt,numOfPrimarySchools,numOfElementarySchools,numOfMiddleSchools,numOfHighSchools,avgSchoolDistance,avgSchoolRating,avgSchoolSize,MedianStudentsPerTeacher,numOfBathrooms,numOfBedrooms,numOfStories
8082,30.251057,-97.754852,1.98,1,True,True,True,True,False,True,Condo,1,2001,4,0,4,2,0,0,0,1,0,3916.0,1822.0,1,0,1,1,0.7,4.666667,889,13,2.0,2,1


In [25]:
# Split features from the target and log-transform the target.
# From here on y_train / y_test are raw prices and the *_log variants are in log space.
X_train = train.drop(columns='new_price')
X_test = test.drop(columns='new_price')

y_train = train['new_price']
y_test = test['new_price']

y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

In [26]:
# Split the feature columns by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical.
numerical_ix = X_train.select_dtypes(['int64', 'float64']).columns
categorical_ix = X_train.select_dtypes(['object']).columns

In [27]:
# define the data preparation for the columns:
#   - 'cat': one-hot encode the categorical columns. handle_unknown='ignore'
#     encodes a category that appears only in the test set as all zeros
#     instead of raising at predict time.
#   - 'num': min-max scale the numerical columns.
# Columns in neither list are dropped (ColumnTransformer's default remainder).
t = [
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix),
    ('num', MinMaxScaler(), numerical_ix),
]
col_transform = ColumnTransformer(transformers=t)

In [28]:
# Rebuild the preprocessing + regression pipeline around the new column
# transformer; `model` is the LinearRegression instance defined earlier.
pipeline = Pipeline([
    ('prep', col_transform),
    ('m', model),
])

In [29]:
# Train on the log-transformed target (less skewed distribution), then report
# the MAE in $ by inverting the log transform on both truth and predictions.

y_predict_log = pipeline.fit(X_train, y_train_log).predict(X_test)

# R2 is measured in log space; MAE is measured after exponentiating back.
print(r2_score(y_true=y_test_log, y_pred=y_predict_log))
print(mean_absolute_error(y_true=np.exp(y_test_log), y_pred=np.exp(y_predict_log)))

0.6637593459883704
96706.78486476942


In [None]:
# The results improved: the model now explains about two-thirds of the variance in log price (R2 ≈ 0.66), and the mean absolute error decreased by about $17,000.

## Modelling after NLP

In [30]:
# loading datasets
# Directory holding the project CSV files. It defaults to the original local
# path; set the AUSTIN_HOUSE_DATA_DIR environment variable to run elsewhere.
DATA_DIR = Path(os.environ.get(
    'AUSTIN_HOUSE_DATA_DIR',
    '/home/sergei/Desktop/LighthouseLabs/Re_enter_bootcamp/Final_Project/Austin_House_data',
))

# index_col=0: the first CSV column is used as the row index.
train = pd.read_csv(DATA_DIR / 'train_nlp_dataset.csv', index_col=0)
test = pd.read_csv(DATA_DIR / 'test_nlp_dataset.csv', index_col=0)

In [31]:
# Split features from the target and log-transform the target.
# y_train / y_test are raw prices; the *_log variants are in log space.
X_train = train.drop(columns='new_price')
X_test = test.drop(columns='new_price')

y_train = train['new_price']
y_test = test['new_price']

y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

In [32]:
# Split the feature columns by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical.
numerical_ix = X_train.select_dtypes(['int64', 'float64']).columns
categorical_ix = X_train.select_dtypes(['object']).columns

In [33]:
# define the data preparation for the columns:
#   - 'cat': one-hot encode the categorical columns. handle_unknown='ignore'
#     encodes a category that appears only in the test set as all zeros
#     instead of raising at predict time.
#   - 'num': min-max scale the numerical columns.
# Columns in neither list are dropped (ColumnTransformer's default remainder).
t = [
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix),
    ('num', MinMaxScaler(), numerical_ix),
]
col_transform = ColumnTransformer(transformers=t)

In [34]:
# Rebuild the preprocessing + regression pipeline for the NLP-augmented
# features; `model` is the LinearRegression instance defined earlier.
pipeline = Pipeline([
    ('prep', col_transform),
    ('m', model),
])

In [36]:
# Fit on the log-transformed target and predict the held-out set (log space).
y_predict_log = pipeline.fit(X_train, y_train_log).predict(X_test)

# R2 is measured in log space; MAE is measured after exponentiating back.
print(r2_score(y_true=y_test_log, y_pred=y_predict_log))
print(mean_absolute_error(y_true=np.exp(y_test_log), y_pred=np.exp(y_predict_log)))

0.6618740326818651
97124.30312214974


In [None]:
# The results are disappointing at first, since the model has not improved. In my view this is due to the limitations of the linear regression model, given that we now have more than 100 features.
# Let's try a more complex model, for example an SVM.

# Modelling SVM