In [1]:
from path import Path
import pandas as pd
import statsmodels.api as sm
from matplotlib import pyplot as plt
import seaborn as sns 
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
data = Path('resources\kc_house_data.csv')
df = pd.read_csv(data)
df.head(20)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
5,7237550310,20140512T000000,1225000.0,4,4.5,5420,101930,1.0,0,0,...,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930
6,1321400060,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,...,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
7,2008000270,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,...,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711
8,2414600126,20150415T000000,229500.0,3,1.0,1780,7470,1.0,0,0,...,7,1050,730,1960,0,98146,47.5123,-122.337,1780,8113
9,3793500160,20150312T000000,323000.0,3,2.5,1890,6560,2.0,0,0,...,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570


In [3]:
df['Month'] = pd.DatetimeIndex(df['date']).month

In [4]:
df = df.drop(['date'], axis = 1)

In [5]:
df.columns.tolist()

['id',
 'price',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15',
 'Month']

In [6]:
col_order = ['id','Month','price','bedrooms','bathrooms','sqft_living',
            'sqft_lot','floors','waterfront','view','condition','grade','sqft_above','sqft_basement','yr_built',
             'yr_renovated','zipcode','lat','long','sqft_living15','sqft_lot15']

In [7]:
df = df[col_order]

In [8]:
df.head(10)

Unnamed: 0,id,Month,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,12,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,12,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
5,7237550310,5,1225000.0,4,4.5,5420,101930,1.0,0,0,...,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930
6,1321400060,6,257500.0,3,2.25,1715,6819,2.0,0,0,...,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
7,2008000270,1,291850.0,3,1.5,1060,9711,1.0,0,0,...,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711
8,2414600126,4,229500.0,3,1.0,1780,7470,1.0,0,0,...,7,1050,730,1960,0,98146,47.5123,-122.337,1780,8113
9,3793500160,3,323000.0,3,2.5,1890,6560,2.0,0,0,...,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570


In [9]:
# Creating X
X = df.drop(['price'], axis = 1)
X.head()

Unnamed: 0,id,Month,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,12,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,12,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [10]:
# Creating y
y = df["price"]
y.head()

0    221900.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64

In [11]:
# data = Path('Resources/kc_house_data.csv')
# df = pd.read_csv(data).set_index("id" )
# df.head(20)

In [13]:
# df['date'] = pd.to_datetime(df['date']) #Dropping date column bcz i won't gonna day and month and creating new year column
# df['year'] = df['date'].dt.year
# df = df.drop("date",axis=1)
# df

In [14]:
# X = df.drop(['date', 'price', 'zipcode', 'lat', 'long'], axis = 1)
# X = df.drop(['price'], axis = 1)

In [15]:
# y = df["price"]
# X = df.drop(columns="price")

In [16]:
y.head()

0    221900.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64

In [17]:
X.head()

Unnamed: 0,id,Month,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,12,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,12,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [18]:
import statsmodels.api as sm

In [19]:
mod = sm.OLS(y, X)

In [20]:
res = mod.fit()

In [21]:
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.700
Model:                            OLS   Adj. R-squared:                  0.700
Method:                 Least Squares   F-statistic:                     2805.
Date:                Tue, 16 Nov 2021   Prob (F-statistic):               0.00
Time:                        21:09:57   Log-Likelihood:            -2.9458e+05
No. Observations:               21613   AIC:                         5.892e+05
Df Residuals:                   21594   BIC:                         5.893e+05
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
id            -1.286e-06   4.82e-07     -2.670

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [23]:
X_train.shape

(16209, 20)

In [24]:
# MAchine lerning model Linear Regression.
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression().fit(X_train, y_train)

In [25]:
y_pred = linear_reg.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results.head(5)

Unnamed: 0,Prediction,Actual
735,452520.7,365000.0
2830,741087.1,865000.0
4106,1230364.0,1038000.0
16218,1665828.0,1490000.0
19964,726956.6,711000.0


In [26]:
cross_val_score(linear_reg, X, y, cv=10)

array([0.6955615 , 0.69899595, 0.68927195, 0.6926956 , 0.68035148,
       0.70561594, 0.71323353, 0.71040972, 0.69936734, 0.67369246])

In [27]:
# Linear Regression score
linear_reg.score(X, y, sample_weight=None)

0.7003446845355261

In [28]:
# Machinbe learning model Random Tree

In [29]:
from sklearn.tree import DecisionTreeRegressor

In [30]:
regressor = DecisionTreeRegressor(random_state=79)

In [31]:
# Machinbe learning model Random tree
y_pred = regressor.fit(X_train, y_train).predict(X_test)
results_tree = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results_tree.head(5)

Unnamed: 0,Prediction,Actual
735,385000.0,365000.0
2830,825000.0,865000.0
4106,1225000.0,1038000.0
16218,1699000.0,1490000.0
19964,769000.0,711000.0


In [32]:
cross_val_score(regressor, X, y, cv=10)

array([0.81759649, 0.78681602, 0.76128754, 0.76092808, 0.6734723 ,
       0.64395646, 0.75322446, 0.76112512, 0.72789787, 0.80135755])

In [33]:
# Tree score
regressor.score(X, y, sample_weight=None)

0.9212965521318995

In [34]:
# Machinbe learning model Random forest

In [35]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
regressor_forest = RandomForestRegressor(max_depth=2, random_state=79)

In [37]:
y_pred = regressor_forest.fit(X_train, y_train).predict(X_test)
results_forest = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results_forest.head(5)

Unnamed: 0,Prediction,Actual
735,315501.1,365000.0
2830,530858.0,865000.0
4106,912326.9,1038000.0
16218,1421313.0,1490000.0
19964,809477.9,711000.0


In [38]:
cross_val_score(regressor_forest, X, y, cv=10)

array([0.51834514, 0.53612901, 0.55970558, 0.49169129, 0.51062762,
       0.54908181, 0.56218064, 0.55970309, 0.55793741, 0.51878254])

In [39]:
# Forest score
regressor_forest.score(X, y, sample_weight=None)

0.5362334751238881