In [93]:
# importing the needed libraries

import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
import pickle
import matplotlib.pyplot as plt

# Helper functions for each regression model

In [94]:
# function for Select K best algorithm.

def selectKbest(indep_X, dep_Y, n, score_func=None):
    """Keep the ``n`` best-scoring feature columns of ``indep_X``.

    Parameters
    ----------
    indep_X : array-like
        Feature matrix passed to ``SelectKBest``.
    dep_Y : array-like
        Target values used to score each feature.
    n : int or "all"
        Forwarded to ``SelectKBest`` as ``k`` (number of features to keep).
    score_func : callable, optional
        Scoring function forwarded to ``SelectKBest``. Defaults to ``chi2``,
        which is what this function always used before the parameter existed.

    Returns
    -------
    The transformed feature matrix containing only the selected columns.

    Notes
    -----
    scikit-learn documents ``chi2`` as a scorer for classification targets and
    non-negative features. For a continuous target a regression scorer from
    ``sklearn.feature_selection`` (e.g. ``f_regression``) can be passed as
    ``score_func`` instead.
    """
    if score_func is None:
        # Resolved at call time so the historical default is preserved.
        score_func = chi2
    selector = SelectKBest(score_func=score_func, k=n)
    # fit_transform is fit(X, y) followed by transform(X) on the same data.
    return selector.fit_transform(indep_X, dep_Y)

In [95]:
# function to split the data into training and testing data

def split_scalar(indep_X, dep_Y):
    """Split the data 75/25 and standardise the features.

    The split is seeded with ``random_state=0``. The scaler is fitted on the
    training features only and then applied to both the training and the test
    features; the targets are returned unscaled.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    features_train, features_test, target_train, target_test = parts

    # Learn mean/variance from the training split only.
    scaler = StandardScaler().fit(features_train)
    return (
        scaler.transform(features_train),
        scaler.transform(features_test),
        target_train,
        target_test,
    )

In [96]:
# function to calculate the R² score of a fitted model.

def r2_prediction(regressor,X_test,y_test):
    """Return the R² score of ``regressor`` on the given test data.

    ``regressor`` only needs a ``predict`` method; its predictions for
    ``X_test`` are compared against ``y_test`` with ``sklearn.metrics.r2_score``.
    """
    from sklearn.metrics import r2_score

    predictions = regressor.predict(X_test)
    return r2_score(y_test, predictions)

Each model — Linear, SVM (linear kernel), SVM (non-linear RBF kernel), Decision Tree, and Random Forest — is trained and scored by its own function.

In [97]:
def Linear(X_train, y_train, X_test, y_test=None):
    """Fit a ``LinearRegression`` model and return its R² on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and targets used to fit the model.
    X_test
        Test features to predict on.
    y_test : optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` variable is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    The value returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.linear_model import LinearRegression

    if y_test is None:
        # Legacy three-argument calls depended on a global; look it up
        # explicitly so the dependency is visible and callers can override it.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError("y_test was not passed and no notebook-level y_test exists") from None

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [98]:
def svm_linear(X_train, y_train, X_test, y_test=None):
    """Fit an ``SVR`` with a linear kernel and return its R² on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and targets used to fit the model.
    X_test
        Test features to predict on.
    y_test : optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` variable is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    The value returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.svm import SVR

    if y_test is None:
        # Legacy three-argument calls depended on a global; look it up
        # explicitly so the dependency is visible and callers can override it.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError("y_test was not passed and no notebook-level y_test exists") from None

    regressor = SVR(kernel='linear')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def svm_NL(X_train, y_train, X_test, y_test=None):
    """Fit an ``SVR`` with an RBF kernel and return its R² on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and targets used to fit the model.
    X_test
        Test features to predict on.
    y_test : optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` variable is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    The value returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.svm import SVR

    if y_test is None:
        # Legacy three-argument calls depended on a global; look it up
        # explicitly so the dependency is visible and callers can override it.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError("y_test was not passed and no notebook-level y_test exists") from None

    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [99]:
def Decision(X_train, y_train, X_test, y_test=None):
    """Fit a ``DecisionTreeRegressor`` and return its R² on the test split.

    The tree is seeded with ``random_state=0``.

    Parameters
    ----------
    X_train, y_train
        Training features and targets used to fit the model.
    X_test
        Test features to predict on.
    y_test : optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` variable is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    The value returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.tree import DecisionTreeRegressor

    if y_test is None:
        # Legacy three-argument calls depended on a global; look it up
        # explicitly so the dependency is visible and callers can override it.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError("y_test was not passed and no notebook-level y_test exists") from None

    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [100]:
def random(X_train, y_train, X_test, y_test=None):
    """Fit a ``RandomForestRegressor`` and return its R² on the test split.

    The forest uses ``n_estimators=10`` and ``random_state=0``.

    NOTE(review): the function name matches the standard-library ``random``
    module; it is kept for existing callers, but importing that module in this
    notebook would clash with it.

    Parameters
    ----------
    X_train, y_train
        Training features and targets used to fit the model.
    X_test
        Test features to predict on.
    y_test : optional
        True targets for ``X_test``. When omitted, the notebook-level
        ``y_test`` variable is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    The value returned by ``r2_prediction`` for the fitted model.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.ensemble import RandomForestRegressor

    if y_test is None:
        # Legacy three-argument calls depended on a global; look it up
        # explicitly so the dependency is visible and callers can override it.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError("y_test was not passed and no notebook-level y_test exists") from None

    regressor = RandomForestRegressor(n_estimators=10, random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

In [101]:
# Build a one-row table holding the R² score obtained by each model.

def selectK_regression(acclin, accsvml, accsvmnl, accdes, accrf):
    """Arrange the per-model scores into a one-row summary table.

    Parameters
    ----------
    acclin, accsvml, accsvmnl, accdes, accrf : sequence
        Scores for the Linear, SVM-linear, SVM-non-linear, Decision Tree and
        Random Forest models. The table has a single row (``'ChiSquare'``), so
        only element 0 of each sequence is read.

    Returns
    -------
    pandas.DataFrame
        Index ``['ChiSquare']``; columns
        ``['Linear', 'SVMl', 'SVMnl', 'Decision', 'Random']``.

    Raises
    ------
    IndexError
        If any of the sequences is empty.
    """
    scores_by_column = {
        'Linear': acclin,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'Decision': accdes,
        'Random': accrf,
    }
    dataframe = pd.DataFrame(index=['ChiSquare'], columns=list(scores_by_column))
    for number, idex in enumerate(dataframe.index):
        for column, scores in scores_by_column.items():
            # A single .loc write replaces chained `df[col][row] = v`, which
            # warns in pandas 2.x and writes nothing under Copy-on-Write.
            dataframe.loc[idex, column] = scores[number]
    return dataframe



# code

In [102]:
# Read the sales data from a CSV file in the working directory.
# `df2` starts out as a second name for the same DataFrame object as `dataset1`
# (plain assignment, no copy).

dataset1 = pd.read_csv("sales_Price.csv", index_col = None)
df2 = dataset1

In [103]:
# Preview the first five rows of the loaded data.
df2.head(5)

Unnamed: 0.1,Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15
0,0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,1340,5650
1,1,538000.0,3,2.0,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,1690,7639
2,2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,2720,8062
3,3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,1360,5000
4,4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,1800,7503


In [104]:
# Drop the stray 'Unnamed: 0' column that came in from the CSV.
# Derived from `dataset1` (never reassigned) rather than from `df2` itself, so
# re-running this cell gives the same frame instead of raising KeyError.
df2 = dataset1.drop(columns=['Unnamed: 0'])


In [105]:
# Show column dtypes and non-null counts.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21613 non-null  float64
 1   bedrooms       21613 non-null  int64  
 2   bathrooms      21613 non-null  float64
 3   sqft_living    21613 non-null  int64  
 4   sqft_lot       21613 non-null  int64  
 5   floors         21613 non-null  float64
 6   waterfront     21613 non-null  int64  
 7   view           21613 non-null  int64  
 8   condition      21613 non-null  int64  
 9   grade          21613 non-null  int64  
 10  sqft_above     21613 non-null  int64  
 11  sqft_basement  21613 non-null  int64  
 12  yr_built       21613 non-null  int64  
 13  yr_renovated   21613 non-null  int64  
 14  sqft_living15  21613 non-null  int64  
 15  sqft_lot15     21613 non-null  int64  
dtypes: float64(3), int64(13)
memory usage: 2.6 MB


In [106]:
# Count how many rows fall into each value of the `waterfront` column.
df2['waterfront'].value_counts()

waterfront
0    21450
1      163
Name: count, dtype: int64

In [107]:
# NOTE(review): df2.info() above lists only int64/float64 columns, so there
# appears to be nothing for get_dummies to encode here — confirm whether this
# step is still needed.
df2 = pd.get_dummies(df2, drop_first = True, dtype = int) # converts categorical columns into dummy/indicator variables

In [108]:
# Replace any missing values (NaN) with 0, then confirm no nulls remain.
# Reassigning instead of using inplace=True keeps the step explicit.
df2 = df2.fillna(0)
df2.isnull().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [109]:
# Preview the first rows after the cleaning steps above.
df2.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,1340,5650
1,538000.0,3,2.0,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,1800,7503


In [110]:
# Cast `bathrooms` and `price` to int64 in one vectorised step.
# The cast truncates toward zero, so any fractional bathroom counts or prices
# lose their decimal part.
# NOTE(review): presumably `price` must be integer so chi2 can treat it as
# discrete labels — confirm.

df2 = df2.astype({'bathrooms': np.int64, 'price': np.int64})

In [111]:
# `price` is the regression target (y); every other column is an input feature (X).
# Selecting by name rather than by position keeps this correct if the column
# order ever changes.

indep_X = df2.drop(columns=['price']).values
dep_Y = df2['price'].values

In [133]:
# NOTE(review): df2 has 16 columns including `price`, so there are 15 feature
# columns; with n = 15 every feature is kept and nothing is filtered out.
Kbest = selectKbest(indep_X,dep_Y,15) # run the SelectKBest feature selector with k = 15

In [134]:
# Display the selected feature matrix.
Kbest

array([[3.000e+00, 1.000e+00, 1.180e+03, ..., 0.000e+00, 1.340e+03,
        5.650e+03],
       [3.000e+00, 2.000e+00, 2.570e+03, ..., 1.991e+03, 1.690e+03,
        7.639e+03],
       [2.000e+00, 1.000e+00, 7.700e+02, ..., 0.000e+00, 2.720e+03,
        8.062e+03],
       ...,
       [2.000e+00, 1.000e+00, 1.020e+03, ..., 0.000e+00, 1.020e+03,
        2.007e+03],
       [3.000e+00, 2.000e+00, 1.600e+03, ..., 0.000e+00, 1.410e+03,
        1.287e+03],
       [2.000e+00, 1.000e+00, 1.020e+03, ..., 0.000e+00, 1.020e+03,
        1.357e+03]])

In [135]:
# One fresh, independent list per model to collect its R² scores.

acclin, accsvml, accsvmnl, accdes, accrf = [], [], [], [], []

In [136]:
# Split the selected features 75/25 and standardise them (see split_scalar).
# The model functions below read `y_test` from this notebook-level assignment.
X_train, X_test, y_train, y_test=split_scalar(Kbest,dep_Y)

In [137]:
# Train each regression model on the training split and record its R² score.
#
# NOTE(review): each call appends to its list, but selectK_regression only
# reads element 0. Re-running this cell without first re-running the cell that
# resets the lists leaves the old score in the results table.

r2_lin=Linear(X_train,y_train,X_test)
acclin.append(r2_lin)

r2_sl=svm_linear(X_train,y_train,X_test)
accsvml.append(r2_sl)

r2_NL=svm_NL(X_train,y_train,X_test)
accsvmnl.append(r2_NL)

r2_d=Decision(X_train,y_train,X_test)
accdes.append(r2_d)

r2_r=random(X_train,y_train,X_test)
accrf.append(r2_r)

In [120]:
# Collect the five R² scores into a one-row table and display it.
# NOTE(review): the output below was recorded in an earlier execution (In [120])
# with k = 5; the selector cell now in the notebook uses k = 15, so a fresh
# Run All will not reproduce these numbers.
result=selectK_regression(acclin,accsvml,accsvmnl,accdes,accrf)

result # k = 5



Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.492093,0.004717,-0.047217,0.196818,0.541879


In [126]:
# Collect the five R² scores into a one-row table and display it.
# NOTE(review): the output below was recorded in an earlier execution (In [126])
# with k = 10; the selector cell now in the notebook uses k = 15, so a fresh
# Run All will not reproduce these numbers.
result=selectK_regression(acclin,accsvml,accsvmnl,accdes,accrf)

result # k = 10

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.546925,0.054379,-0.047639,0.400811,0.64723


In [132]:
# Collect the five R² scores into a one-row table and display it.
# NOTE(review): the output below was recorded in an earlier execution (In [132])
# with k = 12; the selector cell now in the notebook uses k = 15, so a fresh
# Run All will not reproduce these numbers.
result=selectK_regression(acclin,accsvml,accsvmnl,accdes,accrf)

result # k =12

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.637921,0.080536,-0.047618,0.556875,0.740249


In [138]:
# Collect the five R² scores into a one-row table and display it.
# This run corresponds to the k = 15 selector call shown earlier in the notebook.
result=selectK_regression(acclin,accsvml,accsvmnl,accdes,accrf)

result # k =15

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.646975,0.088428,-0.048193,0.573754,0.73929


In [None]:
# With k = 12 the Random Forest gave the best result: an R² score of about 0.74 (k = 15 was marginally lower at 0.739).