In [1]:
# importing the needed libraries

import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
import pickle
import matplotlib.pyplot as plt

# Functions for Each Regression models

In [2]:
# function for Select K best algorithm.

def selectKbest(indep_X, dep_Y, n):
    test = SelectKBest(score_func = chi2, k = n)
    fit1 = test.fit(indep_X,dep_Y)
    selectK_features = fit1.transform(indep_X)
    return selectK_features

In [3]:
# function to split the data into training and testing data

def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size= 0.25, random_state = 0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

In [4]:
# function to calculate r score for the model created..

def r2_prediction(regressor,X_test,y_test):
    y_pred = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    r2 = r2_score(y_test, y_pred)
    return r2

Linear , SVM_linear, SVM_nonLinear, Decision Tree, Random Forest model created by using function

In [5]:
def Linear(X_train, y_train, X_test):
    from sklearn.linear_model import LinearRegression
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    r2 = r2_prediction(regressor, X_test, y_test)
    return r2

In [6]:
def svm_linear(X_train,y_train,X_test):

        from sklearn.svm import SVR
        regressor = SVR(kernel = 'linear')
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

def svm_NL(X_train,y_train,X_test):

        from sklearn.svm import SVR
        regressor = SVR(kernel = 'rbf')
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [7]:
def Decision(X_train,y_train,X_test):
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(random_state = 0)
    regressor.fit(X_train, y_train)
    r2 = r2_prediction(regressor,X_test,y_test)
    return r2

In [8]:
def random(X_train,y_train,X_test):

        from sklearn.ensemble import RandomForestRegressor
        regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [9]:
# creating a dataframe/table with the obtained accuracy of the respective models.

def selectK_regression(acclin, accsvml, accsvmnl, accdes, accrf):
    dataframe=pd.DataFrame(index=['ChiSquare'],columns=['Linear','SVMl','SVMnl','Decision','Random'])
    for number,idex in enumerate(dataframe.index):

        dataframe['Linear'][idex]=acclin[number]
        dataframe['SVMl'][idex]=accsvml[number]
        dataframe['SVMnl'][idex]=accsvmnl[number]
        dataframe['Decision'][idex]=accdes[number]
        dataframe['Random'][idex]=accrf[number]
    return dataframe



# code

In [10]:
# reading the dataset

dataset1 = pd.read_csv("sales_Price.csv", index_col = None)
df2 = dataset1

In [11]:
df2.head(5)

Unnamed: 0.1,Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,Rating
0,0,221900.0,3.0,2.0,1180.0,5650,1.0,0,0,3.0,...,1180,0,1955,0,98178,47.5112,122.257,1340,5650.0,Excellent
1,1,538000.0,3.0,2.0,2570.0,7242,2.0,0,0,3.0,...,2170,400,1951,0,98125,47.721,122.319,1690,7639.0,Excellent
2,2,180000.0,2.0,2.0,770.0,10000,1.0,0,0,3.0,...,770,0,1933,0,98028,47.7379,122.233,2720,8062.0,Good
3,3,604000.0,4.0,2.0,1960.0,5000,1.0,0,0,5.0,...,1050,910,1965,0,98136,47.5208,122.393,1360,5000.0,Excellent
4,4,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,3.0,...,1680,0,1987,0,98074,47.6168,122.045,1800,7503.0,Excellent


In [12]:
df2 = df2.drop(['Unnamed: 0'], axis = 1)

In [13]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21613 non-null  float64
 1   bedrooms       21613 non-null  float64
 2   bathrooms      21613 non-null  float64
 3   sqft_living    21613 non-null  float64
 4   sqft_lot       21613 non-null  int64  
 5   floors         21613 non-null  float64
 6   waterfront     21613 non-null  int64  
 7   view           21613 non-null  int64  
 8   condition      21613 non-null  float64
 9   grade          21613 non-null  float64
 10  sqft_above     21613 non-null  int64  
 11  sqft_basement  21613 non-null  int64  
 12  yr_built       21613 non-null  int64  
 13  yr_renovated   21613 non-null  int64  
 14  zipcode        21613 non-null  int64  
 15  lat            21613 non-null  float64
 16  long           21613 non-null  float64
 17  sqft_living15  21613 non-null  int64  
 18  sqft_l

In [14]:
df2 = pd.get_dummies(df2, drop_first = True, dtype = int) # converts categorical data into dummy or indicator variables

In [15]:
df2.fillna(0 , inplace = True) # replacing with NA
df2.isnull().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
Rating_Good      0
dtype: int64

In [16]:
df2.drop(['Rating_Good'],axis = 1)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3.0,2.0,1180.0,5650,1.0,0,0,3.0,7.0,1180,0,1955,0,98178,47.5112,122.257,1340,5650.0
1,538000.0,3.0,2.0,2570.0,7242,2.0,0,0,3.0,7.0,2170,400,1951,0,98125,47.7210,122.319,1690,7639.0
2,180000.0,2.0,2.0,770.0,10000,1.0,0,0,3.0,6.0,770,0,1933,0,98028,47.7379,122.233,2720,8062.0
3,604000.0,4.0,2.0,1960.0,5000,1.0,0,0,5.0,7.0,1050,910,1965,0,98136,47.5208,122.393,1360,5000.0
4,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,3.0,8.0,1680,0,1987,0,98074,47.6168,122.045,1800,7503.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,360000.0,3.0,2.0,1530.0,1131,3.0,0,0,3.0,8.0,1530,0,2009,0,98103,47.6993,122.346,1530,1509.0
21609,400000.0,4.0,2.0,2310.0,5813,2.0,0,0,3.0,8.0,2310,0,2014,0,98146,47.5107,122.362,1830,7200.0
21610,402101.0,2.0,2.0,1020.0,1350,2.0,0,0,3.0,7.0,1020,0,2009,0,98144,47.5944,122.299,1020,2007.0
21611,400000.0,3.0,2.0,1600.0,2388,2.0,0,0,3.0,8.0,1600,0,2004,0,98027,47.5345,122.069,1410,1287.0


In [17]:
#changing both values to int

df2['bathrooms'] = df2['bathrooms'].apply(np.int64)
df2['price'] = df2['price'].apply(np.int64)

In [18]:
# classification is the output variable, so storing it in y, x is input

indep_X= df2.iloc[:,1:].values
dep_Y=df2.iloc[:,0].values

In [25]:
Kbest = selectKbest(indep_X,dep_Y,15) #calling K(n) best selector

In [26]:
# empty list to store each model accuracy

acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]

In [27]:
X_train, X_test, y_train, y_test=split_scalar(Kbest,dep_Y)

In [28]:
# calling each regression model/

# for i in Kbest:
r2_lin=Linear(X_train,y_train,X_test)
acclin.append(r2_lin)

r2_sl=svm_linear(X_train,y_train,X_test)
accsvml.append(r2_sl)

r2_NL=svm_NL(X_train,y_train,X_test)
accsvmnl.append(r2_NL)

r2_d=Decision(X_train,y_train,X_test)
accdes.append(r2_d)

r2_r=random(X_train,y_train,X_test)
accrf.append(r2_r)

In [51]:
result=selectK_regression(acclin,accsvml,accsvmnl,accdes,accrf)

result. 



Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.742151,0.15112,-0.044934,0.768133,0.879023


random forset gave good accuracy with "all" k features..

In [24]:
result=selectK_regression(acclin,accsvml,accsvmnl,accdes,accrf)

result # k =12

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.664432,0.132984,-0.04451,0.468063,0.705874


In [29]:
result=selectK_regression(acclin,accsvml,accsvmnl,accdes,accrf)

result # k =15

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.742151,0.15112,-0.044934,0.773617,0.878538


In [None]:
# when K = 15 is used it showed 87 percent in Random Forest...