In [2]:
import pandas as pd
import numpy as np

In [3]:
housing = pd.read_excel("housingdata.xlsx")
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [4]:
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [5]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null int64
total_rooms           20640 non-null int64
total_bedrooms        20433 non-null float64
population            20640 non-null int64
households            20640 non-null int64
median_income         20640 non-null float64
ocean_proximity       20640 non-null object
median_house_value    20640 non-null int64
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [6]:
#From the above information we found out that the total bedrooms column has missing data
#As the data is of discrete type replace the na values with median value of total_bedrooms


In [7]:
housing.total_bedrooms.fillna(housing.total_bedrooms.median(),inplace = True)
housing.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

In [8]:
#We have a categorical column ocean_proximity. As model building requires entire data to be numeric, 
#convert categorical data to numeric using One Hot Encoder


In [9]:
features = housing.iloc[:,:-1].values
label = housing.iloc[:,-1].values

In [10]:
housing.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

OceanEncoder = LabelEncoder()
features[:,8] = OceanEncoder.fit_transform(features[:,8])

OHE = OneHotEncoder(categorical_features =[8])
features = OHE.fit_transform(features).toarray()


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [12]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [54]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model,features,label,cv=10)

print(scores)
print("Average Score Expected",np.average(scores))

[-1.04977039  0.56521245  0.20386101  0.20853328  0.54088425  0.37620224
  0.27290395  0.17402553  0.12777043  0.32971686]
Average Score Expected 0.17493396157619506


In [55]:
from sklearn.model_selection import StratifiedShuffleSplit

ss = StratifiedShuffleSplit(n_splits=10, #n_splits should be equal to no of cv value in cross_val_score
              random_state=2,
              test_size=0.2)

i=0
for train,test in ss.split(features,label):
    i = i+1
    X_train,X_test = features[train],features[test]
    y_train,y_test = label[train],label[test]
    
    model.fit(X_train,y_train)
    
    if model.score(X_test,y_test) >= 0.50:
        print("Test Score: {} train score: {} for Sample Split: {}".format(model.score(X_test,y_test),model.score(X_train,y_train),i))

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

for i in range(1,100):
    X_train,X_test,Y_train,Y_test = train_test_split(features,label,random_state=i,test_size=0.2)
    
    model = LinearRegression()
    model.fit(X_train,Y_train)
    
    test_score = model.score(X_test,Y_test)
    train_score = model.score(X_train,Y_train)
    
    if test_score > train_score:
        print("Train Score:{} Test Score:{} for random state:{}".format(train_score,test_score,i))

Train Score:0.6434822518893747 Test Score:0.6531597816618034 for random state:3
Train Score:0.6439298215129546 Test Score:0.6509064272123612 for random state:5
Train Score:0.6430637891938789 Test Score:0.6544671065257387 for random state:8
Train Score:0.644983145129434 Test Score:0.6465190215794524 for random state:10
Train Score:0.6445453250410282 Test Score:0.6480365654556466 for random state:11
Train Score:0.6449183829938719 Test Score:0.647121435461011 for random state:13
Train Score:0.6439357679582444 Test Score:0.651283199337908 for random state:22
Train Score:0.6419750701793757 Test Score:0.6585429403428698 for random state:23
Train Score:0.6422946563674216 Test Score:0.6577792838981762 for random state:24
Train Score:0.6448575025444155 Test Score:0.6475160987064845 for random state:25
Train Score:0.6438363934963036 Test Score:0.6517508598209304 for random state:26
Train Score:0.6446083070841679 Test Score:0.6484852997064094 for random state:27
Train Score:0.6400143700748325 Tes

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train,X_test,Y_train,Y_test = train_test_split(features,label,random_state=44,test_size=0.2)

model = LinearRegression()
model.fit(X_train,Y_train)
    
test_score = model.score(X_test,Y_test)
train_score = model.score(X_train,Y_train)
    
print("Train Score:{} Test Score:{} ".format(train_score,test_score))

Train Score:0.641091937856301 Test Score:0.6617774292863091 


In [22]:
# We are getting test score of 66 percent for linear Regression

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

for i in range(1,100):
    X_train,X_test,Y_train,Y_test = train_test_split(features,label,random_state=i,test_size=0.2)
    
    model2 = DecisionTreeRegressor(max_depth = 6)
    model2.fit(X_train,Y_train)
    
    test_score = model2.score(X_test,Y_test)
    train_score = model2.score(X_train,Y_train)
    
    if test_score > train_score:
        print("Train Score:{} Test Score:{} for random state:{}".format(train_score,test_score,i))

Train Score:0.6731657484836934 Test Score:0.6739065966586669 for random state:23
Train Score:0.6800030852440735 Test Score:0.6850201108942348 for random state:33
Train Score:0.6743657240472378 Test Score:0.6767271621730004 for random state:63


In [45]:
X_train,X_test,Y_train,Y_test = train_test_split(features,label,random_state=33,test_size=0.2)
    
model2 = DecisionTreeRegressor(max_depth = 6)
model2.fit(X_train,Y_train)
    
test_score = model2.score(X_test,Y_test)
train_score = model2.score(X_train,Y_train)
    
print("Train Score:{} Test Score:{} ".format(train_score,test_score))

Train Score:0.6800030852440735 Test Score:0.6818326871882512 


In [46]:
# We are getting test score of 68 percent for Decision Tree Regressor

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

for i in range(1,100):
    X_train,X_test,Y_train,Y_test = train_test_split(features,label,random_state=i,test_size=0.2)
    
    model3 = RandomForestRegressor(n_estimators=10,max_depth=6)
    model3.fit(X_train,Y_train)
    
    test_score = model3.score(X_test,Y_test)
    train_score = model3.score(X_train,Y_train)
    
    if test_score > train_score:
        print("Train Score:{} Test Score:{} for random state:{}".format(train_score,test_score,i))

Train Score:0.6972932301073871 Test Score:0.6983872139219761 for random state:23
Train Score:0.704098798566442 Test Score:0.7092819167944047 for random state:33
Train Score:0.7030918114320699 Test Score:0.7054341556008825 for random state:37
Train Score:0.7035028721348131 Test Score:0.7046509335784883 for random state:60
Train Score:0.7033080499537416 Test Score:0.7048352615408466 for random state:63
Train Score:0.7024826836837803 Test Score:0.7025803826863322 for random state:66


In [51]:
X_train,X_test,Y_train,Y_test = train_test_split(features,label,random_state=33,test_size=0.2)
    
model3 = RandomForestRegressor(n_estimators=10,max_depth=6)
model3.fit(X_train,Y_train)
    
test_score = model3.score(X_test,Y_test)
train_score = model3.score(X_train,Y_train)
    
print("Train Score:{} Test Score:{}".format(train_score,test_score))

Train Score:0.7031038586999623 Test Score:0.713081263503918


In [52]:
# We are getting test score of 71 percent for Random Forest Classifier