In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
housing_data = pd.read_csv('Documents/dataset/housing.csv')
#gives the first 5 records 
housing_data.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


In [4]:
#drops the rows which has unavailable data(missing values)

housing_data = housing_data.dropna()

In [5]:
housing_data.shape

(20433, 10)

In [6]:
#to check how many of the median_house_value records are 500001
#958 records are of same upper value. Something like this could skew the training of our machine learning model. 
#so it's best to drop these records from dataset.
#working with skewed data where lots of records are skewed around the same value, there are techiniques to work around such data

housing_data.loc[housing_data['median_house_value'] == 500001].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [7]:
#simply ignore/drop the skewed data
#first calculate the index positions of all of the records(inner bracket function)

housing_data = housing_data.drop(housing_data.loc[housing_data['median_house_value'] == 500001].index)

In [8]:
housing_data.shape

(19475, 10)

In [9]:
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
housing_data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [11]:
#convert categorical values to numeric using one-hot encoding
housing_data = pd.get_dummies(housing_data, columns = ['ocean_proximity'])

In [12]:
housing_data.shape

(19475, 14)

In [13]:
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
3510,-118.46,34.25,32.0,2217.0,422.0,1064.0,427.0,3.6989,208600.0,1,0,0,0,0
19532,-120.9,37.66,19.0,3377.0,669.0,2426.0,663.0,2.9783,82500.0,0,1,0,0,0
15388,-117.05,33.26,14.0,3103.0,569.0,1704.0,539.0,3.7644,264700.0,1,0,0,0,0
8081,-118.19,33.81,21.0,1835.0,427.0,1038.0,384.0,4.4559,198500.0,0,0,0,0,1
12558,-121.45,38.54,47.0,1159.0,250.0,810.0,244.0,2.7787,56000.0,0,1,0,0,0


In [15]:
# the dataset used for Regression can also be used for classification by calculating the median. And then predicting whether a
# particular neighbourhood has a house price above the median value or below the median value.

median = housing_data['median_house_value'].median()

median

173800.0

In [16]:
#add a new column-above_median with boolean values. It'll contain the value true if the value of a house is above the median
#false othserwise.

housing_data['above_median'] = (housing_data['median_house_value'] - median)>0

In [18]:
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
7379,-118.24,33.97,41.0,1182.0,346.0,1644.0,346.0,2.1473,115100.0,1,0,0,0,0,False
15458,-117.26,33.18,9.0,4540.0,793.0,2235.0,746.0,4.5781,225600.0,0,0,0,0,1,True
13628,-117.33,34.09,29.0,1960.0,415.0,1681.0,435.0,2.9292,84500.0,0,1,0,0,0,False
9691,-121.66,36.7,33.0,3252.0,630.0,2010.0,641.0,3.4222,158100.0,1,0,0,0,0,False
16963,-122.31,37.54,46.0,2444.0,397.0,952.0,402.0,4.75,388200.0,0,0,0,0,1,True


In [19]:
X = housing_data.drop(['median_house_value','above_median'], axis=1)

Y = housing_data['above_median']

In [20]:
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [21]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.2, random_state=0 )

In [22]:
x_train.shape, x_test.shape

((15580, 13), (3895, 13))

In [23]:
from sklearn.linear_model import LogisticRegression
#Solver specifies to scikit learn the alogirthm that it should use under the hood to solve this logistic regression optimization problem.
#The algorithm to use in the optimization problem - the liblinear solver is a good choice for small datasets 
#and binary classification.

logistic_model = LogisticRegression(solver= 'liblinear').fit(x_train, y_train)
logistic_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
#R-square is the measure of how well our linear model captures the underlying variation in our training data.

print("Training Score : ", logistic_model.score(x_train, y_train))

Training Score :  0.8207317073170731


In [26]:
y_pred=logistic_model.predict(x_test)

In [27]:
#create a dataframe to view the predicted and the actual scores.

y_pred_actual = pd.DataFrame({'predicted':y_pred, 'actual':y_test})

y_pred_actual.head(10)

Unnamed: 0,predicted,actual
19234,True,True
1859,False,False
51,False,False
11192,False,True
20355,False,False
13937,False,False
7965,False,False
9446,False,False
11377,True,True
10365,True,True


In [30]:
from sklearn.metrics import accuracy_score

print("Testing Score: ", accuracy_score(y_test,y_pred))

Testing Score:  0.8189987163029525
