## Using Logistic Regression to classify California house prices

In [2]:
import sklearn
import pandas as pd
import matplotlib.pyplot as plt

In [18]:
# Load the California housing CSV into a DataFrame and preview five
# randomly chosen records (the preview differs from run to run).
housing_data = pd.read_csv(filepath_or_buffer='datasets/housing.csv')
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
417,-122.26,37.9,37.0,2220.0,335.0,903.0,362.0,7.8336,371300.0,NEAR BAY
136,-122.19,37.83,29.0,1791.0,271.0,661.0,269.0,6.8538,368900.0,NEAR BAY
17840,-121.89,37.44,8.0,2534.0,,1527.0,364.0,7.8532,422800.0,<1H OCEAN
8309,-118.11,33.77,15.0,9103.0,1847.0,3333.0,1712.0,5.1508,367300.0,NEAR OCEAN
12164,-117.22,33.74,7.0,1810.0,386.0,931.0,355.0,2.5221,109200.0,<1H OCEAN


In [19]:
# Discard every row that contains at least one missing value, then report
# the remaining (rows, columns).
housing_data = housing_data.dropna(axis=0, how='any')
housing_data.shape

(20433, 10)

In [20]:
# House prices are capped at 500001; remove the records sitting exactly on
# that cap so they are not used for classification.
is_capped = housing_data['median_house_value'] == 500001
housing_data = housing_data.drop(index=housing_data.index[is_capped])

In [21]:
# (rows, columns) remaining after the capped-price records were removed
housing_data.shape

(19475, 10)

In [22]:
# peek at the first rows of the cleaned data
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [24]:
# Replace the categorical ocean_proximity column with one indicator column
# per category (one-hot encoding).
housing_data = pd.get_dummies(data=housing_data, columns=['ocean_proximity'])

In [25]:
# (rows, columns) after one-hot encoding ocean_proximity
housing_data.shape

(19475, 14)

In [26]:
# inspect the new ocean_proximity_* indicator columns
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [27]:
# Convert the regression problem into a classification problem: the median
# house value is the threshold that separates the two classes.
median = housing_data.loc[:, 'median_house_value'].median()
median

173800.0

In [30]:
# Add the boolean target column: True when a record's house value is
# strictly greater than the median, False otherwise.
housing_data['above_median'] = housing_data['median_house_value'].gt(median)

In [31]:
# spot-check a few random rows, including the new above_median column
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
20545,-121.75,38.55,26.0,4802.0,950.0,2199.0,939.0,3.7452,227700.0,0,1,0,0,0,True
15010,-117.03,32.75,24.0,7879.0,1655.0,3898.0,1534.0,3.0897,187300.0,0,0,0,0,1,True
1564,-121.92,37.74,8.0,452.0,51.0,140.0,43.0,12.5915,432400.0,1,0,0,0,0,True
16017,-122.46,37.72,52.0,2951.0,406.0,1115.0,397.0,6.7228,405200.0,0,0,0,0,1,True
13570,-117.29,34.14,52.0,1683.0,266.0,646.0,256.0,4.0481,97300.0,0,1,0,0,0,False


In [33]:
# X holds the input features: every column except the raw price and the
# label derived from it (keeping either would leak the answer to the model).
# Y holds the target labels: the boolean above_median column.
X = housing_data.drop(columns=['median_house_value', 'above_median'])
Y = housing_data.above_median

In [36]:
# list the names of the feature columns that will be fed to the model
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [37]:
# Split the data into training (80%) and test (20%) sets.
# random_state pins the shuffle so the split, and therefore every score
# reported below, is reproducible across kernel restarts.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

In [38]:
# (rows, columns) of the training and test feature sets
x_train.shape, x_test.shape

((15580, 13), (3895, 13))

In [39]:
# number of labels in the training and test sets; should match the row counts of x_train / x_test
y_train.shape, y_test.shape

((15580,), (3895,))

In [40]:
# import logistic regression estimator
from sklearn.linear_model import LogisticRegression

In [42]:
# Train the model.
# The liblinear solver is a good choice for small datasets and binary
# classification problems.
logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(x_train, y_train)  # fit() returns the fitted estimator

In [43]:
# Mean accuracy of the fitted model on the data it was trained on.
train_score = logistic_model.score(x_train, y_train)
print(f'Training Score: {train_score}')

Training Score: 0.8194480102695764


In [45]:
# predict the above_median label for every record in the held-out test set
y_pred = logistic_model.predict(x_test)

In [48]:
# Put predictions next to the true labels for a quick visual comparison.
# The predictions are positional, so give them y_test's index to line each
# one up with the record it was made for.
predicted = pd.Series(y_pred, index=y_test.index)
df_pred_actual = pd.DataFrame(data={'predicted': predicted, 'actual': y_test})
df_pred_actual.sample(n=10)

Unnamed: 0,predicted,actual
16030,True,True
6795,True,False
20234,True,True
11553,True,True
10832,True,True
7254,False,False
15387,True,True
15557,False,False
1542,True,True
7923,True,True


In [50]:
# evaluate model accuracy on test data
from sklearn.metrics import accuracy_score
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')

Accuracy Score: 0.8233632862644416
