In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the housing table from a CSV path relative to the working directory.
housing_data = pd.read_csv(filepath_or_buffer='datasets/housing.csv')

In [3]:
# Discard every row that has at least one missing value (defaults spelled out).
housing_data = housing_data.dropna(axis=0, how='any')

In [4]:
# Remove the rows whose median_house_value is exactly 500001.
# NOTE(review): presumably this is the dataset's top-coded value -- confirm.
excluded_rows = housing_data.index[housing_data['median_house_value'] == 500001]
housing_data = housing_data.drop(index=excluded_rows)

In [5]:
# List the distinct category labels before they are encoded.
housing_data.loc[:, 'ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [6]:
# Replace the categorical ocean_proximity column with one indicator column
# per category.
housing_data = pd.get_dummies(data=housing_data, columns=['ocean_proximity'])

In [7]:
# (rows, columns) of the frame after row filtering and one-hot encoding.
housing_data.shape

(19475, 14)

In [8]:
# Peek at five randomly chosen rows to check the encoded columns.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
17801,-121.79,37.38,22.0,3650.0,527.0,1637.0,520.0,5.3774,325600.0,1,0,0,0,0
10303,-117.79,33.88,17.0,8562.0,1351.0,3822.0,1316.0,6.0829,252600.0,1,0,0,0,0
5956,-117.87,34.13,32.0,1741.0,373.0,872.0,333.0,3.4219,194500.0,1,0,0,0,0
1377,-122.09,38.02,37.0,1742.0,339.0,1128.0,345.0,3.8824,113700.0,0,0,0,1,0
15429,-117.24,33.21,18.0,1846.0,419.0,1581.0,387.0,3.0982,111300.0,1,0,0,0,0


In [9]:
# Median of the house-value column; displayed as the cell's output.
house_values = housing_data['median_house_value']
median = house_values.median()
median

173800.0

In [11]:
# Boolean label: True where the house value is strictly greater than `median`.
housing_data['above_median'] = housing_data['median_house_value'] > median

In [12]:
# Peek at five randomly chosen rows, now including the above_median label.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
8365,-118.35,33.97,26.0,1725.0,431.0,1130.0,404.0,3.2708,128100.0,1,0,0,0,0,False
18581,-121.77,36.92,9.0,4934.0,1112.0,3198.0,977.0,3.5,194800.0,1,0,0,0,0,True
11960,-117.43,33.91,15.0,14281.0,2511.0,7540.0,2245.0,4.3222,138000.0,0,1,0,0,0,False
13380,-117.61,34.13,21.0,8416.0,1386.0,4308.0,1341.0,4.4611,164600.0,0,1,0,0,0,False
6282,-117.92,34.03,35.0,1469.0,306.0,1285.0,308.0,3.9219,159500.0,1,0,0,0,0,False


In [13]:
# Features: every column except the raw house value and the label derived
# from it.
X = housing_data.drop(columns=['median_house_value', 'above_median'])
# Target: the boolean above-median label.
Y = housing_data['above_median']

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle deterministic, so the same rows land in each split (and the scores
# computed from them are repeatable) on every Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [17]:
# Shapes of the feature matrices for the train and test splits.
tuple(split.shape for split in (x_train, x_test))

((15580, 13), (3895, 13))

In [18]:
# Shapes of the label vectors for the train and test splits.
tuple(split.shape for split in (y_train, y_test))

((15580,), (3895,))

In [19]:
from sklearn.linear_model import LogisticRegression

# Build the estimator first, then fit it on the training split. fit() returns
# the estimator itself, so assigning its result keeps the cell output-free.
classifier = LogisticRegression(solver='liblinear')
logistic_model = classifier.fit(x_train, y_train)

In [20]:
# Mean accuracy of the fitted classifier on the training split.
print('Train_score: ', logistic_model.score(x_train, y_train))

Train_scroe:  0.8162387676508344


In [21]:
# Fitted coefficient array of the model, shown as the cell's output.
logistic_model.coef_

array([[ 6.40418025e-02,  6.34646469e-02,  2.50405548e-02,
        -3.26416463e-04,  2.28837909e-03, -1.21903033e-03,
         3.33813540e-03,  1.26245292e+00,  6.84432624e-01,
        -1.77088862e+00,  1.25377066e-02,  4.37207482e-01,
         5.77491847e-01]])

In [22]:
# Predicted labels for the held-out test features.
y_pred = logistic_model.predict(X=x_test)

In [23]:
# Put predictions next to the true labels for a quick visual comparison.
comparison = {'predicted': y_pred, 'actual': y_test}
df_pred_actual = pd.DataFrame(data=comparison)
df_pred_actual.head(n=10)

Unnamed: 0,predicted,actual
14720,True,True
14757,False,False
8428,True,True
16138,True,True
16169,True,True
12119,False,True
1225,False,False
8967,True,True
15506,True,False
9048,False,False


In [24]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Testing_score', test_accuracy)

Testing_score 0.8359435173299101
