In [1]:
"""
importing neceaasry libraries 
"""
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')

In [2]:
"""
reading the data
"""
df = pd.read_csv('housing.csv')

# Data Preparation

In [3]:
"""
selecting relevant features 
"""

features = ['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms',
            'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']

data = df[features]

"""
Feature Engineering
"""
data['rooms_per_household'] = (data['total_rooms'] / data['households'])

data['bedrooms_per_room'] = (data['total_bedrooms'] / data['total_rooms'])

data['population_per_household'] = (data['population'] / data['households'])

data['above_average'] = (data['median_house_value'] > data['median_house_value'].mean()).astype('int')



data.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556,1
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842,1
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226,1
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945,1
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467,1


In [4]:
"""
Mode Ocean Proximity
"""
data['ocean_proximity'].mode()

0    <1H OCEAN
dtype: object

In [5]:
"""
Splitting the data into:
Training data, Validation data and Testing data
"""

df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_train = df_train.above_average.values 
y_val = df_val.above_average.values 
y_test = df_test.above_average.values 

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [6]:
"""
Correlation Matrix for numerical columns 
"""
numerical = ['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms',
            'population', 'households', 'median_income', 'median_house_value', 'rooms_per_household',
             'bedrooms_per_room', 'population_per_household']

corr = data[numerical].corr().unstack().sort_values(ascending=False) 
corr[corr<1].iloc[:1]

total_bedrooms  households    0.979728
dtype: float64

Features with the biggest correlation are: total_bedrooms and households

In [7]:
"""
Calculating mutual info score
"""
categorical = ['ocean_proximity']

def mutual_score(col):
    """
    This function calculates the mutual info score between the target column and a specified categorical column
    params: categorical column
    returns: mutual info score
    rtype: float
    """
    mi = mutual_info_score(y_train, col)
    return round(mi, 2)

df_train[categorical].apply(mutual_score)

ocean_proximity    0.1
dtype: float64

In [8]:
"""
training and testing the model
"""
#DictVectorizer performs one hot enconding on categorical columns
dv = DictVectorizer(sparse=False)

#creating an instance of the logistic regression model
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

#convert train data to dictionary
train_dict = df_train.to_dict(orient='records')

#one hot encoding
X_train = dv.fit_transform(train_dict)

#Training the model 
model.fit(X_train, y_train)

#convert validation data to dictionary
val_dict = df_val.to_dict(orient='records')

#one hot encoding
X_val = dv.transform(val_dict)

#predicting for the validation data
y_pred = model.predict(X_val)

#checking the accuracy
score = round(accuracy_score(y_pred, y_val),2)
print(score)

0.84


In [9]:
"""
Training and testing by feature omission
"""
real_score = score
columns = df_train.columns.to_list()

for col in columns:
    cols = columns.copy()
    cols.remove(col)
    train_dict = df_train[cols].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

    model.fit(X_train, y_train)
    
    val_dict = df_val[cols].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    y_pred = model.predict(X_val)
    
    score = accuracy_score(y_pred, y_val)
    print(f"{col}: score: {score},        difference: {real_score - score}")

latitude: score: 0.8323643410852714,        difference: 0.007635658914728616
longitude: score: 0.8318798449612403,        difference: 0.008120155038759624
housing_median_age: score: 0.8316375968992248,        difference: 0.008362403100775184
total_rooms: score: 0.8362403100775194,        difference: 0.003759689922480547
total_bedrooms: score: 0.8372093023255814,        difference: 0.0027906976744185297
population: score: 0.8263081395348837,        difference: 0.013691860465116279
households: score: 0.8340600775193798,        difference: 0.005939922480620141
median_income: score: 0.7853682170542635,        difference: 0.05463178294573645
ocean_proximity: score: 0.8202519379844961,        difference: 0.01974806201550383
rooms_per_household: score: 0.8352713178294574,        difference: 0.004728682170542564
bedrooms_per_room: score: 0.8362403100775194,        difference: 0.003759689922480547
population_per_household: score: 0.8357558139534884,        difference: 0.0042441860465115555


total rooms has the smallest difference 

In [10]:
"""
Using Ridge Regression to check accuracy
"""
#Log transformation of target column
df['median_house_value'] = np.log1p(df['median_house_value'])

#splitting the data
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_train = df_train['median_house_value'].values
y_val = df_val['median_house_value'].values
y_test = df_test['median_house_value'].values

del df_train['median_house_value']
del df_test['median_house_value']
del df_val['median_house_value']

#preparing data for training and testing
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')

X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

#Calculating rmse for all alphas
mse = mean_squared_error

tune_param = [0, 0.01, 0.1, 1, 10]
for a in tune_param:    
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)    
    score = np.sqrt(mse(y_pred, y_val))
        
    print(f"The score for tune parameter {a}: {round(score,3)}")




The score for tune parameter 0: 0.525
The score for tune parameter 0.01: 0.525
The score for tune parameter 0.1: 0.525
The score for tune parameter 1: 0.525
The score for tune parameter 10: 0.525
