In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("housing.csv")

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
df["total_bedrooms"].fillna(0, inplace = True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [7]:
df["rooms_per_household"] = df["total_rooms"]/df["households"]
df["bedrooms_per_room"] = df["total_bedrooms"]/df["total_rooms"]
df["population_per_household"] = df["population"]/df["households"]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 20640 non-null  float64
 1   latitude                  20640 non-null  float64
 2   housing_median_age        20640 non-null  float64
 3   total_rooms               20640 non-null  float64
 4   total_bedrooms            20640 non-null  float64
 5   population                20640 non-null  float64
 6   households                20640 non-null  float64
 7   median_income             20640 non-null  float64
 8   median_house_value        20640 non-null  float64
 9   ocean_proximity           20640 non-null  object 
 10  rooms_per_household       20640 non-null  float64
 11  bedrooms_per_room         20640 non-null  float64
 12  population_per_household  20640 non-null  float64
dtypes: float64(12), object(1)
memory usage: 2.0+ MB


In [9]:
df["ocean_proximity"].mode().iloc[0]

'<1H OCEAN'

In [10]:
target = "median_house_value"
y = df[target]
X = df.drop(target, axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [11]:
X_train.corr()[["total_bedrooms", "population", "population_per_household"]].loc[["households", "total_rooms"]]

Unnamed: 0,total_bedrooms,population,population_per_household
households,0.979132,0.907327,-0.032996
total_rooms,0.931785,0.853763,-0.029712


In [12]:
above_average = 1

In [13]:
y_train_new = y_train.map(lambda x: above_average if x>y_train.mean() else 0)

score = mutual_info_score(X_train["ocean_proximity"], y_train_new)

round(score, 2)

0.1

In [14]:
train_dicts = X_train.to_dict(orient = "records")
val_dicts = X_val.to_dict(orient="records")
dv = DictVectorizer(sparse = False)
X_train_new = dv.fit_transform(train_dicts)
X_val_new = dv.fit_transform(val_dicts)

In [15]:
model = LogisticRegression(solver = "liblinear", C = 1.0, max_iter = 1000, random_state = 42)
model.fit(X_train_new, y_train_new)
model.predict(X_val_new)
y_val_new = y_val.map(lambda x: above_average if x> y_val.mean() else 0)
score = round(model.score(X_val_new, y_val_new), 2)
score

0.83

In [16]:
choice = ["total_rooms", "total_bedrooms", "population", "households"]
scores = []
rates = []
for feat in choice: 
    X_temp = X_train.copy()
    X_temp.drop(columns = feat, inplace = True)
    X_temp_new = dv.fit_transform(X_temp.to_dict(orient = "records"))
    mod = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    mod.fit(X_temp_new, y_train_new)
    scores.append(mod.score(X_temp_new, y_train_new))
    rates.append(score - mod.score(X_temp_new, y_train_new))

In [17]:
choice[rates.index(min(rates))]

'households'

In [18]:
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

alpha = [0, 0.01, 0.1, 1, 10]
RMSE = []
for a in alpha:
    model2 = Ridge(alpha = a, solver = "sag", random_state = 42)
    model2.fit(X_train_new, y_train)
    RMSE.append(mean_squared_error(model2.predict(X_val_new), y_val))
    
alpha[RMSE.index(min(RMSE))]

0

In [19]:
RMSE

[0.27404899452935183,
 0.27404899455270554,
 0.27404899476287997,
 0.2740489968471075,
 0.27404901772439055]