# 1. Data preparation

In [None]:
import os
import tarfile
import requests

DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"
DATA_PATH = "./housing.tgz"

r = requests.get(DOWNLOAD_URL)
with open(DATA_PATH, "wb") as raw_file:
    raw_file.write(r.content)

data = tarfile.open(DATA_PATH)
data.extractall(path="./")
data.close()


In [1]:
import pandas as pd
import numpy as np

from uetai.logger import WandbLogger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
housing = pd.read_csv("housing.csv")
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# init logger
logger = WandbLogger(project_name='study-case-2')

[34m[1mwandb[0m: Currently logged in as: [33mnmd2000[0m (use `wandb login --relogin` to force relogin)


In [18]:
from sklearn.datasets import fetch_california_housing

housing_data = fetch_california_housing()

In [21]:
descr = housing_data['DESCR']
feature_names = housing_data['feature_names']

data = housing_data['data']
target = housing_data['target']
df1 = pd.DataFrame(data=data)

df1.rename(columns={0: feature_names[0], 1: feature_names[1], 2: feature_names[2], 3: feature_names[3],
            4: feature_names[4], 5: feature_names[5], 6: feature_names[6], 7: feature_names[7]}, inplace=True)

df2 = pd.DataFrame(data=target)
df2.rename(columns={0: 'Target'}, inplace=True)
housing = pd.concat([df1, df2], axis=1)

In [23]:
housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
# logging dataset and profiling it
logger.log_artifact(
    artifact_name='housing',
    artifact_path='housing.csv',
    auto_profiling=True,
)

Summarize dataset: 100%|██████████| 105/105 [00:06<00:00, 16.06it/s, Completed]                                    
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 184.20it/s]


<wandb.sdk.wandb_artifacts.Artifact at 0x7fb28cad6e20>

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(housing.loc[:, housing.columns != 'Target'], housing['Target'], random_state=66)

# Train a simple Tree

In [25]:
from sklearn.ensemble import RandomForestRegressor

rand_reg = RandomForestRegressor()
rand_reg.fit(X_train, y_train)

In [31]:
from sklearn.model_selection import cross_val_score

cross_val_score(rand_reg, X_test, y_test, cv=2)

array([0.77621969, 0.73847789])

In [34]:
import shap

explainer = shap.TreeExplainer(rand_reg)
logger.shap_summary_plot(explainer=explainer, X_test=X_test, attributes=feature_names)