# Regression practice

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.datasets import fetch_california_housing, load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator so any NumPy-based randomness repeats across runs
np.random.seed(0)

# Fetch the California housing data as pandas objects, then separate
# the feature table (X) from the regression target (y)
california = fetch_california_housing(as_frame=True)
X = california.data
y = california.target

## Task:
- select an appropriate data split
- transform your data for model input
- train a linear regression model
- experiment with various metrics

In [None]:
# EDA: preview the first five rows of the feature table
X.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [None]:
# Split: hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Data transformation: fit the scaler on the training rows only, then apply the same
# fitted transform to the test rows so no test-set statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Model training: one model on standardized features, one on the raw features, for comparison
model = LinearRegression().fit(X_train_scaled, y_train)
model2 = LinearRegression().fit(X_train, y_train)

# Model inference: each model is given test features prepared the same way as its training data
y_pred = model.predict(X_test_scaled)
y_pred2 = model2.predict(X_test)

# Metrics need one prediction per test target; fail early with a clear message if
# predictions were accidentally made on a different split
assert len(y_pred) == len(y_test), 'y_pred must have one prediction per row of y_test'
assert len(y_pred2) == len(y_test), 'y_pred2 must have one prediction per row of y_test'


# Model evaluation
# MSE is reported here; other candidates are MAE (mean_absolute_error),
# RMSE (square root of the MSE) and R^2 (r2_score).
# Results are stored under descriptive names rather than `eval`, which would
# shadow the Python builtin of the same name for the rest of the kernel session.
# Ordinary least squares with an intercept is unaffected by standardizing the
# features, so the two values are expected to agree up to floating-point error.
mse_scaled = mean_squared_error(y_test, y_pred)
mse_unscaled = mean_squared_error(y_test, y_pred2)
print(f'MSE with scaled features is {mse_scaled}')
print(f'MSE with raw features is {mse_unscaled}')

ValueError: Found input variables with inconsistent numbers of samples: [4128, 16512]

In [None]:
# Same workflow on the diabetes dataset, with a naive baseline for comparison.
# Note: X and y are rebound here, replacing the California housing data loaded earlier.
diabetes_data = load_diabetes(as_frame=True)
X, y = diabetes_data.data, diabetes_data.target

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the unscaled training features
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on both splits, so train and test error can be compared
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_test)

# Baseline: predict the training-set mean target for every test row
mean_pred = np.full(y_test.shape, y_train.mean())

mse_train = mean_squared_error(y_train, y_train_pred)
mse = mean_squared_error(y_test, y_pred)
mse_baseline = mean_squared_error(y_test, mean_pred)

print(f'Mean squared error is {mse}')
print(f'Baseline MSE is {mse_baseline}')
print(f'MSE on train set is {mse_train}')

Mean squared error is 2900.1936284934814
Baseline MSE is 5361.533457238513
MSE on train set is 2868.5497028355776


In [None]:
# Pairwise correlation between the feature columns currently held in X
correlation_matrix = X.corr()

# Correlations lie in [-1, 1]: pin the colour range to that interval and use a
# diverging palette so zero sits at the neutral midpoint and sign is readable at a glance.
# A title lets the figure stand alone when the notebook is skimmed.
fig = px.imshow(
    correlation_matrix,
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu_r',
    title='Feature correlation matrix',
)
fig