In [39]:
import modin.pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import ydata_profiling


In [21]:
%matplotlib inline


In [22]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [23]:
type(housing)

sklearn.utils._bunch.Bunch

In [24]:
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [25]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [26]:
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [27]:
print(housing.target)

[4.526 3.585 3.521 ... 0.923 0.847 0.894]


In [28]:
print(housing.data)

[[   8.3252       41.            6.98412698 ...    2.55555556
    37.88       -122.23      ]
 [   8.3014       21.            6.23813708 ...    2.10984183
    37.86       -122.22      ]
 [   7.2574       52.            8.28813559 ...    2.80225989
    37.85       -122.24      ]
 ...
 [   1.7          17.            5.20554273 ...    2.3256351
    39.43       -121.22      ]
 [   1.8672       18.            5.32951289 ...    2.12320917
    39.43       -121.32      ]
 [   2.3886       16.            5.25471698 ...    2.61698113
    39.37       -121.24      ]]


# Prepare the data

In [29]:
df = pd.DataFrame(housing.data, columns=housing.feature_names)

In [30]:
type(df)

modin.pandas.dataframe.DataFrame

In [31]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [32]:
from ydata_profiling import ProfileReport

In [33]:
df['price'] = housing.target

In [34]:
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'price'],
      dtype='object')

In [35]:
df.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   price       20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [36]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


## Nulls

In [37]:
df.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
price         0
dtype: int64

### No Nulls YAAYYY

---

# Exploratory data analysis

In [40]:
pr = ProfileReport(df)
pr

AttributeError: 'DataFrame' object has no attribute 'rdd'

In [41]:
df.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,price
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
price,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


In [None]:
import folium
from folium.plugins import HeatMap

# Define the center of the map based on average latitude & longitude in `df`
map_center = [df["Latitude"].mean(), df["Longitude"].mean()]
m = folium.Map(location=map_center, zoom_start=10)  # Create the base map

# Prepare data for heatmap (Latitude, Longitude, Price)
heat_data = [[row["Latitude"], row["Longitude"], row["price"]] for _, row in df.iterrows()]

# Add heatmap layer to the map
HeatMap(heat_data).add_to(m)

# Show the heatmap
m


# Outliers

In [None]:
fig, ax = plt.subplots(figsize = (15,15))
sns.boxplot(data=df, ax=ax)
plt.show()

# Split into inde and dependent features

In [None]:
x = df.iloc[ : , :-1 ]
y = df.iloc[ : , -1 ]

In [None]:
x

In [None]:
y

## Split into train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state = 42)

# Standardization

## 1. Standard Scaler
## 2. MinMaxScaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
XtrainNorm = scaler.fit_transform(Xtrain)

In [None]:
XtrainNorm

In [None]:
fig, ax = plt.subplots(figsize = (15,15))
sns.boxplot(data=XtrainNorm, ax=ax)
plt.show()

In [None]:
XtestNorm = scaler.transform(Xtest)

In [None]:
fig, ax = plt.subplots(figsize = (15,15))
sns.boxplot(data=XtestNorm, ax=ax)
plt.show()

## Train set -> fit_transform
## Test set -> transform

# WHY?

# Training Set vs. Test Set in Data Preprocessing

## Training Set (`fit_transform`)
When working with data preprocessing techniques like scaling or normalization, the `fit_transform` method is used on the training data.

- The `fit` step **learns** the parameters (e.g., mean and standard deviation for scaling).
- The `transform` step **applies** those learned parameters to the training data.

## Test Set (`transform`)
The test set should **never** be used to learn parameters, as it would introduce **data leakage**.

- Instead, we apply the **same transformation** learned from the training set to ensure consistency.
- This ensures that the test data is processed in the same way as the training data, preventing **bias**.


# Why don't we transform before train-test split?

We **don't transform before splitting** to **prevent data leakage** and ensure the model generalizes well.

## Why?
1. **Avoiding Data Leakage**  
   - If we apply transformations before splitting, we use information from the entire dataset, including the test set.
   - This introduces bias, leading to overly optimistic performance estimates.

2. **Ensuring Real-World Generalization**  
   - In real-world scenarios, new data won't have access to the entire dataset's statistics.
   - By fitting transformations **only on the training set**, we mimic real-world conditions.

3. **Consistent Scaling**  
   - If we normalize the entire dataset before splitting, the test set might have different statistical properties.
   - Instead, we **fit** the transformation on the training set and **apply** it to the test set.




---
---

# Standard Normal Form

The **Standard Normal Form** refers to a **normal distribution** with a mean of **zero** and a standard deviation of **one**. It is often denoted as:

$$ X \sim \mathcal{N}(0,1) $$

## **Equation of Standard Normal Distribution**
The probability density function (PDF) of the standard normal distribution is given by:

$$ f(x) = \frac{1}{\sqrt{2\pi}} e^{-\frac{x^2}{2}} $$

where:
- \( x \) represents the variable.
- \( \pi \) is the mathematical constant (~3.14159).
- \( e \) is Euler's number (~2.71828).

## **Why Standard Normal Form?**
- It simplifies statistical calculations.
- Any normal distribution can be **converted** to standard normal form using **z-score transformation**:

$$ Z = \frac{X - \mu}{\sigma} $$

where:
- \( X \) is the original value.
- \( \mu \) is the mean of the distribution.
- \( \sigma \) is the standard deviation.

Using this transformation, any normal distribution \( \mathcal{N}(\mu, \sigma) \) can be converted into a standard normal distribution \( \mathcal{N}(0,1) \).

Would you like a visual representation or more examples? 😊


---
---

# Model Training

In [None]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

In [None]:
reg.fit( XtrainNorm , ytrain )

In [None]:
print(reg.coef_)

In [None]:
print(reg.intercept_)

In [None]:
from IPython.display import display, Math

def format_regression_equation(intercept, coefficients):
    equation = r"y = {:.2f}".format(intercept)
    for i, beta in enumerate(coefficients):
        equation += r" + {:.2f}x_{}".format(beta, i+1)

    display(Math(equation))

# Example usage:
reg_intercept = reg.intercept_
reg_coef = reg.coef_

format_regression_equation(reg_intercept, reg_coef)


---
---

# Model Prediction

In [None]:
pred = reg.predict( XtestNorm ); pred

### Calculate error

In [None]:
residuals = ytest - pred
residuals

### Distribution plot

In [None]:
sns.displot( residuals , kind='kde'); plt.show()

# Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
print( mean_squared_error(ytest, pred) )
print( mean_absolute_error(ytest, pred) )
print( r2_score(ytest, pred) )

# Save the model

In [None]:
import pickle
pickle.dump(reg , open('LinearRegressionModel.pkl' , 'wb'))

In [None]:
# Model = pickle.load(open('LinearRegressionModel.pkl' , 'rb'))