In [1]:
# In data preprocessing, a transformation means changing the form, scale, or representation
# of your raw data so that it becomes clean, consistent, and understandable for a model.

# A transformation is any mathematical or logical operation applied to raw data to make it suitable for analysis or modeling.  

In [1]:
import pandas as pd 
import numpy as np 

#-----------------------BELOW IS SHUFFLE PART---------------------------

In [2]:
df = pd.read_csv("4CF_Data.csv")    # Load the raw data; the next cells give a quick overview of it.

In [3]:
df.shape                                      # We have 20640 rows, 10 columns.

(20640, 10)

In [4]:
df.info()                                               # total_bedrooms has missing values (20433 non-null out of 20640).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
df.head()    # Preview the first five rows.

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# NOTE: only the last expression of a cell is displayed, so the df.dtypes result below is not shown.
df.dtypes                        # ocean_proximity is the only non-numeric (object) column.
df['ocean_proximity'].value_counts()   # 5 categories in ocean_proximity; ISLAND is very rare (only 5 rows).

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [7]:
df.isnull().sum()         # 207 missing values, all of them in total_bedrooms.

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [8]:
df.columns    # List the column names.

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [9]:
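# Earlier we had created a new column named income_cat to categorise the median house data
# (presumably the median income - TODO confirm) into 5 different bins.
# NOTE: that column is not present in this DataFrame (see df.columns above).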

In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split: ocean_proximity is passed as the stratification target so that
# its categories are represented in both sets; random_state=42 makes the split reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# With n_splits=1 the generator yields exactly one (train, test) pair, so take it directly.
train_index, test_index = next(split.split(df, df["ocean_proximity"]))

# split() yields integer *positions*, not index labels, so the rows are selected with .iloc.
# (.loc returned the same rows only because df has a default 0..n-1 RangeIndex.)
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

# The split is stratified on ocean_proximity only; df has no income category column
# (see df.columns), so income category plays no part in this split.

#-----------------------------END OF THE SHUFFLE PART---------------------------

In [11]:
# NOW THAT WE HAVE OUR UNBIASED (SHUFFLED) DATA -------------- STEPS TO PREPROCESS THE DATA --------------

In [12]:
# Step 1. 

# Before you apply any machine learning algorithm (like Linear Regression, Random Forest, etc.) to your data,
# you must first separate the features (X) from the label (y).

In [13]:
# We will put our test data aside and work on the training data. Start by creating a copy of
# the training data, so that we don't compromise our original data in any way.

TraiSet_copy = strat_train_set.copy()                  # Copy created

In [14]:
TraiSet_copy.head()                     # median_house_value is the label (target) column in this data.

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND
5039,-118.34,33.99,34.0,397.0,132.0,250.0,121.0,1.675,166700.0,<1H OCEAN
10341,-117.8,33.78,17.0,4138.0,805.0,2442.0,780.0,4.7804,242000.0,<1H OCEAN
16325,-121.33,37.98,10.0,1564.0,397.0,643.0,347.0,2.7031,150000.0,INLAND
10209,-117.92,33.88,32.0,1683.0,273.0,719.0,263.0,5.3649,243600.0,<1H OCEAN


In [15]:
# Step 2.

# Separate the label (median_house_value) from the features (all remaining columns). Before you apply
# any machine learning algorithm (like Linear Regression, Random Forest, etc.) to your data, you must
# first separate the features (X) from the label (y).

In [16]:
# Derive features and label from the working copy (TraiSet_copy) created earlier, rather than
# from the original training set. drop() returns a new DataFrame, so TraiSet_copy itself
# still contains the label column afterwards.
data_feature = TraiSet_copy.drop(columns="median_house_value")    # features (X)
data_label = TraiSet_copy["median_house_value"].copy()            # label (y)

In [17]:
data_label    # The label (y): median_house_value for the 16512 training rows.

2271      96500.0
5039     166700.0
10341    242000.0
16325    150000.0
10209    243600.0
           ...   
4827     227800.0
17532    350000.0
4534     141800.0
4665     193800.0
19307    244100.0
Name: median_house_value, Length: 16512, dtype: float64

In [18]:
data_feature    # The features (X): every column except median_house_value.

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
2271,-119.80,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND
5039,-118.34,33.99,34.0,397.0,132.0,250.0,121.0,1.6750,<1H OCEAN
10341,-117.80,33.78,17.0,4138.0,805.0,2442.0,780.0,4.7804,<1H OCEAN
16325,-121.33,37.98,10.0,1564.0,397.0,643.0,347.0,2.7031,INLAND
10209,-117.92,33.88,32.0,1683.0,273.0,719.0,263.0,5.3649,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
4827,-118.32,34.03,31.0,2206.0,501.0,1194.0,435.0,1.9531,<1H OCEAN
17532,-121.89,37.34,20.0,1106.0,494.0,851.0,448.0,0.8894,<1H OCEAN
4534,-118.21,34.03,45.0,1860.0,472.0,1893.0,456.0,2.6573,<1H OCEAN
4665,-118.30,34.05,31.0,1744.0,720.0,2034.0,633.0,2.2684,<1H OCEAN


In [19]:
# Step 3. 

# Handling Missing Data / NaN. Will use SimpleImputer from Scikit-Learn.
# This is a scikit-learn class used to automatically replace missing values
# in a dataset with some computed value (like mean, median, or mode).

from sklearn.impute import SimpleImputer

# Median imputation: each NaN is replaced with the median of its own column.
imputer = SimpleImputer(strategy="median")

# The median cannot be computed for text, so keep only the numeric columns (this excludes
# ocean_proximity). The columns are taken from data_feature rather than strat_train_set so
# that the label (median_house_value) is not passed through the feature imputer.
strat_train_set_num = data_feature.select_dtypes(include=[np.number])

# fit() only learns the per-column medians from the numeric training features;
# nothing is replaced until transform() is called.
imputer.fit(strat_train_set_num)


In [20]:
# Once the imputer is fitted, apply the learned medians to the data with transform().
x = imputer.transform(strat_train_set_num) 

# x now holds the transformed (imputed) data: the NaNs have been handled.
# NOTE(review): by default transform() returns a plain NumPy array, so the column names and
# the index are not carried over - confirm whether a DataFrame is needed downstream.

In [64]:
# Rule of thumb! 

# “Fit learns; Transform applies.”
# Fit = remember what to replace
# Transform = actually replace it.

In [1]:
# Other available strategies:

# "mean" – replaces with mean value

# "most_frequent" – for the most common value (can handle categorical)

# "constant" – fills with a fixed value that you supply through the fill_value parameter