## 1.Creating Income Categories

In [1]:
import pandas as pd                     # Import Pandas for data handling
import numpy as np                      # Import NumPy for numerical operations

# Load the dataset
data = pd.read_csv("housing.csv")       # Read CSV file and store it as a DataFrame

# Create income categories
data["income_cat"] = pd.cut(            # Create a new column by binning median_income
    data["median_income"],              # Column to be divided into categories
    bins=[0, 1.5, 3.0, 4.5, 6.0, np.inf],# Define income ranges (bins) #np.inf means infinite value
    labels=[1, 2, 3, 4, 5]               # Assign category labels to each range
)


## 2.Stratified Shuffle Split in Scikit-Learn

##### Scikit-learn provides a built-in way to perform stratified sampling using  StratifiedShuffleSplit .

###### Here’s how you can use it:

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit   # Import class for stratified splitting

# Assume income_cat is already created from median_income
split = StratifiedShuffleSplit(
    n_splits=1,          # Number of train-test splits to generate
    test_size=0.2,       # 20% of data will be used as test set
    random_state=42      # Fix randomness for reproducibility
)

for train_index, test_index in split.split(data, data["income_cat"]):
    strat_train_set = data.loc[train_index]   # Select training data using stratified indices
    strat_test_set = data.loc[test_index]     # Select test data using stratified indices


## 3. Lets remove income category coloumn

In [3]:
# Code to remove income category coloumn
for sett in (strat_train_set , strat_test_set):
    sett.drop("income_cat",axis=1,inplace=True)

In [4]:
strat_train_set
df=strat_train_set.copy() #training dataset ki copy bana lo aur ousko df me save krlo ab hum aone sare kaam df me krege joki humra training dataset h
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12655,-121.46,38.52,29,3873,797.0,2237,706,2.1736,72100,INLAND
15502,-117.23,33.09,7,5320,855.0,2015,768,6.3373,279600,NEAR OCEAN
2908,-119.04,35.37,44,1618,310.0,667,300,2.8750,82700,INLAND
14053,-117.13,32.75,24,1877,519.0,898,483,2.2264,112500,NEAR OCEAN
20496,-118.70,34.28,27,3536,646.0,1837,580,4.4964,238300,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14,6665,1231.0,2026,1001,5.0900,268500,<1H OCEAN
12661,-121.42,38.51,15,7901,1422.0,4769,1418,2.8139,90400,INLAND
19263,-122.72,38.44,48,707,166.0,458,172,3.1797,140400,<1H OCEAN
19140,-122.70,38.31,14,3155,580.0,1208,501,4.1964,258100,<1H OCEAN


In [5]:
strat_test_set.head() # now do not touch this data set we will use this data set only while testing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
5241,-118.39,34.12,29,6447,1012.0,2184,960,8.2816,500001,<1H OCEAN
17352,-120.42,34.89,24,2020,307.0,855,283,5.0099,162500,<1H OCEAN
3505,-118.45,34.25,36,1453,270.0,808,275,4.3839,204600,<1H OCEAN
7777,-118.1,33.91,35,1653,325.0,1072,301,3.2708,159700,<1H OCEAN
14155,-117.07,32.77,38,3779,614.0,1495,614,4.3529,184000,NEAR OCEAN


# Further Preprocessing & Handling Missing Data

Before feeding your data into a machine learning algorithm, you need to clean and
prepare it.

## 1.Prepare Data for Training

It’s best to write transformation functions instead of applying them manually. This ensures:
- Reproducibility on any dataset
- Reusability across projects
- Compatibility with live systems
- Easier experimentation
  
Start by creating a clean copy and separating the predictors and labels:

In [6]:
housing = df.drop("median_house_value", axis=1) # This will delete coloumn median_house_value
housing_labels = df["median_house_value"].copy() # This will store data of coloumn median_house_value

In [7]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29,3873,797.0,2237,706,2.1736,INLAND
15502,-117.23,33.09,7,5320,855.0,2015,768,6.3373,NEAR OCEAN
2908,-119.04,35.37,44,1618,310.0,667,300,2.8750,INLAND
14053,-117.13,32.75,24,1877,519.0,898,483,2.2264,NEAR OCEAN
20496,-118.70,34.28,27,3536,646.0,1837,580,4.4964,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14,6665,1231.0,2026,1001,5.0900,<1H OCEAN
12661,-121.42,38.51,15,7901,1422.0,4769,1418,2.8139,INLAND
19263,-122.72,38.44,48,707,166.0,458,172,3.1797,<1H OCEAN
19140,-122.70,38.31,14,3155,580.0,1208,501,4.1964,<1H OCEAN


In [8]:
housing_labels

12655     72100
15502    279600
2908      82700
14053    112500
20496    238300
          ...  
15174    268500
12661     90400
19263    140400
19140    258100
19773     62700
Name: median_house_value, Length: 16512, dtype: int64

## 2.Handling Missing Data

Some features, like total_bedrooms , contain missing values. You can:

1. Drop rows with missing values
2. Drop the entire column
3. <b>Impute missing values (recommended)</b>

We’ll use option 3 using SimpleImputer from Scikit-Learn, which allows consistent
handling across all datasets (train, test, new data):

In [9]:
from sklearn.impute import SimpleImputer        # Import class to handle missing values

imputer = SimpleImputer(strategy="median")      # Create imputer that replaces NaN with median

housing_num = housing.select_dtypes(
    include=[np.number]                         # Select only numeric columns from dataset #Becoz our data contain categorial data in ocean_proximity
)

imputer.fit(housing_num)                        # Learn median values from numeric data


0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


> <b>This computes the median for each numerical column and stores it in imputer.statistics_ :

In [10]:
imputer.statistics_  # output: this will give haar coloumn ka median jisko hum replace krdenge haar coloumn ki null/NaN value se 

#scikit-learn har learned variable ke aage ye _ (underscore) laga deta hai this is its notation for learned variable

array([-118.51   ,   34.26   ,   29.     , 2119.     ,  433.     ,
       1164.     ,  408.     ,    3.54155])

<b>Now apply the learned medians to transform the data

In [11]:
X = imputer.transform(housing_num) # here we transformed imputer
X # this will give result in numpy array

array([[-1.2146e+02,  3.8520e+01,  2.9000e+01, ...,  2.2370e+03,
         7.0600e+02,  2.1736e+00],
       [-1.1723e+02,  3.3090e+01,  7.0000e+00, ...,  2.0150e+03,
         7.6800e+02,  6.3373e+00],
       [-1.1904e+02,  3.5370e+01,  4.4000e+01, ...,  6.6700e+02,
         3.0000e+02,  2.8750e+00],
       ...,
       [-1.2272e+02,  3.8440e+01,  4.8000e+01, ...,  4.5800e+02,
         1.7200e+02,  3.1797e+00],
       [-1.2270e+02,  3.8310e+01,  1.4000e+01, ...,  1.2080e+03,
         5.0100e+02,  4.1964e+00],
       [-1.2214e+02,  3.9970e+01,  2.7000e+01, ...,  6.2500e+02,
         1.9700e+02,  3.1319e+00]], shape=(16512, 8))

- Other available strategies:
   - "mean" – replaces with mean value
   - "most_frequent" – for the most common value (can handle categorical)
   - "constant" – fill with a fixed value using fill_value=...

### <b>Lets convert this numpy array into dataframe<b>

In [12]:
housing = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964
...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964


# Handling Categorical and Text Attributes in Scikit-Learn

Most machine learning algorithms work best with numerical data. But real-world
datasets often contain <b>categorical</b> or <b>text attributes</b>. Let’s understand how to
handle these in Scikit-Learn using the ocean_proximity column from the
California housing dataset as an example.

####  1. Add ocean_proximity coloumn to the dataset becoze we have removed it for NaN computation

In [14]:
housing["ocean_proximity"]=df["ocean_proximity"]
housing # this is the data we are going to work on

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,<1H OCEAN
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,INLAND
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,<1H OCEAN
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,<1H OCEAN


In [15]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 12655 to 19773
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   ocean_proximity     16512 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


## 1. Categorical Attributes
Text columns like "ocean_proximity" are not free-form text but limited to a fixed
set of values (e.g., "NEAR BAY" , "INLAND" ). These are known as categorical
attributes.

 <b>we want to convert that categorical coloumn to numerical coloumn or numerical data</b>
 
 Example:

In [16]:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head()

Unnamed: 0,ocean_proximity
12655,INLAND
15502,NEAR OCEAN
2908,INLAND
14053,NEAR OCEAN
20496,<1H OCEAN


## 2. Ordinal Encoding (Method 1)
Scikit-Learn’s OrdinalEncoder can convert categories to numbers:

In [17]:
set(housing["ocean_proximity"]) # method 1 :- says replace 1st categorical value with 1 , 2nd with 2 and so on for all categories

{'<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'}

In [18]:
# Method 1 - Ordinal Encoding

from sklearn.preprocessing import OrdinalEncoder     # Import encoder for categorical data

ordinal_encoder = OrdinalEncoder()                   # Create an ordinal encoder object

housing_cat= ordinal_encoder.fit_transform(housing)       # Convert categorical values to numeric codes
housing_cat

array([[2.390e+02, 5.690e+02, 2.800e+01, ..., 7.040e+02, 1.815e+03,
        1.000e+00],
       [6.620e+02, 5.500e+01, 6.000e+00, ..., 7.660e+02, 9.519e+03,
        4.000e+00],
       [4.810e+02, 2.730e+02, 4.300e+01, ..., 2.980e+02, 3.475e+03,
        1.000e+00],
       ...,
       [1.130e+02, 5.610e+02, 4.700e+01, ..., 1.700e+02, 4.239e+03,
        0.000e+00],
       [1.150e+02, 5.480e+02, 1.300e+01, ..., 4.990e+02, 6.403e+03,
        0.000e+00],
       [1.710e+02, 7.010e+02, 2.600e+01, ..., 1.950e+02, 4.107e+03,
        1.000e+00]], shape=(16512, 9))

In [19]:
# Creating df for housing_cat
housing_cat = pd.DataFrame(housing_cat, columns=housing.columns, index=housing.index)
housing_cat # housing categroies are converted into numbers 

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,239.0,569.0,28.0,3555.0,795.0,2167.0,704.0,1815.0,1.0
15502,662.0,55.0,6.0,4411.0,853.0,1965.0,766.0,9519.0,4.0
2908,481.0,273.0,43.0,1410.0,308.0,625.0,298.0,3475.0,1.0
14053,672.0,21.0,23.0,1669.0,517.0,856.0,481.0,1959.0,4.0
20496,515.0,174.0,26.0,3269.0,644.0,1791.0,578.0,6883.0,0.0
...,...,...,...,...,...,...,...,...,...
15174,678.0,49.0,13.0,4852.0,1208.0,1976.0,996.0,7805.0,0.0
12661,243.0,568.0,14.0,5072.0,1354.0,3353.0,1322.0,3341.0,1.0
19263,113.0,561.0,47.0,518.0,164.0,416.0,170.0,4239.0,0.0
19140,115.0,548.0,13.0,2924.0,578.0,1166.0,499.0,6403.0,0.0


> <b>This will output a 2D NumPy array with numerical category codes.

To see the mapping

In [20]:
#ordinal_encoder.categories_
# Output: array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'])

> <b>⚠️ Caution:</b> Ordinal encoding implies an order between categories, which may not
be true here. For example, it treats INLAND (1) as closer to <1H OCEAN (0) than 
NEAR OCEAN (4) , which might not make sense.


> Iss method me problem ye hojayegi ki humari machine esa lern kr legi ki jese ki INLAND (1) is closer to <1H OCEAN (0) but dur hai NEAR OCEAN (4)
, which might not make sense.

## 3. One-Hot Encoding
For unordered categories, <b>one-hot encoding</b> is a better choice. It creates one
binary column per category.

<b>Means</b>- we split these categories ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'] into 5 coloumns aur jahan pr bhi wo 
       coloumn ki value hogi jese kahi pr hai '<1H OCEAN' toh woh wale coloumn pr 1 hojayega baki sare coloumn pr 0 hojayega esa hi haar rows ke liye krdenge which much better
       then One-Hot Encoding 

In [21]:
housing["ocean_proximity"] = df["ocean_proximity"]  #Copies the ocean_proximity column from df , Adds (or overwrites) it inside the housing DataFrame
                                                   #Now housing contains this categorical column

In [22]:
housing=housing[["ocean_proximity"]]  #Keeps only one column: ocean_proximity , All other columns (latitude, longitude, income, etc.) are removed
housing        # housing becomes a single-column DataFrame            

Unnamed: 0,ocean_proximity
12655,INLAND
15502,NEAR OCEAN
2908,INLAND
14053,NEAR OCEAN
20496,<1H OCEAN
...,...
15174,<1H OCEAN
12661,INLAND
19263,<1H OCEAN
19140,<1H OCEAN


In [23]:
set(housing["ocean_proximity"]) # gives unique value of housing

{'<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'}

In [24]:
from sklearn.preprocessing import OneHotEncoder  # Import One-Hot Encoder for categorical data

ordinal_encoder = OneHotEncoder()             # Create a OneHotEncoder object

In [25]:
housing_cat = ordinal_encoder.fit_transform(housing)  # Learn categories from 'housing' and convert them into binary (0/1) columns

> This gives a <b>sparse matrix</b> (efficient storage for mostly zeros).

In [26]:
ordinal_encoder.categories_    # Display the unique categories learned by the encoder

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

##### To convert it to a regular NumPy array:

In [27]:
housing_cat_npArray = housing_cat.toarray() # Convert sparse one-hot encoded data into a NumPy array

In [29]:
housing_cat_npArray

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], shape=(16512, 5))

In [30]:
housing_cat = pd.DataFrame(housing_cat_npArray, columns=['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'], index=housing.index)

> - Converts the one-hot encoded NumPy array into a Pandas DataFrame
>  - Each column represents one category of ocean_proximity
> - index=housing.index keeps rows aligned with the original data

In [31]:
housing_cat

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
12655,0.0,1.0,0.0,0.0,0.0
15502,0.0,0.0,0.0,0.0,1.0
2908,0.0,1.0,0.0,0.0,0.0
14053,0.0,0.0,0.0,0.0,1.0
20496,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
15174,1.0,0.0,0.0,0.0,0.0
12661,0.0,1.0,0.0,0.0,0.0
19263,1.0,0.0,0.0,0.0,0.0
19140,1.0,0.0,0.0,0.0,0.0
