In [1]:
# Stratified Shuffle Split 

# When building machine learning models, one of the most important steps is splitting your dataset into
# training and test sets. This ensures your model is evaluated on data it has never seen before,
# which is critical for assessing its ability to generalize.
    
#--------------------------------------------------------------------------------------------------------
# The Problem of Data Snooping Bias -

# Data snooping bias occurs when information from the test set leaks into the training process. This can lead
# to overly optimistic performance metrics and models that don’t perform well in real-world scenarios.

# To avoid this, the test set must be isolated before any data exploration, feature selection, or model training begins.

In [9]:
#----------------------------------------Random Sampling: A Basic Approach--------------------------------------------

# A simple method to split the data is to randomly shuffle it and then divide it: ( Mostly used for Regression Problems)

import pandas as pd
import numpy as np
                                        # simple split Shuffle method  
# Load the dataset from a CSV file (path is relative to the working directory)
# into the DataFrame `data`, which the cells below bin and split.
data = pd.read_csv("4CF_Data.csv")

def shuffle_and_split_data(data, test_ratio):
    """Randomly split ``data`` into a training set and a test set.

    Row positions are shuffled with a fixed seed (42), so every call on the
    same data produces the same split. The first ``test_ratio`` fraction of
    the shuffled positions becomes the test set; the rest is the training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to split. Rows are selected by position via ``iloc``.
    test_ratio : float
        Fraction of rows, between 0 and 1, to place in the test set. The test
        set holds ``int(len(data) * test_ratio)`` rows (rounded down).

    Returns
    -------
    tuple
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # A private generator seeded with 42 yields the same permutation as
    # np.random.seed(42) followed by np.random.permutation(...), but it does
    # not reset the global NumPy random state that other code may rely on.
    rng = np.random.RandomState(42)
    shuffled_indices = rng.permutation(len(data))    # random order of positions 0..n-1

    test_set_size = int(len(data) * test_ratio)      # number of rows in the test set
    test_indices = shuffled_indices[:test_set_size]  # first chunk -> test
    train_indices = shuffled_indices[test_set_size:] # remainder   -> train
    return data.iloc[train_indices], data.iloc[test_indices]

# Setting the random seed (e.g., with np.random.seed(42)) ensures consistency across runs—
# this is crucial for debugging and comparing models fairly.

# However, pure random sampling might not always be reliable, especially if the dataset contains
# important patterns that are not evenly distributed.

In [10]:
# Stratified Sampling - 
# To ensure that important characteristics of the population are well represented in both the training
# and test sets, we use stratified sampling.

# What is a Stratum?
# A stratum (plural: strata) is a subgroup of the data defined by a specific attribute. Stratified
# sampling ensures that each of these subgroups is proportionally represented.

# For example, in the California housing dataset, median income is a strong predictor
# of house prices. Instead of randomly sampling, we can create strata based on
# income levels (e.g., binning median income into categories) and ensure the test set
# maintains the same distribution of income levels as the full dataset.    

In [11]:
# Creating Income Categories - 
# And splitting the median_income into bins - using pandas.cut()

# Syntax - To make bins 
# pandas.cut (x (Data), bins (Intervals), labels=None,
# retbins=False, precision=3, include_lowest=False, duplicates='raise', ordered=True, right=True)
                                 
# Edges of the income bands; the final band is open-ended (np.inf).
income_bin_edges = [0, 1.5, 3.0, 4.5, 6.0, np.inf]
# One label per band, in the same order as the edges above.
income_bin_labels = [1, 2, 3, 4, 5]

# Add a categorical column recording which band each row's median_income falls in.
data["income_cat"] = pd.cut(
    data["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

# Create a new column named income_cat that holds the bin each row's median_income (from the actual data) falls into.
# labels define the names of each bin, making your data more readable.

# ----------------Key points----------------

# Loaded the dataset
# Created income categories 
# pd.cut - used to bin values into discrete intervals.
# Each bin represents a range of income levels, allowing us to stratify our sampling based on these categories.

In [12]:
# We can also plot these income categories to visualize the distribution:

# import matplotlib.pyplot as plt
# data["income_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)  
# plt.title("Income Categories Distribution") 
# plt.xlabel("Income Category")
# plt.ylabel("Number of Instances")
# plt.show()                                   (OPTIONAL)

In [17]:
#-----------------------------------------Stratified Shuffle Split Approach---------------------------------------------
                            # (Mostly used for classification problems, i.e. non-numeric targets)

# Stratified Shuffle Split in Scikit-Learn 
# Scikit-learn provides a built-in way to perform stratified sampling using StratifiedShuffleSplit.


# Syntax - 
# from sklearn.model_selection import StratifiedShuffleSplit
# StratifiedShuffleSplit(n_splits = 10 (number of re-shuffling and splitting iterations), test_size = 0.2 (ratio of test),
        # train_size = None (ratio of train), random_state = None (seed))



from sklearn.model_selection import StratifiedShuffleSplit

# income_cat is a column in the dataset created from median_income
# One 80/20 split; random_state fixes the shuffle so the split is repeatable.
Shuffled = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index
# arrays; stratification is driven by the income_cat column.
train_index, test_index = next(Shuffled.split(data, data["income_cat"]))

# split() yields integer row positions, not index labels, so select rows with
# iloc (position-based). loc (label-based) only gives the same rows while the
# DataFrame still has its default 0..n-1 index.
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

# Points to remember here ------------------------
# This calls the .split() method of StratifiedShuffleSplit, which does:
# shuffles your dataset
# splits it into train/test
# ensures stratification (balanced income categories)
# .split() does NOT return the data.
# It returns ONLY the index positions needed for train/test.

# ----------------Syntax--------------Shuffled.split method--------------Explained--------------------
# It takes X (the data) as the first argument and y (the target variable) as the second.
# .split() requires:
# ➤ x - First argument: data - This is your full dataset.
# ➤ y - Second argument: data["income_cat"] - This is the column used for stratification, meaning:

# The split will preserve the proportion of each income category (1,2,3,4,5)
# Ensures the train and test sets have similar income distributions

# “For the single stratified shuffle split, return the training indices and test indices
# generated by Shuffled.split(), which splits data in a way that preserves the proportions of income_cat.”
#--------------------------------------------------------------------------------------------------------------------
                                          
# shuffle_and_split_data(data, 0.2)   # As we dont need Basic random shuffle approach we wont use it 
# Show the stratified test set as this cell's output.
strat_test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
5241,-118.39,34.12,29.0,6447.0,1012.0,2184.0,960.0,8.2816,500001.0,<1H OCEAN,5
17352,-120.42,34.89,24.0,2020.0,307.0,855.0,283.0,5.0099,162500.0,<1H OCEAN,4
3505,-118.45,34.25,36.0,1453.0,270.0,808.0,275.0,4.3839,204600.0,<1H OCEAN,3
7777,-118.10,33.91,35.0,1653.0,325.0,1072.0,301.0,3.2708,159700.0,<1H OCEAN,3
14155,-117.07,32.77,38.0,3779.0,614.0,1495.0,614.0,4.3529,184000.0,NEAR OCEAN,3
...,...,...,...,...,...,...,...,...,...,...,...
12182,-117.29,33.72,19.0,2248.0,427.0,1207.0,368.0,2.8170,110000.0,<1H OCEAN,2
7275,-118.24,33.99,33.0,885.0,294.0,1270.0,282.0,2.1615,118800.0,<1H OCEAN,2
17223,-119.72,34.44,43.0,1781.0,342.0,663.0,358.0,4.7000,293800.0,<1H OCEAN,4
10786,-117.91,33.63,30.0,2071.0,412.0,1081.0,412.0,4.9125,335700.0,<1H OCEAN,4


In [18]:
# Show the stratified training set as this cell's output.
strat_train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND,2
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN,5
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,82700.0,INLAND,2
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN,2
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN,3
...,...,...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,268500.0,<1H OCEAN,4
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,90400.0,INLAND,2
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,140400.0,<1H OCEAN,3
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,258100.0,<1H OCEAN,3


In [None]:
import pandas as pd
import numpy as np
                                        # simple split Shuffle method  
# NOTE(review): this re-reads the CSV and rebinds `data`, so the "income_cat"
# column added to the earlier `data` frame is not on this new one (unless the
# CSV itself already contains it - confirm before relying on it).
data = pd.read_csv("4CF_Data.csv")

def shuffle_and_split_data(data, test_ratio):
    """Return a reproducible random test subset of ``data``.

    Row positions are shuffled with a fixed seed (42) and the first
    ``int(len(data) * test_ratio)`` shuffled rows are returned.

    NOTE(review): this cell redefines the function of the same name from
    earlier in the notebook. That earlier version returns
    ``(train_set, test_set)``; this one returns only the test set. Whichever
    cell runs last wins, so keep just one of them.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to sample from. Rows are selected by position via ``iloc``.
    test_ratio : float
        Fraction of rows, between 0 and 1, to return.

    Returns
    -------
    pandas.DataFrame
        The test subset.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # A private generator seeded with 42 gives the same permutation as
    # np.random.seed(42) + np.random.permutation(...), without resetting the
    # global NumPy random state.
    rng = np.random.RandomState(42)
    shuffled_indices = rng.permutation(len(data))  # random order of positions 0..n-1
    test_set_size = int(len(data) * test_ratio)    # number of rows in the test set
    return data.iloc[shuffled_indices[:test_set_size]]

