In [1]:
import pandas as pd

In [2]:
# Load the raw California housing table from the project's data directory
# and display it.
csv_path = "../data/california_housing.csv"
housing_data = pd.read_csv(csv_path)
housing_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200
...,...,...,...,...,...,...,...,...,...,...
20637,-121.09,39.48,25,1665,374.0,845,330,1.5603,INLAND,78100
20638,-121.21,39.49,18,697,150.0,356,114,2.5568,INLAND,77100
20639,-121.22,39.43,17,2254,485.0,1007,433,1.7000,INLAND,92300
20640,-121.32,39.43,18,1860,409.0,741,349,1.8672,INLAND,84700


### Handling Missing Data

In [3]:
# Print each column's dtype and non-null count to spot columns with missing values.
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20642 entries, 0 to 20641
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20642 non-null  float64
 1   latitude            20642 non-null  float64
 2   housing_median_age  20642 non-null  int64  
 3   total_rooms         20642 non-null  int64  
 4   total_bedrooms      20435 non-null  float64
 5   population          20642 non-null  int64  
 6   households          20642 non-null  int64  
 7   median_income       20642 non-null  float64
 8   ocean_proximity     20642 non-null  object 
 9   median_house_value  20642 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
housing_data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20642.0,20642.0,20642.0,20642.0,20435.0,20642.0,20642.0,20642.0,20642.0
mean,-119.569965,35.632077,28.64175,2635.879081,537.887937,1425.475438,499.556438,3.870715,206869.032119
std,2.003609,2.135961,12.587048,2181.542412,421.368432,1132.407658,382.315883,1.899763,115398.310245
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1448.0,296.0,787.0,280.0,2.56375,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74355,264875.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [5]:
# Count the missing (null) entries in every column and print the tally.
null_counts = housing_data.isna().sum()
print(null_counts)

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64


In [6]:
# Number of missing values in the total_bedrooms column alone.
housing_data['total_bedrooms'].isna().sum()

np.int64(207)

#### Option 1 : Fill the missing (null) values with the mean/median of the column

In [7]:
# Impute missing bedroom counts with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that form is chained
# assignment, which raises a FutureWarning in pandas 2.x and no longer
# updates the parent DataFrame once copy-on-write is enabled.
bedrooms_mean = housing_data['total_bedrooms'].mean()
housing_data['total_bedrooms'] = housing_data['total_bedrooms'].fillna(bedrooms_mean)

# Confirm that no missing values remain in the column.
housing_data['total_bedrooms'].isnull().sum()

np.int64(0)

#### Option 2 : Drop the entire column

In [8]:
# Option 2 removes the column altogether, so every later cell works on a
# frame without total_bedrooms.
housing_data = housing_data.drop(columns=['total_bedrooms'])
housing_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,565,259,3.8462,NEAR BAY,342200
...,...,...,...,...,...,...,...,...,...
20637,-121.09,39.48,25,1665,845,330,1.5603,INLAND,78100
20638,-121.21,39.49,18,697,356,114,2.5568,INLAND,77100
20639,-121.22,39.43,17,2254,1007,433,1.7000,INLAND,92300
20640,-121.32,39.43,18,1860,741,349,1.8672,INLAND,84700


### Check for duplicate data

In [9]:
# Count the rows that are exact duplicates of an earlier row.
housing_data.duplicated().sum()

np.int64(2)

#### If duplicate rows are present, drop them

In [10]:
# Discard fully duplicated rows, keeping the first occurrence of each.
housing_data = housing_data.drop_duplicates()

In [11]:
# Re-count duplicated rows to confirm that none are left.
housing_data.duplicated().sum()

np.int64(0)

### Convert Data Types
#### Convert each column to an appropriate data type (int, float, datetime, string) if it is not already stored in a standard type

In [12]:
# Every column is already numeric (int64 / float64) except ocean_proximity,
# which is stored as object (strings).
housing_data.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
population              int64
households              int64
median_income         float64
ocean_proximity        object
median_house_value      int64
dtype: object

### Standardize Columns
#### Strip leading/trailing whitespace from the column names and convert them to lowercase

In [13]:
# Trim surrounding whitespace from every column name and lowercase it.
housing_data.columns = [name.strip().lower() for name in housing_data.columns]


### Handle Categorical Data
#### Models cannot work directly with strings. Most ML algorithms require numerical input, so encoding simplifies categorical variables, enabling models to identify patterns.

#### 1. Label Encoding / Factorize
- Assigns numbers to categories: 0, 1, 2…  
- Best for **ordinal data** (i.e., data with a natural order, such as Low, Medium, High)  
- Also useful for **nominal data with many unique values**  

#### 2. One-Hot Encoding
- Creates **binary columns** for each category (0/1)  
- Best for **nominal data with few categories** (e.g., INLAND, NEAR BAY, NEAR OCEAN)


In [14]:
# List the distinct ocean_proximity categories, then how many rows fall in each.
proximity = housing_data['ocean_proximity']
print(proximity.unique())
print(proximity.value_counts())

['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']
ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64


##### In this case one-hot encoding is the best choice: ocean_proximity is nominal data with only 5 unique values (<1H OCEAN, INLAND, ISLAND, NEAR BAY, NEAR OCEAN), so it is replaced by just 5 binary columns

In [15]:
# Replace ocean_proximity with one indicator column per category and preview
# the first five rows.
housing_data = pd.get_dummies(data=housing_data, columns=['ocean_proximity'])
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,322,126,8.3252,452600,False,False,False,True,False
1,-122.22,37.86,21,7099,2401,1138,8.3014,358500,False,False,False,True,False
2,-122.24,37.85,52,1467,496,177,7.2574,352100,False,False,False,True,False
3,-122.25,37.85,52,1274,558,219,5.6431,341300,False,False,False,True,False
4,-122.25,37.85,52,1627,565,259,3.8462,342200,False,False,False,True,False


#### Convert bool types to 0 and 1 

In [16]:
# Collect the names of the boolean columns in the frame.
bool_cols = housing_data.select_dtypes(include='bool').columns
bool_cols

Index(['ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [17]:
# Cast each boolean column to integers (False -> 0, True -> 1) and preview
# the first ten rows.
housing_data = housing_data.astype({name: int for name in bool_cols})
housing_data.head(n=10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,322,126,8.3252,452600,0,0,0,1,0
1,-122.22,37.86,21,7099,2401,1138,8.3014,358500,0,0,0,1,0
2,-122.24,37.85,52,1467,496,177,7.2574,352100,0,0,0,1,0
3,-122.25,37.85,52,1274,558,219,5.6431,341300,0,0,0,1,0
4,-122.25,37.85,52,1627,565,259,3.8462,342200,0,0,0,1,0
5,-122.25,37.85,52,919,413,193,4.0368,269700,0,0,0,1,0
6,-122.25,37.84,52,2535,1094,514,3.6591,299200,0,0,0,1,0
7,-122.25,37.84,52,3104,1157,647,3.12,241400,0,0,0,1,0
8,-122.26,37.84,42,2555,1206,595,2.0804,226700,0,0,0,1,0
9,-122.25,37.84,52,3549,1551,714,3.6912,261100,0,0,0,1,0


In [18]:
# The indicator columns inherit the upper-case category labels, so normalise
# all column names again (trim whitespace, lowercase) for uniformity.
housing_data = housing_data.rename(columns=lambda name: name.strip().lower())
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value,ocean_proximity_<1h ocean,ocean_proximity_inland,ocean_proximity_island,ocean_proximity_near bay,ocean_proximity_near ocean
0,-122.23,37.88,41,880,322,126,8.3252,452600,0,0,0,1,0
1,-122.22,37.86,21,7099,2401,1138,8.3014,358500,0,0,0,1,0
2,-122.24,37.85,52,1467,496,177,7.2574,352100,0,0,0,1,0
3,-122.25,37.85,52,1274,558,219,5.6431,341300,0,0,0,1,0
4,-122.25,37.85,52,1627,565,259,3.8462,342200,0,0,0,1,0


In [19]:
# Verify that every column, including the encoded ocean_proximity indicators,
# now has a numeric dtype.
housing_data.dtypes

longitude                     float64
latitude                      float64
housing_median_age              int64
total_rooms                     int64
population                      int64
households                      int64
median_income                 float64
median_house_value              int64
ocean_proximity_<1h ocean       int64
ocean_proximity_inland          int64
ocean_proximity_island          int64
ocean_proximity_near bay        int64
ocean_proximity_near ocean      int64
dtype: object

### Handle the Outlier
#### Outliers are data points that differ significantly from the rest of the data. They are few in number but can severely degrade the performance of a machine learning model, so it is necessary to detect and handle them correctly

In [20]:
# Names of all int64 / float64 columns; later cells use `cols` to inspect
# value ranges.
numeric_dtypes = ['int64', 'float64']
cols = housing_data.select_dtypes(include=numeric_dtypes).columns
print(cols)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'population', 'households', 'median_income', 'median_house_value',
       'ocean_proximity_<1h ocean', 'ocean_proximity_inland',
       'ocean_proximity_island', 'ocean_proximity_near bay',
       'ocean_proximity_near ocean'],
      dtype='object')


In [21]:
# Largest and smallest value of every numeric column.
housing_data.loc[:, cols].agg(['max', 'min'])

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value,ocean_proximity_<1h ocean,ocean_proximity_inland,ocean_proximity_island,ocean_proximity_near bay,ocean_proximity_near ocean
max,-114.31,41.95,52,39320,35682,6082,15.0001,500001,1,1,1,1,1
min,-124.35,32.54,1,2,3,1,0.4999,14999,0,0,0,0,0


##### The columns total_rooms, population, households, median_income and median_house_value have outliers (i.e., a very wide numerical range), so they need to be handled

#### Use the Interquartile Range (IQR) method to handle the outliers: values outside [Q1 − 1.5·IQR, Q3 + 1.5·IQR] are clipped to the nearest bound

In [23]:
def clip_to_iqr_fences(series, k=1.5, lower_q=0.25, upper_q=0.75):
    """Clip a numeric Series to its interquartile-range fences.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to clip.
    k : float, default 1.5
        Multiple of the IQR added above the upper quantile and subtracted
        below the lower quantile to obtain the fences.
    lower_q, upper_q : float, default 0.25 and 0.75
        Quantiles used as Q1 and Q3.

    Returns
    -------
    pandas.Series
        A new Series in which values below ``Q1 - k*IQR`` or above
        ``Q3 + k*IQR`` are replaced by the corresponding fence.
    """
    q1 = series.quantile(lower_q)
    q3 = series.quantile(upper_q)
    iqr = q3 - q1
    return series.clip(q1 - k * iqr, q3 + k * iqr)


cols_to_handle_int = ['total_rooms', 'population', 'households']      # counts → int
cols_to_handle_float = ['median_income', 'median_house_value']         # continuous → float

# Count columns: the fences may be fractional, so cast back to int after
# clipping (this truncates a fractional fence value).
for col in cols_to_handle_int:
    housing_data[col] = clip_to_iqr_fences(housing_data[col]).astype(int)

# Continuous columns: keep the clipped values as floats.
for col in cols_to_handle_float:
    housing_data[col] = clip_to_iqr_fences(housing_data[col])

print("Outliers handled successfully")


Outliers handled successfully


In [24]:
# Re-check the extremes of every numeric column now that the outliers have
# been clipped.
housing_data.loc[:, cols].agg(['max', 'min'])

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,median_house_value,ocean_proximity_<1h ocean,ocean_proximity_inland,ocean_proximity_island,ocean_proximity_near bay,ocean_proximity_near ocean
max,-114.31,41.95,52,5698,3132,1092,8.013025,482412.5,1,1,1,1,1
min,-124.35,32.54,1,2,3,1,0.4999,14999.0,0,0,0,0,0


#### Save the cleaned dataset

In [27]:
# Persist the cleaned table. index=False keeps the row index out of the file:
# otherwise it is written as an unnamed first column (with gaps left by
# drop_duplicates) and comes back as a spurious "Unnamed: 0" feature when the
# CSV is read again.
housing_data.to_csv("../data/cleaned_california_housing_dataset.csv", index=False)