<a href="https://colab.research.google.com/github/PavankumarUppar/AML/blob/main/CIE2Program4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Read the dataset (housing CSV from the handson-ml2 GitHub repository)
df = pd.read_csv('https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv')

# Data cleaning: Handling missing values
# Impute missing bedroom counts with the column median. The filled Series is
# assigned back to the column: calling fillna(..., inplace=True) on the
# selection df['total_bedrooms'] is chained assignment, which raises a
# FutureWarning on pandas 2.x and leaves df untouched under Copy-on-Write.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

# Data integration: Creating new features
# Per-row ratios derived from the raw count columns.
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']

# Data transformation: Encoding categorical variables
# Replace each distinct ocean_proximity label with an integer code; the fitted
# encoder keeps the original labels (in code order) in label_encoder.classes_.
label_encoder = LabelEncoder()
df['ocean_proximity'] = label_encoder.fit_transform(df['ocean_proximity'])

# Data cleaning: Removing outliers
# Keep only rows whose median_house_value is less than 3 standard deviations
# away from the column mean. .copy() turns the filtered selection into an
# independent frame, so the column assignments below modify it directly
# instead of raising SettingWithCopyWarning.
df = df[
    (np.abs(df['median_house_value'] - df['median_house_value'].mean())
     / df['median_house_value'].std()) < 3
].copy()

# Data transformation: Binning continuous variables
# pd.cut uses right-closed intervals by default:
# (0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6], (6, inf] -> labels 1..5.
df['income_cat'] = pd.cut(df['median_income'], bins=[0, 1.5, 3.0, 4.5, 6, np.inf], labels=[1, 2, 3, 4, 5])

# Data integration: Aggregating features
# Every row receives the mean rooms_per_household of its income category.
# income_cat is categorical, so `observed` is passed explicitly: pandas >= 2.1
# warns when it is omitted. The per-row result of transform is unchanged.
df['rooms_per_household_bin'] = df.groupby('income_cat', observed=True)['rooms_per_household'].transform('mean')

# Data cleaning: Removing unnecessary columns
# The raw counts and the helper income_cat column have served their purpose.
df = df.drop(columns=['total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'income_cat'])

# Data transformation: One-hot encoding
# dtype is pinned to uint8 (the pandas < 2.0 default) so the indicator columns
# stay 0/1 integers rather than the booleans pandas >= 2.0 returns by default.
df = pd.get_dummies(df, columns=['ocean_proximity'], dtype=np.uint8)

# Print the preprocessed dataset
print(df.head())

   longitude  latitude  housing_median_age  median_house_value  \
0    -122.23     37.88                41.0            452600.0   
1    -122.22     37.86                21.0            358500.0   
2    -122.24     37.85                52.0            352100.0   
3    -122.25     37.85                52.0            341300.0   
4    -122.25     37.85                52.0            342200.0   

   rooms_per_household  bedrooms_per_room  population_per_household  \
0             6.984127           0.146591                  2.555556   
1             6.238137           0.155797                  2.109842   
2             8.288136           0.129516                  2.802260   
3             5.817352           0.184458                  2.547945   
4             6.281853           0.172096                  2.181467   

   rooms_per_household_bin  ocean_proximity_0  ocean_proximity_1  \
0                 7.104916                  0                  0   
1                 7.104916              