<a href="https://colab.research.google.com/github/Tausiq17/mlda/blob/main/T1_Data_Pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# ===============================
# 1. Import Libraries
# ===============================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# ===============================
# 2. Load Dataset (Kaggle)
# ===============================
# Make sure you downloaded 'House Price India.csv' from Kaggle first.
data = pd.read_csv("House Price India.csv")

print("Raw Data:")
print(data.head())

# ===============================
# 3. Data Cleaning
# ===============================
# Rename columns if needed (some datasets have spaces)
data.columns = [col.strip() for col in data.columns]

# Example: Ensure numeric columns are converted properly
# The columns in the provided data head are 'bhk', 'propertytype', 'location', 'sqft', 'pricepersqft', 'totalprice'
# We will use 'sqft' and 'totalprice' as numeric columns and 'bhk' which is also numeric.
for col in ['sqft', 'bhk', 'totalprice']:
    if col in data.columns:
        data[col] = data[col].replace(',', '', regex=True).astype(float)


# ===============================
# 4. Define Transformers
# ===============================
num_features = ['sqft', 'bhk'] # Using 'sqft' and 'bhk' as numerical features
cat_features = ['location', 'propertytype'] # Using 'location' and 'propertytype' as categorical features

num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# ===============================
# 5. Preprocess Input Features
# ===============================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features)
    ]
)
preprocessor.set_output(transform="pandas")

X = preprocessor.fit_transform(data)

# ===============================
# 6. Preprocess Target (Price)
# ===============================
# The target column in the provided data head is 'totalprice'
target_transformer = Pipeline(steps=[
    ("scaler", StandardScaler()) # Scaling the target is common practice
])
# We don't need ColumnTransformer for a single target column.
y = target_transformer.fit_transform(data[['totalprice']])


# ===============================
# 7. Feature Engineering
# ===============================
# Price per sqft (if not already present) - This should be calculated *before* splitting if used as a feature
# However, since totalprice and sqft are already features, we can skip this for simplicity in this correction.
# If you want to use 'pricepersqft' from the original data, you can include it in num_features.
# For this example, we will use the existing 'pricepersqft' column.
num_features.append('pricepersqft')

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features)
    ]
)
preprocessor.set_output(transform="pandas")
X = preprocessor.fit_transform(data)

# ===============================
# 8. Train-Test Split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Processed Training Features:")
display(X_train.head())

print("Processed Training Target:")
display(y_train[:5]) # Display the first 5 rows of the NumPy array

print("Processed Test Target:")
display(y_test[:5]) # Display the first 5 rows of the NumPy array

Raw Data:
   bhk propertytype   location   sqft  pricepersqft  totalprice
0    3         Flat  Ahmedabad   1346          6233    15700000
1    4         Flat  Ahmedabad   1872          4873    17500000
2    4         Flat  Ahmedabad   1650          6733    20200000
3    5         Flat  Ahmedabad  10201          8499    86700000
4    3         Flat  Ahmedabad    968          5944    10400000
Processed Training Features:


Unnamed: 0,num__sqft,num__bhk,num__pricepersqft,cat__location_Ahmedabad,cat__location_Amritsar,cat__location_Bangalore,cat__location_Bhilai,cat__location_Bhopal,cat__location_Bhubaneswar,cat__location_Bilaspur,...,cat__location_Rohini,cat__location_Shimla,cat__location_Thane,cat__location_Trichy,cat__location_Udaipur,cat__location_Vadodara,cat__location_Warangal,cat__propertytype_Flat,cat__propertytype_House,cat__propertytype_Villa
14011,-0.016176,-0.901767,-0.032559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
10835,-0.007648,0.368643,0.03005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11914,-0.141672,-0.901767,-0.02189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8661,-0.038108,-0.901767,-0.019859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
958,-0.092936,-0.901767,-0.036407,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


Processed Training Target:


array([[-0.4624771 ],
       [ 0.19096681],
       [-0.35830488],
       [-0.14996045],
       [-0.39145059]])

Processed Test Target:


array([[-0.15943065],
       [ 0.46086756],
       [-0.36777509],
       [-0.36067243],
       [-0.43311948]])