In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("C:/Desktop/ML/housing.csv")

In [3]:
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [4]:
 df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [5]:
df.dtypes

price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object

In [9]:
df_cleaned = df.copy()

In [10]:
# Binary categorical columns (Yes/No)
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

In [11]:
binary_cols

['mainroad',
 'guestroom',
 'basement',
 'hotwaterheating',
 'airconditioning',
 'prefarea']

In [12]:
# Map 'yes' to 1 and 'no' to 0
binary_map = {'yes': 1, 'no': 0}
for col in binary_cols:
    df_cleaned[col] = df_cleaned[col].map(binary_map)


In [13]:
df_cleaned = pd.get_dummies(df_cleaned, columns=['furnishingstatus'], drop_first=True)


In [14]:
##########   Rooms per story
df_cleaned['rooms_per_story'] = df_cleaned['bedrooms'] / df_cleaned['stories']

In [15]:
df_cleaned['rooms_per_story']

0      1.333333
1      1.000000
2      1.500000
3      2.000000
4      2.000000
         ...   
540    2.000000
541    3.000000
542    2.000000
543    3.000000
544    1.500000
Name: rooms_per_story, Length: 545, dtype: float64

In [16]:
############## Bathroom to Bedroom Ratio
df_cleaned['bathroom_bedroom_ratio'] = df_cleaned['bathrooms'] / df_cleaned['bedrooms']


In [17]:
df_cleaned['bathroom_bedroom_ratio']

0      0.500000
1      1.000000
2      0.666667
3      0.500000
4      0.250000
         ...   
540    0.500000
541    0.333333
542    0.500000
543    0.333333
544    0.333333
Name: bathroom_bedroom_ratio, Length: 545, dtype: float64

In [18]:
########## Area per parking space (add 1 to avoid division by zero)
df_cleaned['area_parking_ratio'] = df_cleaned['area'] / (df_cleaned['parking'] + 1)

In [19]:
df_cleaned['area_parking_ratio'] 

0      2473.333333
1      2240.000000
2      3320.000000
3      1875.000000
4      2473.333333
          ...     
540    1000.000000
541    2400.000000
542    3620.000000
543    2910.000000
544    3850.000000
Name: area_parking_ratio, Length: 545, dtype: float64

In [22]:
###### Luxury Score (weighted sum of key comfort features)

df_cleaned['luxury_score'] = (
    df_cleaned['area'] * 0.4 +
    df_cleaned['airconditioning'] * 0.2 +
    df_cleaned['hotwaterheating'] * 0.1 +
    df_cleaned['prefarea'] * 0.3
)


In [23]:
# Separate features and target variable
target = df_cleaned['price']
features = df_cleaned.drop('price', axis=1)

In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [26]:
features_scaled = scaler.fit_transform(features)


In [28]:
features_scaled

array([[ 1.04672629,  1.40341936,  1.42181174, ...,  0.33662199,
        -0.60106312,  1.04708271],
       [ 1.75700953,  1.40341936,  5.40580863, ...,  3.47446093,
        -0.7295551 ,  1.75697882],
       [ 2.21823241,  0.04727831,  1.42181174, ...,  1.3825683 ,
        -0.13482076,  2.21829022],
       ...,
       [-0.70592066, -1.30886273, -0.57018671, ...,  0.33662199,
         0.03038322, -0.70603898],
       [-1.03338891,  0.04727831, -0.57018671, ..., -0.70932433,
        -0.36059954, -1.03348822],
       [-0.5998394 ,  0.04727831, -0.57018671, ..., -0.70932433,
         0.15703961, -0.59996388]])

In [27]:
features_scaled_df = pd.DataFrame(features_scaled, columns=features.columns)


In [29]:
features_scaled_df 

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,rooms_per_story,bathroom_bedroom_ratio,area_parking_ratio,luxury_score
0,1.046726,1.403419,1.421812,1.378217,0.405623,-0.465315,-0.734539,-0.219265,1.472618,1.517692,1.804941,-0.844888,-0.696429,-0.785731,0.336622,-0.601063,1.047083
1,1.757010,1.403419,5.405809,2.532024,0.405623,-0.465315,-0.734539,-0.219265,1.472618,2.679409,-0.554035,-0.844888,-0.696429,-1.243113,3.474461,-0.729555,1.756979
2,2.218232,0.047278,1.421812,0.224410,0.405623,-0.465315,1.361397,-0.219265,-0.679063,1.517692,1.804941,1.183588,-0.696429,-0.557040,1.382568,-0.134821,2.218290
3,1.083624,1.403419,1.421812,0.224410,0.405623,-0.465315,1.361397,-0.219265,1.472618,2.679409,1.804941,-0.844888,-0.696429,0.129032,0.336622,-0.930553,1.083978
4,1.046726,1.403419,-0.570187,0.224410,0.405623,2.149083,1.361397,-0.219265,1.472618,1.517692,-0.554035,-0.844888,-0.696429,0.129032,-1.232297,-0.601063,1.046737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,-0.991879,-1.308863,-0.570187,-0.929397,0.405623,-0.465315,1.361397,-0.219265,-0.679063,1.517692,-0.554035,-0.844888,1.435896,0.129032,0.336622,-1.412398,-0.991981
541,-1.268613,0.047278,-0.570187,-0.929397,-2.465344,-0.465315,-0.734539,-0.219265,-0.679063,-0.805741,-0.554035,1.183588,-0.696429,1.501177,-0.709324,-0.641446,-1.268698
542,-0.705921,-1.308863,-0.570187,-0.929397,0.405623,-0.465315,-0.734539,-0.219265,-0.679063,-0.805741,-0.554035,-0.844888,1.435896,0.129032,0.336622,0.030383,-0.706039
543,-1.033389,0.047278,-0.570187,-0.929397,-2.465344,-0.465315,-0.734539,-0.219265,-0.679063,-0.805741,-0.554035,-0.844888,-0.696429,1.501177,-0.709324,-0.360600,-1.033488


In [30]:

# Combine target and scaled features
df_final = pd.concat([target.reset_index(drop=True), features_scaled_df], axis=1)

In [31]:
df_final

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,rooms_per_story,bathroom_bedroom_ratio,area_parking_ratio,luxury_score
0,13300000,1.046726,1.403419,1.421812,1.378217,0.405623,-0.465315,-0.734539,-0.219265,1.472618,1.517692,1.804941,-0.844888,-0.696429,-0.785731,0.336622,-0.601063,1.047083
1,12250000,1.757010,1.403419,5.405809,2.532024,0.405623,-0.465315,-0.734539,-0.219265,1.472618,2.679409,-0.554035,-0.844888,-0.696429,-1.243113,3.474461,-0.729555,1.756979
2,12250000,2.218232,0.047278,1.421812,0.224410,0.405623,-0.465315,1.361397,-0.219265,-0.679063,1.517692,1.804941,1.183588,-0.696429,-0.557040,1.382568,-0.134821,2.218290
3,12215000,1.083624,1.403419,1.421812,0.224410,0.405623,-0.465315,1.361397,-0.219265,1.472618,2.679409,1.804941,-0.844888,-0.696429,0.129032,0.336622,-0.930553,1.083978
4,11410000,1.046726,1.403419,-0.570187,0.224410,0.405623,2.149083,1.361397,-0.219265,1.472618,1.517692,-0.554035,-0.844888,-0.696429,0.129032,-1.232297,-0.601063,1.046737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,-0.991879,-1.308863,-0.570187,-0.929397,0.405623,-0.465315,1.361397,-0.219265,-0.679063,1.517692,-0.554035,-0.844888,1.435896,0.129032,0.336622,-1.412398,-0.991981
541,1767150,-1.268613,0.047278,-0.570187,-0.929397,-2.465344,-0.465315,-0.734539,-0.219265,-0.679063,-0.805741,-0.554035,1.183588,-0.696429,1.501177,-0.709324,-0.641446,-1.268698
542,1750000,-0.705921,-1.308863,-0.570187,-0.929397,0.405623,-0.465315,-0.734539,-0.219265,-0.679063,-0.805741,-0.554035,-0.844888,1.435896,0.129032,0.336622,0.030383,-0.706039
543,1750000,-1.033389,0.047278,-0.570187,-0.929397,-2.465344,-0.465315,-0.734539,-0.219265,-0.679063,-0.805741,-0.554035,-0.844888,-0.696429,1.501177,-0.709324,-0.360600,-1.033488


In [32]:
# Display the final processed dataset
df_final.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,rooms_per_story,bathroom_bedroom_ratio,area_parking_ratio,luxury_score
0,13300000,1.046726,1.403419,1.421812,1.378217,0.405623,-0.465315,-0.734539,-0.219265,1.472618,1.517692,1.804941,-0.844888,-0.696429,-0.785731,0.336622,-0.601063,1.047083
1,12250000,1.75701,1.403419,5.405809,2.532024,0.405623,-0.465315,-0.734539,-0.219265,1.472618,2.679409,-0.554035,-0.844888,-0.696429,-1.243113,3.474461,-0.729555,1.756979
2,12250000,2.218232,0.047278,1.421812,0.22441,0.405623,-0.465315,1.361397,-0.219265,-0.679063,1.517692,1.804941,1.183588,-0.696429,-0.55704,1.382568,-0.134821,2.21829
3,12215000,1.083624,1.403419,1.421812,0.22441,0.405623,-0.465315,1.361397,-0.219265,1.472618,2.679409,1.804941,-0.844888,-0.696429,0.129032,0.336622,-0.930553,1.083978
4,11410000,1.046726,1.403419,-0.570187,0.22441,0.405623,2.149083,1.361397,-0.219265,1.472618,1.517692,-0.554035,-0.844888,-0.696429,0.129032,-1.232297,-0.601063,1.046737
