In [None]:
import pandas as pd
import io

# Load the housing dataset and report how many values are missing per column.
df = pd.read_csv('/content/HOME DATASET.CSV')

# isna() flags each missing cell; summing the boolean frame counts them per column.
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
ID              0
Location        0
Unnamed: 2     10
 size(sqft)     1
room            1
age(year)       1
price(INR)      1
dtype: int64


In [None]:
# Force the price column to floating point.
df['price(INR)'] = df['price(INR)'].astype(float)

# Entries in 'room' that cannot be parsed as numbers become NaN (errors='coerce'),
# so they are filled by the median imputation below.
df['room'] = pd.to_numeric(df['room'], errors='coerce')

# Median imputation: every numeric column listed here gets its NaNs replaced by
# that column's median. Columns that are not numeric are reported and skipped.
numerical_cols = [' size(sqft)', 'room', 'age(year)', 'price(INR)']
for col in numerical_cols:
  if not pd.api.types.is_numeric_dtype(df[col]):
    print(f"Warning: Column '{col}' is not numeric and will not be imputed with median.")
    continue
  median_val = df[col].median()
  df[col] = df[col].fillna(median_val)

# Re-run the missing-value report to confirm the imputation, then show the frame.
print("Missing values after imputation:", df.isnull().sum(), sep="\n")
display(df)

Missing values after imputation:
ID              0
Location        0
Unnamed: 2     10
 size(sqft)     0
room            0
age(year)       0
price(INR)      0
dtype: int64


Unnamed: 0,ID,Location,Unnamed: 2,size(sqft),room,age(year),price(INR)
0,1,Downtown,,1200.0,3.0,10.0,5000000.0
1,2,Suburb,,1500.0,4.0,5.0,6500000.0
2,3,Uptown,,100.0,2.0,20.0,3800000.0
3,4,Sunurb,,200.0,5.0,10.0,7200000.0
4,5,Downtown,,850.0,3.0,15.0,4200000.0
5,6,Uptown,,1100.0,3.0,12.0,5600000.0
6,7,Suburb,,1100.0,3.0,7.0,4800000.0
7,8,Downtown,,1750.0,4.0,3.0,7000000.0
8,9,Uptown,,950.0,2.0,25.0,3500000.0
9,10,Suburb,,1600.0,4.0,6.0,5000000.0


In [None]:

# Show every row that exactly repeats an earlier row (all columns compared;
# the first occurrence is not flagged).
# NOTE(review): the comparison includes the ID column. If IDs are unique per
# row, no row can ever be flagged as a duplicate - confirm whether duplicates
# should be detected on the feature columns only.
duplicate_rows = df[df.duplicated()]
print("Duplicate rows found:")
display(duplicate_rows)

# Take an explicit copy so df_cleaned is an independent frame. Later cells
# assign columns on df_cleaned in place; without the copy, pandas can emit
# SettingWithCopyWarning whenever drop_duplicates() actually removed rows.
# The original index is kept on purpose so rows can still be matched to df.
df_cleaned = df.drop_duplicates().copy()
print("\nDataFrame after removing duplicate rows:")
display(df_cleaned)

Duplicate rows found:


Unnamed: 0,ID,Location,Unnamed: 2,size(sqft),room,age(year),price(INR)



DataFrame after removing duplicate rows:


Unnamed: 0,ID,Location,Unnamed: 2,size(sqft),room,age(year),price(INR)
0,1,Downtown,,1200.0,3.0,10.0,5000000.0
1,2,Suburb,,1500.0,4.0,5.0,6500000.0
2,3,Uptown,,100.0,2.0,20.0,3800000.0
3,4,Sunurb,,200.0,5.0,10.0,7200000.0
4,5,Downtown,,850.0,3.0,15.0,4200000.0
5,6,Uptown,,1100.0,3.0,12.0,5600000.0
6,7,Suburb,,1100.0,3.0,7.0,4800000.0
7,8,Downtown,,1750.0,4.0,3.0,7000000.0
8,9,Uptown,,950.0,2.0,25.0,3500000.0
9,10,Suburb,,1600.0,4.0,6.0,5000000.0


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns whose values are rescaled with min-max scaling.
numerical_cols_to_normalize = [' size(sqft)', 'age(year)']

# Learn each column's minimum and maximum, then overwrite the columns in
# df_cleaned with their rescaled values (0 for the column minimum, 1 for the
# maximum with the scaler's default settings).
scaler = MinMaxScaler().fit(df_cleaned[numerical_cols_to_normalize])
df_cleaned[numerical_cols_to_normalize] = scaler.transform(df_cleaned[numerical_cols_to_normalize])

print("DataFrame after Min-Max scaling:")
display(df_cleaned)

DataFrame after Min-Max scaling:


Unnamed: 0,ID,Location,Unnamed: 2,size(sqft),room,age(year),price(INR)
0,1,Downtown,,0.666667,3.0,0.318182,5000000.0
1,2,Suburb,,0.848485,4.0,0.090909,6500000.0
2,3,Uptown,,0.0,2.0,0.772727,3800000.0
3,4,Sunurb,,0.060606,5.0,0.318182,7200000.0
4,5,Downtown,,0.454545,3.0,0.545455,4200000.0
5,6,Uptown,,0.606061,3.0,0.409091,5600000.0
6,7,Suburb,,0.606061,3.0,0.181818,4800000.0
7,8,Downtown,,1.0,4.0,0.0,7000000.0
8,9,Uptown,,0.515152,2.0,1.0,3500000.0
9,10,Suburb,,0.909091,4.0,0.136364,5000000.0


In [None]:
# Create a new feature: price per square foot.
#
# The ' size(sqft)' column of df_cleaned has been overwritten with min-max
# scaled values by the scaling cell, so dividing by it yields inf for the
# smallest house (scaled size 0.0) and meaningless magnitudes for the rest.
# The scaling was applied to df_cleaned only, so df still holds the unscaled
# sizes; read them from there, matched to df_cleaned's rows by index label.
raw_size_sqft = df.loc[df_cleaned.index, ' size(sqft)']

# A non-positive size cannot give a meaningful ratio: mask it to NaN so the
# feature is reported as missing instead of inf or a negative price.
raw_size_sqft = raw_size_sqft.where(raw_size_sqft > 0)

df_cleaned['Price_per_SqFt'] = df_cleaned['price(INR)'] / raw_size_sqft

print("\nDataFrame with new 'Price_per_SqFt' feature:")
display(df_cleaned)


DataFrame with new 'Price_per_SqFt' feature:


Unnamed: 0,ID,Location,Unnamed: 2,size(sqft),room,age(year),price(INR),Price_per_SqFt
0,1,Downtown,,0.666667,3.0,0.318182,5000000.0,7500000.0
1,2,Suburb,,0.848485,4.0,0.090909,6500000.0,7660714.0
2,3,Uptown,,0.0,2.0,0.772727,3800000.0,inf
3,4,Sunurb,,0.060606,5.0,0.318182,7200000.0,118800000.0
4,5,Downtown,,0.454545,3.0,0.545455,4200000.0,9240000.0
5,6,Uptown,,0.606061,3.0,0.409091,5600000.0,9240000.0
6,7,Suburb,,0.606061,3.0,0.181818,4800000.0,7920000.0
7,8,Downtown,,1.0,4.0,0.0,7000000.0,7000000.0
8,9,Uptown,,0.515152,2.0,1.0,3500000.0,6794118.0
9,10,Suburb,,0.909091,4.0,0.136364,5000000.0,5500000.0


In [None]:
# Perform one-hot encoding on the 'Location ' column (the column name carries
# a trailing space in this dataset).
#
# The raw data contains the misspelling 'Sunurb', which would otherwise be
# encoded as its own dummy column and split the 'Suburb' category in two.
# Normalise the labels first: trim surrounding whitespace and map known
# misspellings to their canonical spelling. Extend the mapping if other
# variants turn up.
location_fixes = {'Sunurb': 'Suburb'}
location_normalized = df_cleaned['Location '].str.strip().replace(location_fixes)

# assign() returns a new frame, so df_cleaned itself keeps the raw labels.
df_encoded = pd.get_dummies(
    df_cleaned.assign(**{'Location ': location_normalized}),
    columns=['Location '],
    dtype=int,
)

print("\nDataFrame after one-hot encoding:")
display(df_encoded)


DataFrame after one-hot encoding:


Unnamed: 0,ID,Unnamed: 2,size(sqft),room,age(year),price(INR),Price_per_SqFt,Location _Downtown,Location _Suburb,Location _Sunurb,Location _Uptown
0,1,,0.666667,3.0,0.318182,5000000.0,7500000.0,1,0,0,0
1,2,,0.848485,4.0,0.090909,6500000.0,7660714.0,0,1,0,0
2,3,,0.0,2.0,0.772727,3800000.0,inf,0,0,0,1
3,4,,0.060606,5.0,0.318182,7200000.0,118800000.0,0,0,1,0
4,5,,0.454545,3.0,0.545455,4200000.0,9240000.0,1,0,0,0
5,6,,0.606061,3.0,0.409091,5600000.0,9240000.0,0,0,0,1
6,7,,0.606061,3.0,0.181818,4800000.0,7920000.0,0,1,0,0
7,8,,1.0,4.0,0.0,7000000.0,7000000.0,1,0,0,0
8,9,,0.515152,2.0,1.0,3500000.0,6794118.0,0,0,0,1
9,10,,0.909091,4.0,0.136364,5000000.0,5500000.0,0,1,0,0
