In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder

# Dataset Overview

In [None]:
# Load the raw house-pricing records; the relative path is resolved against the
# current working directory.
# NOTE(review): read_csv's default NA markers include the literal string "None".
# If any column uses "None" as a real category, it is being read as missing -
# confirm against the raw file.
DATA_PATH = 'House_Pricing.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Print a heading, then pandas' schema summary (column dtypes and non-null counts).
# df.info() writes its report itself, so it is called bare rather than wrapped in print().
info_heading = "Initial Dataset Info:"
print(info_heading)
df.info()

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ID                                         21613 non-null  int64  
 1   Date House was Sold                        21613 non-null  object 
 2   Sale Price                                 21609 non-null  float64
 3   No of Bedrooms                             21613 non-null  int64  
 4   No of Bathrooms                            21609 non-null  float64
 5   Flat Area (in Sqft)                        21604 non-null  float64
 6   Lot Area (in Sqft)                         21604 non-null  float64
 7   No of Floors                               21613 non-null  float64
 8   Waterfront View                            21613 non-null  object 
 9   No of Times Visited                        2124 non-null   object 
 10  

In [None]:
# Print a heading followed by the descriptive statistics
# (count, mean, std, min, quartiles, max) that describe() produces.
print("\nInitial Dataset Description:")
summary_stats = df.describe()
print(summary_stats)


Initial Dataset Description:
                 ID    Sale Price  No of Bedrooms  No of Bathrooms  \
count  2.161300e+04  2.160900e+04    21613.000000     21609.000000   
mean   4.580302e+09  5.401984e+05        3.370842         2.114732   
std    2.876566e+09  3.673890e+05        0.930062         0.770138   
min    1.000102e+06  7.500000e+04        0.000000         0.000000   
25%    2.123049e+09  3.219500e+05        3.000000         1.750000   
50%    3.904930e+09  4.500000e+05        3.000000         2.250000   
75%    7.308900e+09  6.450000e+05        4.000000         2.500000   
max    9.900000e+09  7.700000e+06       33.000000         8.000000   

       Flat Area (in Sqft)  Lot Area (in Sqft)  No of Floors  Overall Grade  \
count         21604.000000        2.160400e+04  21613.000000   21613.000000   
mean           2079.931772        1.510776e+04      1.494309       7.623467   
std             918.487597        4.142827e+04      0.539989       1.105439   
min             290.000

# Duplicate Removal

In [None]:
# Count fully identical rows, drop the repeats (the first occurrence of each row
# is kept), then count again as a sanity check.
dup_rows_before = df.duplicated().sum()
print(f"\nNumber of duplicate rows before removal: {dup_rows_before}")
df = df.drop_duplicates()
dup_rows_after = df.duplicated().sum()
print(f"Number of duplicate rows after removal: {dup_rows_after}")


Number of duplicate rows before removal: 0
Number of duplicate rows after removal: 0


In [None]:
# Check for and remove duplicate columns (based on values)
def identify_duplicate_columns(df):
    """Return the names of columns whose values duplicate an earlier column.

    Columns are compared pairwise with ``Series.equals``, which requires the
    same dtype and the same values (NaNs in matching positions count as equal).
    The left-most column of every group of identical columns is treated as the
    original; each later member of the group is reported as a duplicate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared. It is not modified.

    Returns
    -------
    list
        Duplicate column names in left-to-right column order, each name listed
        once. Empty when no column repeats another (or the frame has no columns).
    """
    n_cols = df.shape[1]
    duplicate_positions = set()
    for i in range(n_cols):
        # A column already known to copy an earlier one cannot reveal new
        # duplicates: anything equal to it is also equal to that earlier column.
        if i in duplicate_positions:
            continue
        col1 = df.iloc[:, i]
        for j in range(i + 1, n_cols):
            if j in duplicate_positions:
                continue
            if col1.equals(df.iloc[:, j]):
                duplicate_positions.add(j)
    # Report in column order (not set order) so the result is reproducible;
    # dict.fromkeys de-duplicates repeated labels while preserving that order.
    ordered_names = (df.columns[j] for j in sorted(duplicate_positions))
    return list(dict.fromkeys(ordered_names))
# Find value-identical columns, report them, and drop them from the frame.
duplicate_columns = identify_duplicate_columns(df)
print(f"\nDuplicate columns found: {duplicate_columns}")
df = df.drop(columns=duplicate_columns)
remaining_column_count = df.shape[1]
print(f"Number of columns after removing duplicates: {remaining_column_count}")


Duplicate columns found: []
Number of columns after removing duplicates: 21


# Handling Missing Values

In [None]:
# Per-column count of missing entries, reported before any imputation is applied.
print("\nMissing values per column before imputation:")
missing_before = df.isna().sum()
print(missing_before)


Missing values per column before imputation:
ID                                               0
Date House was Sold                              0
Sale Price                                       4
No of Bedrooms                                   0
No of Bathrooms                                  4
Flat Area (in Sqft)                              9
Lot Area (in Sqft)                               9
No of Floors                                     0
Waterfront View                                  0
No of Times Visited                          19489
Condition of the House                           0
Overall Grade                                    0
Area of the House from Basement (in Sqft)        3
Basement Area (in Sqft)                          0
Age of House (in Years)                          0
Renovated Year                                   0
Zipcode                                          1
Latitude                                         1
Longitude                           

In [None]:
# Identify numerical and categorical columns.
# The target column is spelled 'Sale Price' (with a space) in this dataset. It must be
# kept out of the feature lists so it is not imputed, scaled or outlier-filtered
# as if it were an input feature.
TARGET_COL = 'Sale Price'

# Rows with no target value cannot be used for supervised training, and filling the
# target with a statistic would fabricate labels, so those rows are dropped instead.
# The index is rebuilt afterwards so later positional operations stay aligned.
if TARGET_COL in df.columns:
    df = df.dropna(subset=[TARGET_COL]).reset_index(drop=True)

numerical_cols = [
    col
    for col in df.select_dtypes(include=np.number).columns
    if col != TARGET_COL
]
categorical_cols = df.select_dtypes(include='object').columns.tolist()

In [None]:
# Impute missing values in numerical columns using the median.
# The filled column is assigned back to the frame explicitly: calling
# fillna(inplace=True) on `df[col]` acts on an intermediate object, which pandas
# deprecates and which leaves `df` unchanged once Copy-on-Write is enabled.
# NOTE(review): the medians are computed on the full dataset, before the train/test
# split performed later in the notebook.
for col in numerical_cols:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col] = df[col].fillna(median_val)
        print(f"Numerical column '{col}': Missing values imputed with median ({median_val}).")

In [None]:
# Impute missing values in categorical columns using the mode.
# The filled column is assigned back to the frame explicitly: calling
# fillna(inplace=True) on `df[col]` acts on an intermediate object, which pandas
# deprecates and which leaves `df` unchanged once Copy-on-Write is enabled.
# NOTE(review): in the recorded output 'No of Times Visited' is missing in 19489 of
# 21613 rows, so its mode comes from a small minority of rows. Check whether those
# blanks are really a distinct "no visits" category before mode-filling them.
for col in categorical_cols:
    if df[col].isnull().any():
        observed_modes = df[col].mode()
        if observed_modes.empty:
            # No observed value at all, hence no mode to fill with.
            print(f"Categorical column '{col}': No observed values; missing values left unchanged.")
            continue
        mode_val = observed_modes.iloc[0]
        df[col] = df[col].fillna(mode_val)
        print(f"Categorical column '{col}': Missing values imputed with mode ('{mode_val}').")

print("\nMissing values per column after imputation:")
print(df.isnull().sum())


Missing values per column after imputation:
ID                                           0
Date House was Sold                          0
Sale Price                                   0
No of Bedrooms                               0
No of Bathrooms                              0
Flat Area (in Sqft)                          0
Lot Area (in Sqft)                           0
No of Floors                                 0
Waterfront View                              0
No of Times Visited                          0
Condition of the House                       0
Overall Grade                                0
Area of the House from Basement (in Sqft)    0
Basement Area (in Sqft)                      0
Age of House (in Years)                      0
Renovated Year                               0
Zipcode                                      0
Latitude                                     0
Longitude                                    0
Living Area after Renovation (in Sqft)       0
Lot Area after 

# Scaling Numerical Variables

In [None]:
# Min-Max scale every column listed in `numerical_cols`, overwriting the originals.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the train/test
# split later in the notebook, so test rows influence the fitted min/max.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values
print("\nScaled numerical features using Min-Max scaling.")
scaled_preview = df[numerical_cols].head()
print(scaled_preview)


Scaled numerical features using Min-Max scaling.
         ID  Sale Price  No of Bedrooms  No of Bathrooms  Flat Area (in Sqft)  \
0  0.720103    0.019266        0.090909          0.12500             0.067170   
1  0.647853    0.060721        0.090909          0.28125             0.172075   
2  0.568795    0.013770        0.060606          0.12500             0.036226   
3  0.251157    0.069377        0.121212          0.37500             0.126038   
4  0.197333    0.057049        0.090909          0.25000             0.104906   

   Lot Area (in Sqft)  No of Floors  Overall Grade  \
0            0.003108           0.0       0.666667   
1            0.004072           0.4       0.666667   
2            0.005743           0.0       0.555556   
3            0.002714           0.0       0.666667   
4            0.004579           0.0       0.777778   

   Area of the House from Basement (in Sqft)  Basement Area (in Sqft)  \
0                                   0.097588                 0.00

# Encoding Categorical Variables

In [None]:
# Column-name buckets for the two encoding strategies used below (one-hot vs. label
# encoding). Deciding which bucket a column truly belongs in may need domain knowledge.
nominal_categorical_cols, ordinal_categorical_cols = [], []

In [None]:
# Partition the categorical columns by cardinality: more than 5 distinct values is
# assumed to suggest a nominal column (one-hot encoded), anything else is treated
# as potentially ordinal (label encoded).
# Both lists are rebuilt from scratch rather than appended to, so re-running this
# cell cannot accumulate duplicate column names.
cardinality = {col: df[col].nunique() for col in categorical_cols}
nominal_categorical_cols = [col for col in categorical_cols if cardinality[col] > 5]
ordinal_categorical_cols = [col for col in categorical_cols if cardinality[col] <= 5]
print("\nNominal categorical columns for One-Hot Encoding:", nominal_categorical_cols)
print("Potentially ordinal categorical columns for Label Encoding:", ordinal_categorical_cols)


Nominal categorical columns for One-Hot Encoding: ['Date House was Sold']
Potentially ordinal categorical columns for Label Encoding: ['Waterfront View', 'No of Times Visited', 'Condition of the House']


In [None]:
# One-Hot Encode nominal categorical columns.
# NOTE(review): the recorded output lists 'Date House was Sold' as the only nominal
# column; one-hot encoding raw date strings yields one indicator column per distinct
# date - confirm that this is intended.
encoder_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
if nominal_categorical_cols:
    encoded_nominal = encoder_ohe.fit_transform(df[nominal_categorical_cols])
    # Give the encoded frame the same index as `df`: concat(axis=1) aligns on index
    # labels, so a default 0..n-1 index would misalign rows (and introduce NaNs)
    # whenever rows were dropped from `df` earlier.
    encoded_nominal_df = pd.DataFrame(
        encoded_nominal,
        columns=encoder_ohe.get_feature_names_out(nominal_categorical_cols),
        index=df.index,
    )
    df = pd.concat([df.drop(columns=nominal_categorical_cols), encoded_nominal_df], axis=1)
    print("\nApplied One-Hot Encoding to nominal features.")
else:
    # Fitting the encoder on a frame with zero columns raises, so skip it.
    print("\nNo nominal features to One-Hot Encode.")
print(df.head())


Applied One-Hot Encoding to nominal features.
         ID  Sale Price  No of Bedrooms  No of Bathrooms  Flat Area (in Sqft)  \
0  0.720103    0.019266        0.090909          0.12500             0.067170   
1  0.647853    0.060721        0.090909          0.28125             0.172075   
2  0.568795    0.013770        0.060606          0.12500             0.036226   
3  0.251157    0.069377        0.121212          0.37500             0.126038   
4  0.197333    0.057049        0.090909          0.25000             0.104906   

   Lot Area (in Sqft)  No of Floors Waterfront View No of Times Visited  \
0            0.003108           0.0              No               Twice   
1            0.004072           0.4              No               Twice   
2            0.005743           0.0              No               Twice   
3            0.002714           0.0              No               Twice   
4            0.004579           0.0              No               Twice   

  Condition of 

In [None]:
# Label Encode potentially ordinal categorical columns.
# One LabelEncoder is fitted per column and kept in `label_encoders`, so every
# column's category -> code mapping remains available afterwards (for example for
# inverse_transform). Re-fitting a single shared encoder keeps only the mapping of
# the last column processed.
# NOTE(review): LabelEncoder numbers categories in sorted order of their values (in
# the recorded output 'Twice' is encoded as 3), which is not necessarily a meaningful
# rank - supply an explicit order if these columns are truly ordinal.
label_encoders = {}
encoder_le = LabelEncoder()  # stays defined even when there is nothing to encode
for col in ordinal_categorical_cols:
    encoder_le = LabelEncoder()
    df[col] = encoder_le.fit_transform(df[col])
    label_encoders[col] = encoder_le
print("\nApplied Label Encoding to potentially ordinal features.")
print(df[ordinal_categorical_cols].head())


Applied Label Encoding to potentially ordinal features.
   Waterfront View  No of Times Visited  Condition of the House
0                0                    3                       2
1                0                    3                       2
2                0                    3                       2
3                0                    3                       1
4                0                    3                       2


# Outlier Removal

In [None]:
# IQR-based outlier filtering, applied to one numerical column after another.
# A value is an outlier when it lies more than `outlier_threshold` IQRs below the
# first quartile or above the third quartile. Rows are dropped as soon as a column
# flags them, so each later column's quartiles come from the already-filtered frame.
outlier_threshold = 3
print("\nOutlier analysis using IQR:")
for col in numerical_cols:
    q1, q3 = (df[col].quantile(q) for q in (0.25, 0.75))
    spread = q3 - q1
    low_cut = q1 - outlier_threshold * spread
    high_cut = q3 + outlier_threshold * spread
    # Build the flag once and reuse it for both the count and the filter.
    is_outlier = (df[col] < low_cut) | (df[col] > high_cut)
    print(f"Column '{col}': Number of outliers = {int(is_outlier.sum())}")
    # Removing outliers (you might want to analyze these outliers further before removal)
    df = df[~is_outlier]
    print(f"Column '{col}': Shape after outlier removal = {df.shape}")


Outlier analysis using IQR:
Column 'ID': Number of outliers = 0
Column 'ID': Shape after outlier removal = (20000, 33)
Column 'Sale Price': Number of outliers = 402
Column 'Sale Price': Shape after outlier removal = (19598, 33)
Column 'No of Bedrooms': Number of outliers = 20
Column 'No of Bedrooms': Shape after outlier removal = (19578, 33)
Column 'No of Bathrooms': Number of outliers = 2
Column 'No of Bathrooms': Shape after outlier removal = (19576, 33)
Column 'Flat Area (in Sqft)': Number of outliers = 25
Column 'Flat Area (in Sqft)': Shape after outlier removal = (19551, 33)
Column 'Lot Area (in Sqft)': Number of outliers = 430
Column 'Lot Area (in Sqft)': Shape after outlier removal = (19121, 33)
Column 'No of Floors': Number of outliers = 0
Column 'No of Floors': Shape after outlier removal = (19121, 33)
Column 'Overall Grade': Number of outliers = 2
Column 'Overall Grade': Shape after outlier removal = (19119, 33)
Column 'Area of the House from Basement (in Sqft)': Number of o

# Train-Test Split

In [None]:
# Separate target variable
X = df.drop(columns=['Sale Price'])
y = df['Sale Price']
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTrain-Test Split:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


Train-Test Split:
X_train shape: (14608, 32)
X_test shape: (3653, 32)
y_train shape: (14608,)
y_test shape: (3653,)
