In [185]:
#import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [187]:
#load dataset
df = pd.read_csv(r"C:\Users\ronak\Downloads\Bengaluru_House_Data.csv")
print(df.head())
print("\n \n Shape of the data set : ",df.shape)

              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  

 
 Shape of the data set :  (13320, 9)


In [189]:
# remove the unwanted columns and drop rows that include Null elements
df = df.drop(['area_type', 'society', 'balcony', 'availability'], axis=1)
df = df.dropna()
print("Updated shape of the data set : ",df.shape)
print("\n\n",df.head())

Updated shape of the data set :  (13246, 5)


                    location       size total_sqft  bath   price
0  Electronic City Phase II      2 BHK       1056   2.0   39.07
1          Chikka Tirupathi  4 Bedroom       2600   5.0  120.00
2               Uttarahalli      3 BHK       1440   2.0   62.00
3        Lingadheeranahalli      3 BHK       1521   3.0   95.00
4                  Kothanur      2 BHK       1200   2.0   51.00


In [191]:
df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [193]:
def convert_sqft_to_num(x):
    try:
        if '-' in x:
            tokens = x.split('-')
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x.replace(',', ''))  # Handles commas
    except Exception as e:
        return None

In [195]:
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

# Remove entries with invalid total_sqft
df = df[df['total_sqft'].notnull()]

# Create price_per_sqft column
df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']

# Clean up the location column
df['location'] = df['location'].apply(lambda x: x.strip())

# Remove locations with less than 10 data points
location_stats = df['location'].value_counts()
locations_less_than_10 = location_stats[location_stats <= 10].index

df['location'] = df['location'].apply(lambda x: 'other' if x in locations_less_than_10 else x)

print(df.shape)  # Check the dimensions of the DataFrame
print(df.isnull().sum())  # Check for remaining null values
print(df.head())  # Preview the DataFrame


(13200, 7)
location          0
size              0
total_sqft        0
bath              0
price             0
bhk               0
price_per_sqft    0
dtype: int64
                   location       size  total_sqft  bath   price  bhk  \
0  Electronic City Phase II      2 BHK      1056.0   2.0   39.07    2   
1          Chikka Tirupathi  4 Bedroom      2600.0   5.0  120.00    4   
2               Uttarahalli      3 BHK      1440.0   2.0   62.00    3   
3        Lingadheeranahalli      3 BHK      1521.0   3.0   95.00    3   
4                  Kothanur      2 BHK      1200.0   2.0   51.00    2   

   price_per_sqft  
0     3699.810606  
1     4615.384615  
2     4305.555556  
3     6245.890861  
4     4250.000000  


In [197]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for location, subdf in df.groupby('location'):
        mean = np.mean(subdf.price_per_sqft)
        std = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft > (mean - std)) & (subdf.price_per_sqft <= (mean + std))]
        df_out = pd.concat([df_out, reduced_df], ignore_index=True)
    return df_out

In [199]:
df = remove_pps_outliers(df)
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,6 BHK,1200.0,6.0,125.0,6,10416.666667
3,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
4,1st Block Jayanagar,7 Bedroom,930.0,4.0,85.0,7,9139.784946


In [201]:
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < stats['mean']].index.values)
    return df.drop(exclude_indices, axis='index')


In [203]:
# Remove outliers
df = remove_pps_outliers(df)
print("After removing PPS outliers:", df.shape)
df = remove_bhk_outliers(df)
print("After removing BHK outliers:", df.shape)

# Final cleanup
df = df[df.bath < df.bhk + 2]
df = df.drop(['size', 'price_per_sqft'], axis=1)
df = pd.concat([df, pd.get_dummies(df.location).drop('other', axis=1)], axis=1)
df = df.drop('location', axis=1)

# Splitting the data
X = df.drop('price', axis=1)
y = df['price']
print("X shape:", X.shape)
print("y shape:", y.shape)

# Splitting the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training and evaluation
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)

After removing PPS outliers: (8341, 7)
After removing BHK outliers: (5764, 7)
X shape: (5708, 243)
y shape: (5708,)


In [205]:
print(f"Linear Regression Test Score: {lr_clf.score(X_test, y_test)}")

Linear Regression Test Score: 0.8633134362843073


In [207]:
def find_best_model_using_gridsearchcv(X, y):
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {}
        },
        'lasso': {
            'model': Lasso(),
            'params': {'alpha': [1, 2], 'selection': ['random', 'cyclic']}
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {'criterion': ['squared_error', 'friedman_mse'], 'splitter': ['best', 'random']}
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    for algo_name, config in algos.items():
        print(f"Training {algo_name}...")
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

# Call the function
best_model = find_best_model_using_gridsearchcv(X, y)
print(best_model)

Training linear_regression...
Training lasso...
Training decision_tree...
               model  best_score  \
0  linear_regression    0.872750   
1              lasso    0.705556   
2      decision_tree    0.757155   

                                         best_params  
0                                                 {}  
1                {'alpha': 1, 'selection': 'random'}  
2  {'criterion': 'friedman_mse', 'splitter': 'best'}  
