In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load dataset
df1 = pd.read_csv("Bengaluru_House_Data.csv")

# Unique values and value counts for area_type
print(df1['area_type'].unique())
print(df1['area_type'].value_counts())

# Drop unnecessary columns
df2 = df1.drop(['area_type', 'society', 'balcony', 'availability'], axis='columns')

# Check for missing values
print(df2.isnull().sum())

# Drop rows with missing values
df3 = df2.dropna()
print(df3.isnull().sum())

# Extract bhk from size
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))
print(df3['bhk'].unique())

# Filter out bhk values greater than 20
print(df3[df3.bhk > 20])


['Super built-up  Area' 'Plot  Area' 'Built-up  Area' 'Carpet  Area']
area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64
location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64
[ 2  4  3  6  1  8  7  5 11  9 27 10 19 16 43 14 12 13 18]
                       location        size total_sqft  bath  price  bhk
1718  2Electronic City Phase II      27 BHK       8000  27.0  230.0   27
4684                Munnekollal  43 Bedroom       2400  40.0  660.0   43


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))


In [2]:

# Function to check if a value can be converted to float
def is_float(x):
    try:
        float(x)
        return True
    except (ValueError, TypeError):
        return False

# Identify rows where total_sqft is not float
print(df3[~df3['total_sqft'].apply(is_float)].head(10))

# Function to convert sqft values to numeric
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:  # Range case
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            return None
    try:
        return float(x)  # Single value case
    except ValueError:
        return None

# Example conversion
print(convert_sqft_to_num('2100 - 2850'))

# Apply conversion to total_sqft and filter out nulls
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
df4 = df4[df4['total_sqft'].notnull()]

# Calculate price per sqft
df5 = df4.copy()
df5['price_per_sqft'] = df5['price'] * 100000 / df5['total_sqft']

# Export cleaned dataset
df5.to_csv("do_not_download.csv", index=False)

# Clean location data
df5['location'] = df5['location'].apply(lambda x: x.strip())
location_stats = df5['location'].value_counts(ascending=False)

# Combine less frequent locations
location_stats_less_than_10 = location_stats[location_stats <= 10]
df5['location'] = df5['location'].apply(lambda x: 'other' if x in location_stats_less_than_10 else x)

# Remove unrealistic sqft per bhk values
df5 = df5[~(df5['total_sqft'] / df5['bhk'] < 300)]

# Outlier detection and removal using IQR for total_sqft
Q1 = np.percentile(df5['total_sqft'], 25.)
Q3 = np.percentile(df5['total_sqft'], 75.)
IQR = Q3 - Q1
ll = Q1 - (1.5 * IQR)
ul = Q3 + (1.5 * IQR)
df5 = df5[~((df5['total_sqft'] < ll) | (df5['total_sqft'] > ul))]

# Repeat for bath, price, bhk, and price_per_sqft
for col in ['bath', 'price', 'bhk', 'price_per_sqft']:
    Q1 = np.percentile(df5[col], 25.)
    Q3 = np.percentile(df5[col], 75.)
    IQR = Q3 - Q1
    ll = Q1 - (1.5 * IQR)
    ul = Q3 + (1.5 * IQR)
    df5 = df5[~((df5[col] < ll) | (df5[col] > ul))]

# Prepare data for model training
X = df5.drop(['price'], axis='columns')
y = df5['price']

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Print shapes of training and testing sets
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


               location       size      total_sqft  bath    price  bhk
30            Yelahanka      4 BHK     2100 - 2850   4.0  186.000    4
122              Hebbal      4 BHK     3067 - 8156   4.0  477.000    4
137  8th Phase JP Nagar      2 BHK     1042 - 1105   2.0   54.005    2
165            Sarjapur      2 BHK     1145 - 1340   2.0   43.490    2
188            KR Puram      2 BHK     1015 - 1540   2.0   56.800    2
410             Kengeri      1 BHK  34.46Sq. Meter   1.0   18.500    1
549         Hennur Road      2 BHK     1195 - 1440   2.0   63.770    2
648             Arekere  9 Bedroom       4125Perch   9.0  265.000    9
661           Yelahanka      2 BHK     1120 - 1145   2.0   48.130    2
672        Bettahalsoor  4 Bedroom     3090 - 5002   4.0  445.000    4
2475.0
X_train shape: (8072, 6), y_train shape: (8072,)
X_test shape: (2018, 6), y_test shape: (2018,)
