In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [78]:
data = pd.read_csv('/content/sample_data/Bengaluru_House_Data.csv')

In [79]:
#show the first five rows of the dataframe
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [80]:
#number of rows and columns in the dataframe
data.shape

(13320, 9)

In [81]:
#information about the dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [82]:
#get informations about the column
#how many types of rows in a column
for column in data.columns:
  print(data[column].value_counts())
  print("*"*20)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
********************
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
********************
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
********************
2 BHK         5199
3 BHK        

In [83]:
#find null values
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [84]:
#drop society, area_type, avalability, balcony column
data.drop(columns = ['area_type', 'availability', 'society', 'balcony'], inplace = True)

In [85]:
#show the statistical measurment
data.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


# **Data Cleaning**

In [86]:
#types of rows on location column
data['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [87]:
#fill up missing values in location column
data['location'] = data['location'].fillna('Sarjapur  Road')

In [88]:
data['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: size, dtype: int64

In [89]:
#fill the missing values with 2 BHK
data['size'] = data['size'].fillna('2 BHK')

In [90]:
#filing bath column missing values with median
data['bath'] = data['bath'].fillna(data['bath'].median())

In [91]:
# Convert the 'size' column to strings
data['size'] = data['size'].astype(str)

# Extract the number from the 'size' column and create the 'bhk' column
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

In [92]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [93]:
data[data.bhk > 20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [94]:
#show different types of rows on total_sqft column
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [95]:
# Define a function to convert the values in the 'total_sqft' column
def convertRange(x):
    # Split the value by "-" to handle range values
    temp = x.split("-")

    # If there are two values in the split, average them
    if len(temp) == 2:
        # Use float() to convert the individual values to float, average them, and return the result
        return (float(temp[0]) + float(temp[1])) / 2.0

    # If it's a single value, try converting it to float
    try:
        return float(x)
    except:
        return None  # Handle any exceptions and return None in case of errors


In [96]:
# Apply the conversion function to the 'total_sqft' column
data['total_sqft'] = data['total_sqft'].apply(convertRange)


In [97]:
#make a new column price_per_sqft
data['price_per_sqft'] = data['price']*100000/data['total_sqft']

In [98]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [99]:
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [100]:
#location has too many types length 1305
data['location'].value_counts()

Whitefield                        540
Sarjapur  Road                    400
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64

In [101]:
#if location < 10 write other
#reduce the location
data['location'] = data['location'].apply(lambda x:x.strip())
location_count = data['location'].value_counts()

In [102]:
location_count

Whitefield                        541
Sarjapur  Road                    400
Electronic City                   304
Kanakpura Road                    273
Thanisandra                       237
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1294, dtype: int64

In [103]:
location_count_less_10 = location_count[location_count <= 10]
location_count_less_10

Dairy Circle                      10
Nagappa Reddy Layout              10
Basapura                          10
1st Block Koramangala             10
Sector 1 HSR Layout               10
                                  ..
Bapuji Layout                      1
1st Stage Radha Krishna Layout     1
BEML Layout 5th stage              1
singapura paradise                 1
Abshot Layout                      1
Name: location, Length: 1053, dtype: int64

In [104]:
data['location'] = data['location'].apply(lambda x: 'other' if x in location_count_less_10 else x)

In [105]:
data['location'].value_counts()

other                 2885
Whitefield             541
Sarjapur  Road         400
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: location, Length: 242, dtype: int64

In [106]:
#reduce length 242 of location 242

In [107]:
(data['total_sqft']/data['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [108]:
#remove less than 300 data
data = data[((data['total_sqft']/data['bhk']) >= 300)]
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [109]:
data.shape

(12530, 7)

In [110]:
data.price_per_sqft.describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [111]:
# Define a function to remove outliers from the DataFrame based on 'price_per_sqft' for each location.
def remove_outlier(df):
    # Create an empty DataFrame to store the filtered data
    df_output = pd.DataFrame()

    # Group the input DataFrame 'df' by the 'location' column
    for key, subdf in df.groupby('location'):
        # Calculate the mean and standard deviation of 'price_per_sqft' for each location
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)

        # Generate a DataFrame containing data points within one standard deviation from the mean
        gen_df = subdf[(subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))]

        # Concatenate the filtered data for this location with the overall 'df_output'
        df_output = pd.concat([df_output, gen_df], ignore_index=True)

    # Return the DataFrame 'df_output' with outliers removed
    return df_output

# Call the 'remove_outlier' function to filter the 'data' DataFrame
data = remove_outlier(data)

# Describe the 'data' DataFrame to see the statistics after removing outliers
data.describe()


Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [112]:
data.bhk.describe()

count    10301.000000
mean         2.574896
std          0.897649
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max         16.000000
Name: bhk, dtype: float64

Certainly! The code you provided is intended to remove outliers in a dataset with a focus on the 'bhk' (bedroom, hall, and kitchen) column based on the 'price_per_sqft' and 'location' columns. Let me explain it step by step:

1. **bhk_outlier_remover Function Definition:**
   ```python
   def bhk_outlier_remover(df):
   ```
   This defines a Python function named `bhk_outlier_remover` that takes a DataFrame `df` as an input.

2. **Initialization of `exclude_indices`:**
   ```python
   exclude_indices = np.array([])
   ```
   It initializes an empty NumPy array to store the indices of rows to exclude as outliers.

3. **Grouping by Location:**
   ```python
   for location, location_df in df.groupby('location'):
   ```
   It loops through the DataFrame `df`, grouping it by the 'location' column. This is done to process data for each location separately.

4. **Initialization of `bhk_stats`:**
   ```python
   bhk_stats = {}
   ```
   Inside the location loop, an empty dictionary `bhk_stats` is initialized to store statistics related to 'price_per_sqft' for different 'bhk' values.

5. **Iterating Through 'bhk' Values in Location Data:**
   ```python
   for bhk, bhk_df in location_df.groupby('location'):
   ```
   Within each location, it iterates through the 'bhk' values in the location DataFrame. The goal is to calculate statistics for 'price_per_sqft' for different 'bhk' configurations within the location.

6. **Calculating Statistics for 'price_per_sqft' by 'bhk' Value:**
   ```python
   bhk_stats[bhk] = {
       'mean': np.mean(bhk_df.price_per_sqft),
       'std': np.std(bhk_df.price_per_sqft),
       'count': bhk_df.shape[0]
   }
   ```
   For each 'bhk' value, it calculates statistics such as the mean and standard deviation of 'price_per_sqft' and counts the number of rows.

7. **Iterating Through 'bhk' Values Again:**
   ```python
   for bhk, bhk_df in location_df.groupby('bhk'):
   ```
   It iterates through the 'bhk' values again within the same location DataFrame.

8. **Checking for Outliers and Marking for Removal:**
   ```python
   stats = bhk_stats.get(bhk - 1)
   if stats and stats['count'] > 5:
       exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < stats['mean']].index.values)
   ```
   For each 'bhk' value within the location, it checks if the previous 'bhk' value has statistics and if the count is greater than 5. If these conditions are met, it identifies rows where 'price_per_sqft' is less than the mean of the previous 'bhk' value as outliers. It appends the indices of these outliers to the `exclude_indices` array.

9. **Return Filtered DataFrame:**
   ```python
   return df.drop(exclude_indices, axis='index')
   ```
   After processing all locations and 'bhk' values, it returns the original DataFrame with the identified outliers removed.

10. **Calling the Function:**
    ```python
    data = bhk_outlier_remover(data)
    ```
    Finally, the function is called with the 'data' DataFrame to remove outliers based on the defined logic.

This code aims to filter out rows in the DataFrame where the 'price_per_sqft' is significantly different from the mean of the previous 'bhk' value, but only if the count of data points is sufficient (greater than 5) for that 'bhk' value.

In [113]:
# Define a function to remove outliers in the 'bhk' column based on 'price_per_sqft' and 'location'.
def bhk_outlier_remover(df):
    exclude_indices = np.array([])

    # Loop through the DataFrame grouped by 'location'
    for location, location_df in df.groupby('location'):
        bhk_stats = {}

        # Iterate through each 'bhk' value in the location DataFrame
        for bhk, bhk_df in location_df.groupby('location'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]  # Fix typo ('shpe' to 'shape')
            }

        # Iterate through 'bhk' values in the location DataFrame again
        for bhk, bhk_df in location_df.groupby('bhk'):
            # Get statistics for the previous 'bhk' value (bhk-1)
            stats = bhk_stats.get(bhk - 1)

            # Check if stats exist and the count is greater than 5
            if stats and stats['count'] > 5:
                # Find indices of rows with 'price_per_sqft' less than the previous 'bhk' mean
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < stats['mean']].index.values)

    # Return the DataFrame with the identified outliers removed
    return df.drop(exclude_indices, axis='index')

In [114]:
# Call the 'bhk_outlier_remover' function to filter the 'data' DataFrame
data = bhk_outlier_remover(data)

In [115]:
data.shape

(10301, 7)

In [116]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668


In [117]:
#drop size and price_per_sqft column
data.drop(columns = ['size', 'price_per_sqft'], inplace = True)

# **Cleaned Data**

In [118]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [119]:
data.to_csv('Cleaned_data.csv')

In [120]:
X = data.drop(columns = ['price'])
y = data['price']

In [121]:
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [123]:
print(X_train.shape)
print(X_test.shape)

(8240, 4)
(2061, 4)


# **Applying Linear Regression**

In [124]:
# Define the column transformer with a list of transformers
column_trans = make_column_transformer(
    (OneHotEncoder(sparse=False), ['location']),  # One-hot encode 'location' column
    remainder='passthrough'  # Keep the remaining columns unchanged
)

In [125]:
# Create a pipeline that includes feature scaling, one-hot encoding, and linear regression
pipe = make_pipeline(column_trans, StandardScaler(), LinearRegression())

In [126]:
pipe.fit(X_train, y_train)



In [127]:
y_pred_lr = pipe.predict(X_test)

In [128]:
r2_score(y_test, y_pred_lr)

0.8298802052293621

# **Apply Lasso**

In [133]:
pipe = make_pipeline(column_trans, StandardScaler(), Lasso())

In [134]:
pipe.fit(X_train, y_train)



In [136]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test, y_pred_lasso)

0.8225976333132023

# **Applying Ridge**

In [137]:
pipe = make_pipeline(column_trans, StandardScaler(), Ridge())

In [138]:
pipe.fit(X_train, y_train)



In [139]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test, y_pred_ridge)

0.8298734741832591

In [140]:
print("No Regularization", r2_score(y_test, y_pred_lr))
print("Lasso",r2_score(y_test, y_pred_lasso))
print("Ridge", r2_score(y_test, y_pred_ridge))

No Regularization 0.8298802052293621
Lasso 0.8225976333132023
Ridge 0.8298734741832591


In [141]:
import pickle

In [143]:
pickle.dump(pipe, open("RidgeModel.pkl",'wb'))

In [144]:
from google.colab import files

# Specify the file name and path
file_name = "RidgeModel.pkl"

# Use the files.download function to generate a download link
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>