In [252]:
import pandas as pd    
import warnings
warnings.filterwarnings("ignore")

In [253]:
# Load the raw Bengaluru house data from disk and preview the first rows
raw_csv_path = "C:/Users/pruth/OneDrive/Desktop/Projects/Housing-Price-Prediction/Bengaluru_House_Data.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [254]:
# Summarise each column's dtype and non-null count to spot columns with missing data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


## Data Pre-Processing

In [255]:
# Count the rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep='first').sum()

529

In [256]:
# Remove exact duplicate rows, printing the shape before and after so the
# number of discarded rows is visible in the output.
# The result is rebound to `df` rather than mutated in place.
print('Before dropping duplicate rows', df.shape)
df = df.drop_duplicates()
print('After dropping duplicate rows', df.shape)

Before dropping unique rows (13320, 9)
After dropping unique rows (12791, 9)


In [257]:
# Count the missing (NaN) entries in every column
missing_values_count = df.isnull().sum()
print(missing_values_count)

area_type          0
availability       0
location           1
size              16
society         5328
total_sqft         0
bath              73
balcony          605
price              0
dtype: int64


In [258]:
# 'society' has too many missing values to be useful, so remove the whole column
df = df.drop(columns=['society'])

# The remaining gaps are few compared with the size of the data set, so simply
# discard every row that lacks a location, size, bath or balcony value
df = df.dropna(subset=['location', 'size', 'bath', 'balcony'])

In [259]:
# Re-check the per-column missing counts and the frame's shape after the drops
missing_values_count = df.isnull().sum()
print(missing_values_count, df.shape, sep='\n')

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64
(12185, 8)


In [260]:
# Pull the leading number out of 'size' (e.g. "2 BHK" -> 2, "4 Bedroom" -> 4)
# into an integer 'bhk' column, then discard the original text column
df['bhk'] = df['size'].str.split(' ').str[0].astype('int64')
df = df.drop(columns=['size'])
df.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200,2.0,1.0,51.0,2


In [261]:
# Some 'total_sqft' entries are ranges such as "2100 - 2850"; select those rows
# by looking for the "-" separator
has_dash = df['total_sqft'].str.contains("-")
filtered_df = df.loc[has_dash]

# Preview the first few range-valued rows
print(filtered_df.head())

                area_type   availability            location   total_sqft   
30   Super built-up  Area         19-Dec           Yelahanka  2100 - 2850  \
122  Super built-up  Area         18-Mar              Hebbal  3067 - 8156   
137  Super built-up  Area         19-Mar  8th Phase JP Nagar  1042 - 1105   
165  Super built-up  Area         18-Dec            Sarjapur  1145 - 1340   
188  Super built-up  Area  Ready To Move            KR Puram  1015 - 1540   

     bath  balcony    price  bhk  
30    4.0      0.0  186.000    4  
122   4.0      0.0  477.000    4  
137   2.0      0.0   54.005    2  
165   2.0      0.0   43.490    2  
188   2.0      0.0   56.800    2  


In [262]:
# Converting the range values into a single value from the 'total_sqft' column
def range_converter(x):
    """Convert one raw 'total_sqft' entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("1056"), a range written as "low - high"
        ("2100 - 2850"), or a value that is already numeric.

    Returns
    -------
    float or None
        The numeric value, the midpoint of the range, or None when the entry
        cannot be interpreted as either (e.g. "34.46Sq. Meter").
    """
    if not isinstance(x, str):
        # Already numeric (for instance when this step is re-run on converted
        # data) or a missing value: no string parsing is needed.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    parts = x.split('-')
    try:
        if len(parts) == 2:
            # "low - high" -> midpoint of the two bounds
            return (float(parts[0]) + float(parts[1])) / 2
        return float(x)
    except ValueError:
        # Malformed number or malformed range bound: treat as missing so the
        # caller can drop it, instead of aborting the whole conversion.
        return None
    
# Replace every 'total_sqft' entry with its numeric value and preview the result
df['total_sqft'] = df['total_sqft'].map(range_converter)
df.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2.0,1.0,51.0,2


In [263]:
# Entries that could not be converted are now missing; recount NaNs per column
# and show the frame's shape
missing_values_count = df.isnull().sum()
print(missing_values_count, df.shape, sep='\n')

area_type        0
availability     0
location         0
total_sqft      42
bath             0
balcony          0
price            0
bhk              0
dtype: int64
(12185, 8)


In [264]:
# Drop the rows whose 'total_sqft' is missing.
# `subset` is passed as a list of labels: a parenthesised string such as
# ('total_sqft') is a plain string, not a tuple, and is only accepted by pandas
# releases that allow a single label here.
df = df.dropna(subset=['total_sqft'])

In [265]:
# Final check: per-column missing counts and the shape of the cleaned frame
missing_values_count = df.isnull().sum()
print(missing_values_count, df.shape, sep='\n')

area_type       0
availability    0
location        0
total_sqft      0
bath            0
balcony         0
price           0
bhk             0
dtype: int64
(12143, 8)


In [266]:
# Preview the first five rows of the cleaned frame before saving it
df.head(n=5)

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2.0,1.0,51.0,2


In [267]:
# Persist the cleaned data set as CSV, leaving out the DataFrame index
clean_csv_path = "C:/Users/pruth/OneDrive/Desktop/Projects/Housing-Price-Prediction/Clean_Data.csv"
df.to_csv(clean_csv_path, index=False)