In [None]:
# Download the dataset from Google Drive; the argument passed to gdown is the Drive file ID.
! gdown  1qF3zvu_MFS2DWmN8pqWTG5aFXxGFQgJX


Downloading...
From: https://drive.google.com/uc?id=1qF3zvu_MFS2DWmN8pqWTG5aFXxGFQgJX
To: /content/house_listings.csv
  0% 0.00/1.92M [00:00<?, ?B/s]100% 1.92M/1.92M [00:00<00:00, 171MB/s]


In [None]:
#Import the necessary libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the listings CSV into a DataFrame and preview the first ten rows.
listings_csv = 'house_listings.csv'
df = pd.read_csv(listings_csv)
df.head(n=10)

Unnamed: 0,address,price,est_monthly_payment,bedrooms,bathrooms,square_footage,property_type,year_built,lot_size,price_per_sqft,parking,state,city,url
0,"269 Holley St, Biloxi, MS 39530",200000,,3,2.5,1250,Single-family,2013.0,5662.0,160,4.0,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Biloxi/269-Holley-St...
1,"132 Dukate St, Biloxi, MS 39530",350000,,2,2.0,1378,Single-family,1900.0,5662.0,254,1.0,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Biloxi/132-Dukate-St...
2,"1130 Beach Blvd #408, Biloxi, MS 39530",280000,,2,2.0,1517,Condo,1981.0,145926.0,185,,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Biloxi/1130-Beach-Bl...
3,"2350 Rue Maison, Biloxi, MS 39532",484000,,4,3.5,2687,Single-family,2004.0,10890.0,180,2.0,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Biloxi/2350-Rue-Mais...
4,"11512 Holly Bluff Cir, Biloxi, MS 39532",355000,,3,2.0,2043,Single-family,1996.0,44431.0,174,2.0,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Biloxi/11512-Holly-B...
5,"14231 Woodland Hills Dr, Biloxi, MS 39532",350000,,3,2.0,1954,Single-family,1993.0,339768.0,179,6.0,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Biloxi/14231-Woodlan...
6,"15171 Haversham Pl, D'Iberville, MS 39540",295000,,4,2.0,2547,Single-family,2006.0,9583.0,116,2.0,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Diberville/15171-Hav...
7,,327900,,5,3.0,2100,Single-family,2025.0,13503.0,156,,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Biloxi/9574-Reserve-...
8,"345 Bowen St, Biloxi, MS 39530",250000,,3,2.0,2219,Single-family,2007.0,5662.0,113,2.0,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Biloxi/345-Bowen-St-...
9,"179 Gardenia St, Biloxi, MS 39531",119000,,1,1.0,800,Single-family,1950.0,8712.0,149,2.0,Mississippi,"Biloxi, MS",https://www.redfin.com/MS/Biloxi/179-Gardenia-...


In [None]:
# Remove the estimated monthly payment column from the dataset.
df = df.drop(columns=['est_monthly_payment'])

In [None]:
# Handle missing values: drop unusable rows, then fill the gaps that remain.

# Rows whose square footage contains the '—' placeholder carry no usable value.
has_placeholder = df['square_footage'].astype(str).str.contains('—')

# Drop placeholder rows and rows missing square footage or any required field.
# The trailing .copy() makes the result an independent DataFrame, so the column
# assignment below does not write into a view of the original frame.
df = (
    df[~has_placeholder]
    .dropna(subset=['square_footage'])
    .dropna(subset=['address', 'property_type', 'price'])
    .copy()
)

# Fill missing parking values with 0.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# df['parking']: that chained in-place form operates on an intermediate object
# and, per the pandas FutureWarning, stops updating df in pandas 3.0.
df['parking'] = df['parking'].fillna(0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['parking'].fillna(0, inplace=True)


In [None]:
# Convert the listed columns to numeric dtypes; values that cannot be parsed become NaN.
numeric_columns = ['price', 'square_footage', 'bedrooms', 'bathrooms', 'price_per_sqft']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')



In [None]:
# Vacant-land rows with a 'square_footage' below this threshold are treated as
# acreage and converted to square feet.
# NOTE(review): the previous comment said "under 100" while the code compared
# against 500; the code's value is kept here -- confirm which one is intended.
ACRE_THRESHOLD = 500
SQFT_PER_ACRE = 43560

sqft = df['square_footage']
# The original row-wise check only converted float values, which for a numeric
# column amounts to the column having a float dtype; that condition is kept.
listed_in_acres = (
    pd.api.types.is_float_dtype(sqft)
    & (sqft < ACRE_THRESHOLD)
    & (df['property_type'] == 'Vacant land')
)
df['square_footage'] = sqft.mask(listed_in_acres, sqft * SQFT_PER_ACRE)

# Back-fill a missing price per square foot from price / square footage.
# A zero square footage is masked out first so the division yields NaN instead
# of an infinite value or an error; rows missing price or square footage also
# stay NaN, and existing price_per_sqft values are never overwritten.
nonzero_sqft = df['square_footage'].where(df['square_footage'] != 0)
computed_price_per_sqft = df['price'] / nonzero_sqft
df['price_per_sqft'] = df['price_per_sqft'].fillna(computed_price_per_sqft)

In [None]:
# Count the missing values remaining in each column.
df.isna().sum()

Unnamed: 0,0
address,0
price,0
bedrooms,0
bathrooms,0
square_footage,0
property_type,0
year_built,0
lot_size,0
price_per_sqft,0
parking,0


In [None]:
#Extract the city name from the address for the city column
def extract_city(address):
    """Return the city component of a comma-separated address.

    The city is taken to be the second-to-last comma-separated field, e.g.
    ``"2101 S 324 St #107, Federal Way, WA 98003"`` gives ``"Federal Way"``.

    Args:
        address: The address value. Anything that is not a ``str`` (such as
            NaN or None) is treated as having no city.

    Returns:
        The city with surrounding whitespace removed, or None when the value
        is not a string or has fewer than two comma-separated fields.
    """
    if not isinstance(address, str):
        return None

    parts = address.split(',')
    if len(parts) < 2:
        return None

    # The state and ZIP sit in the last field, so the second-to-last field is
    # already just the city. Return it whole: dropping its last word (as the
    # previous implementation did) truncated multi-word names such as
    # "Federal Way" to "Federal".
    return parts[-2].strip()

# Overwrite the city column with the city parsed from each address, then preview.
city_from_address = df['address'].map(extract_city)
df['city'] = city_from_address
display(df.head())

Unnamed: 0,address,price,bedrooms,bathrooms,square_footage,property_type,year_built,lot_size,price_per_sqft,parking,state,city,url
0,"269 Holley St, Biloxi, MS 39530",200000.0,3.0,2.5,1250.0,Single-family,2013.0,5662.0,160.0,4.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/269-Holley-St...
1,"132 Dukate St, Biloxi, MS 39530",350000.0,2.0,2.0,1378.0,Single-family,1900.0,5662.0,254.0,1.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/132-Dukate-St...
2,"1130 Beach Blvd #408, Biloxi, MS 39530",280000.0,2.0,2.0,1517.0,Condo,1981.0,145926.0,185.0,0.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/1130-Beach-Bl...
3,"2350 Rue Maison, Biloxi, MS 39532",484000.0,4.0,3.5,2687.0,Single-family,2004.0,10890.0,180.0,2.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/2350-Rue-Mais...
4,"11512 Holly Bluff Cir, Biloxi, MS 39532",355000.0,3.0,2.0,2043.0,Single-family,1996.0,44431.0,174.0,2.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/11512-Holly-B...


In [None]:
# Vacant land gets 0 bedrooms and 0 bathrooms; rows still missing a bathroom
# count after that are dropped.
is_vacant_land = df['property_type'].eq('Vacant land')
room_columns = ['bedrooms', 'bathrooms']
df.loc[is_vacant_land, room_columns] = 0
df = df.dropna(subset=['bathrooms'])

In [None]:
# Replace missing year_built values with 0.
year_built_missing = df['year_built'].isna()
df['year_built'] = df['year_built'].mask(year_built_missing, 0)

In [None]:
# Fill missing lot sizes with 0.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# df['lot_size']: that chained in-place form operates on an intermediate object
# and, per the pandas FutureWarning, stops updating df in pandas 3.0.
df['lot_size'] = df['lot_size'].fillna(0)

# Drop rows that are still missing square footage, bedrooms, or price.
# One dropna over all three columns removes the same rows as three separate calls.
df = df.dropna(subset=['square_footage', 'bedrooms', 'price'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['lot_size'].fillna(0, inplace=True)


In [None]:
# Count the missing values remaining in each column after cleaning.
df.isna().sum()


Unnamed: 0,0
address,0
price,0
bedrooms,0
bathrooms,0
square_footage,0
property_type,0
year_built,0
lot_size,0
price_per_sqft,0
parking,0


In [None]:
# Normalise the property type labels.

# Exclude listings whose property type is 'Parking'.
not_parking = df['property_type'] != 'Parking'
df = df[not_parking]

# Fold the variant labels into their base category in a single replace call.
property_type_aliases = {
    'Condo (co-op)': 'Condo',
    'Single Family Residence, 24 - Floating Home/On-Water Res': 'Single-family',
    'Townhome (co-op)': 'Townhome',
}
df['property_type'] = df['property_type'].replace(property_type_aliases)

df

Unnamed: 0,address,price,bedrooms,bathrooms,square_footage,property_type,year_built,lot_size,price_per_sqft,parking,state,city,url
0,"269 Holley St, Biloxi, MS 39530",200000.0,3.0,2.5,1250.0,Single-family,2013.0,5662.0,160.0,4.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/269-Holley-St...
1,"132 Dukate St, Biloxi, MS 39530",350000.0,2.0,2.0,1378.0,Single-family,1900.0,5662.0,254.0,1.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/132-Dukate-St...
2,"1130 Beach Blvd #408, Biloxi, MS 39530",280000.0,2.0,2.0,1517.0,Condo,1981.0,145926.0,185.0,0.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/1130-Beach-Bl...
3,"2350 Rue Maison, Biloxi, MS 39532",484000.0,4.0,3.5,2687.0,Single-family,2004.0,10890.0,180.0,2.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/2350-Rue-Mais...
4,"11512 Holly Bluff Cir, Biloxi, MS 39532",355000.0,3.0,2.0,2043.0,Single-family,1996.0,44431.0,174.0,2.0,Mississippi,Biloxi,https://www.redfin.com/MS/Biloxi/11512-Holly-B...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10717,"2101 S 324 St #107, Federal Way, WA 98003",169000.0,2.0,2.0,1248.0,Manufactured,2020.0,2800.0,135.0,2.0,Washington,Federal,https://www.redfin.com/WA/Federal-Way/2101-S-3...
10718,"33010 17th Pl S Unit A-108, Federal Way, WA 98003",250000.0,2.0,1.0,774.0,Condo,1978.0,0.0,323.0,1.0,Washington,Federal,https://www.redfin.com/WA/Federal-Way/33010-17...
10719,"2611 S 288th St #61, Federal Way, WA 98003",142500.0,3.0,2.0,1568.0,Manufactured,1990.0,0.0,91.0,2.0,Washington,Federal,https://www.redfin.com/WA/Federal-Way/2611-S-2...
10720,"31867 48th Cir SW Unit 14A, Federal Way, WA 98023",598000.0,2.0,2.0,1670.0,Condo,2000.0,0.0,358.0,1.0,Washington,Federal,https://www.redfin.com/WA/Federal-Way/31867-48...


In [None]:
# Write the cleaned listings to disk without the DataFrame index.
cleaned_csv = 'cleaned_house_listings.csv'
df.to_csv(cleaned_csv, index=False)