In [274]:
import pandas as pd
import numpy as np


In [275]:
# Loading in the dataset and doing an initial check
df = pd.read_csv('rentfaster.csv')

df.head()
df.shape
df.columns
df.info()
df.describe()
df.sample(n=5, random_state=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25771 entries, 0 to 25770
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rentfaster_id      25771 non-null  int64  
 1   city               25771 non-null  object 
 2   province           25771 non-null  object 
 3   address            25646 non-null  object 
 4   latitude           25771 non-null  float64
 5   longitude          25771 non-null  float64
 6   lease_term         25725 non-null  object 
 7   type               25771 non-null  object 
 8   price              25771 non-null  float64
 9   beds               25639 non-null  object 
 10  baths              25637 non-null  object 
 11  sq_feet            21659 non-null  object 
 12  link               25771 non-null  object 
 13  furnishing         25771 non-null  object 
 14  availability_date  25759 non-null  object 
 15  smoking            23069 non-null  object 
 16  cats               255

Unnamed: 0,rentfaster_id,city,province,address,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,link,furnishing,availability_date,smoking,cats,dogs
13024,570328,Calgary,Alberta,Herron Walk NE,51.190039,-114.052503,Long Term,Basement,1300.0,1 Bed,1.0,600,/ab/calgary/rentals/basement/1-bedroom/livings...,Unfurnished,July 01,Non-Smoking,True,False
4691,569598,Calgary,Alberta,25 Walgrove Walk Southeast,50.866188,-114.02724,Long Term,Apartment,2200.0,2 Beds,2.0,1100,/ab/calgary/rentals/apartment/2-bedrooms/walde...,Unfurnished,Call for Availability,Non-Smoking,False,False
37,563675,Airdrie,Alberta,171 Baneberry Way SW,51.292034,-114.050214,12 months,House,2800.0,3 Beds,2.5,2100,/ab/airdrie/rentals/house/3-bedrooms/non-smoki...,Unfurnished,Immediate,Non-Smoking,False,False
24693,315466,Montréal,Quebec,520 rue de Gaspé,45.456078,-73.548578,Long Term,Apartment,1449.0,1 Bed,1.0,750,/qc/montreal/rentals/apartment/1-bedroom/pet-f...,Unfurnished,August 01,Non-Smoking,True,True
20992,383122,Toronto,Ontario,44 Lillian Street,43.705655,-79.393608,Long Term,Apartment,3525.0,2 Beds,2.0,828,/on/toronto/rentals/apartment/1-bedroom/pet-fr...,Unfurnished,July 31,Non-Smoking,True,True


In [276]:
# Checking for null values
df.isnull().sum()

# Removing the null values
df.dropna(inplace=True)
df.isnull().sum()

rentfaster_id        0
city                 0
province             0
address              0
latitude             0
longitude            0
lease_term           0
type                 0
price                0
beds                 0
baths                0
sq_feet              0
link                 0
furnishing           0
availability_date    0
smoking              0
cats                 0
dogs                 0
dtype: int64

In [277]:
# Dropping duplicate rows
df.drop_duplicates(inplace=True)

In [278]:
# Checking what values are in the 'beds' column
df['beds'].unique()

array(['2 Beds', '3 Beds', 'Studio', '1 Bed', '5 Beds', '4 Beds',
       '6 Beds', '8 Beds', '7 Beds', '9 Beds'], dtype=object)

In [279]:
# Removing strings from beds column
df['beds'] = df['beds'].str.replace('Bed', '')
df['beds'] = df['beds'].str.replace('s', '')
df['beds'] = df['beds'].str.replace('Studio', '0')
df['beds'] = df['beds'].str.replace(' ', '')

df['beds'].unique()

array(['2', '3', '0', '1', '5', '4', '6', '8', '7', '9'], dtype=object)

In [280]:
# Converting beds column to numeric (int) and checking for any non-numeric values
df['beds'] = pd.to_numeric(df['beds'], errors='coerce')
df['beds'].unique()

array([2, 3, 0, 1, 5, 4, 6, 8, 7, 9])

In [281]:
df.sample(n=5, random_state=1)

Unnamed: 0,rentfaster_id,city,province,address,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,link,furnishing,availability_date,smoking,cats,dogs
22803,508616,Côte Saint-Luc,Quebec,5885 Ave. Marc Chagall,45.480288,-73.663982,Long Term,Apartment,2450.0,2,1.0,1005,/qc/c-te-saint-luc/rentals/apartment/1-bedroom...,Unfurnished,July 01,Non-Smoking,True,True
3432,406910,Calgary,Alberta,323 5 Avenue Northeast,51.056884,-114.056342,Long Term,Apartment,1625.0,1,1.0,578,/ab/calgary/rentals/apartment/1-bedroom/cresce...,Unfurnished,No Vacancy,Non-Smoking,False,False
14871,569577,Victoria,British Columbia,45 Gorge Rd E,48.443404,-123.387582,Long Term,Apartment,3100.0,2,2.0,841,/bc/victoria/rentals/apartment/1-bedroom/domin...,Unfurnished,Immediate,Non-Smoking,True,True
4276,530963,Calgary,Alberta,Tuscany,51.13077,-114.254658,Long Term,House,4100.0,5,3.5,2400,/ab/calgary/rentals/house/5-bedrooms/tuscany/p...,Unfurnished,August 01,Non-Smoking,True,True
5388,437811,Edmonton,Alberta,8122 106 ST NW,53.517723,-113.503495,Long Term,Apartment,1719.0,1,1.0,635,/ab/edmonton/rentals/apartment/1-bedroom/old-s...,Unfurnished,Immediate,Non-Smoking,True,True


In [282]:
# Dropping unnecessary columns
df.drop(columns=['link', 'furnishing', 'cats', 'dogs', 'availability_date'], inplace=True)
df.sample(n=5, random_state=1)

Unnamed: 0,rentfaster_id,city,province,address,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,smoking
22803,508616,Côte Saint-Luc,Quebec,5885 Ave. Marc Chagall,45.480288,-73.663982,Long Term,Apartment,2450.0,2,1.0,1005,Non-Smoking
3432,406910,Calgary,Alberta,323 5 Avenue Northeast,51.056884,-114.056342,Long Term,Apartment,1625.0,1,1.0,578,Non-Smoking
14871,569577,Victoria,British Columbia,45 Gorge Rd E,48.443404,-123.387582,Long Term,Apartment,3100.0,2,2.0,841,Non-Smoking
4276,530963,Calgary,Alberta,Tuscany,51.13077,-114.254658,Long Term,House,4100.0,5,3.5,2400,Non-Smoking
5388,437811,Edmonton,Alberta,8122 106 ST NW,53.517723,-113.503495,Long Term,Apartment,1719.0,1,1.0,635,Non-Smoking


In [283]:
# Checking smoking column values
df['smoking'].unique()


array(['Non-Smoking', 'Smoke Free Building', 'Negotiable',
       'Smoking Allowed'], dtype=object)

In [284]:
# Removing unwanted values from smoking column
df = df.drop(df[df['smoking'] == 'Negotiable'].index)
df = df.drop(df[df['smoking'] == 'Smoking Allowed'].index)

df['smoking'].unique()

array(['Non-Smoking', 'Smoke Free Building'], dtype=object)

In [285]:
# Dropping unnecessary columns
df.drop(columns=['smoking'], inplace=True)
df.sample(n=5, random_state=1)

Unnamed: 0,rentfaster_id,city,province,address,latitude,longitude,lease_term,type,price,beds,baths,sq_feet
1125,367078,Calgary,Alberta,2304 26 Avenue Northwest,51.076323,-114.113219,Long Term,Main Floor,2000.0,3,1,950
20594,376808,Toronto,Ontario,25 Montgomery Avenue,43.709298,-79.399749,Long Term,Apartment,3894.0,2,2,963
5832,554875,Edmonton,Alberta,11009 109A Avenue,53.55548,-113.511683,Long Term,Apartment,2095.0,2,2,983
25545,407642,Saskatoon,Saskatchewan,"111 111 St W, Saskatoon",52.139496,-106.601068,Long Term,Apartment,1375.0,2,1,732
8816,334193,Wetaskiwin,Alberta,3901-3933 54A St,52.958335,-113.385347,Long Term,Townhouse,1299.0,2,1,870


In [286]:
df['type'].unique()

array(['Townhouse', 'Apartment', 'Main Floor', 'Basement', 'House',
       'Condo Unit', 'Room For Rent', 'Duplex', 'Loft', 'Vacation Home',
       'Acreage', 'Mobile'], dtype=object)

In [287]:
# Removing unwanted values from type column
df = df.drop(df[df['type'] == 'Mobile'].index)
df = df.drop(df[df['type'] == 'Vacation Home'].index)
df = df.drop(df[df['type'] == 'Acreage'].index)

df['type'].unique()

array(['Townhouse', 'Apartment', 'Main Floor', 'Basement', 'House',
       'Condo Unit', 'Room For Rent', 'Duplex', 'Loft'], dtype=object)

In [288]:
df['baths'].unique()

array(['2.5', '1', '2', '1.5', '3.5', '4', '3', '5', 'none', '4.5', '7.5',
       '5.5', '6', '6.5', '0', '7'], dtype=object)

In [289]:
# Removing unwanted values from baths column (e.g., 'none')
df = df.drop(df[df['baths'] == 'none'].index)
df['baths'].unique()

array(['2.5', '1', '2', '1.5', '3.5', '4', '3', '5', '4.5', '7.5', '5.5',
       '6', '6.5', '0', '7'], dtype=object)

In [290]:
# Converting baths column to numeric (int)
df['baths'] = pd.to_numeric(df['baths'], errors='coerce')
df['baths'].unique()

array([2.5, 1. , 2. , 1.5, 3.5, 4. , 3. , 5. , 4.5, 7.5, 5.5, 6. , 6.5,
       0. , 7. ])

In [291]:
# Removing unwanted values from baths column (e.g., 0)
df = df.drop(df[df['baths'] == 0].index)
df['baths'] = df['baths'].astype(int)
df['baths'].unique()

array([2, 1, 3, 4, 5, 7, 6])

In [292]:
df.sample(n=5, random_state=1)

Unnamed: 0,rentfaster_id,city,province,address,latitude,longitude,lease_term,type,price,beds,baths,sq_feet
1287,543315,Calgary,Alberta,239 15 Avenue Northeast,51.065832,-114.057865,Long Term,House,1875.0,2,1,850
8259,479456,Grande Prairie,Alberta,9702 Prairie Road,55.153757,-118.787108,Long Term,Apartment,1345.0,2,1,873
3046,572511,Calgary,Alberta,850 Belmont Drive Southwest,50.863674,-114.064516,Long Term,Townhouse,2600.0,3,2,1400
24915,315477,Quebec City,Quebec,100-850 Rue Laudance,46.761221,-71.335908,Long Term,Apartment,1599.0,1,1,884
19124,568239,Ottawa,Ontario,1291 Summerville Avenue,45.377126,-75.729401,Long Term,Apartment,2545.0,2,2,930


In [293]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13928 entries, 0 to 25770
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   rentfaster_id  13928 non-null  int64  
 1   city           13928 non-null  object 
 2   province       13928 non-null  object 
 3   address        13928 non-null  object 
 4   latitude       13928 non-null  float64
 5   longitude      13928 non-null  float64
 6   lease_term     13928 non-null  object 
 7   type           13928 non-null  object 
 8   price          13928 non-null  float64
 9   beds           13928 non-null  int64  
 10  baths          13928 non-null  int64  
 11  sq_feet        13928 non-null  object 
dtypes: float64(3), int64(3), object(6)
memory usage: 1.4+ MB


In [294]:
df['sq_feet'].unique()

array(['1403', '1496', '1180', ..., '260', '286', '334'], dtype=object)

In [295]:
# Converting baths column to numeric (int)
df['sq_feet'] = pd.to_numeric(df['sq_feet'], errors='coerce')
df['sq_feet'].unique()

array([1403., 1496., 1180., ...,  260.,  286.,  334.])

In [296]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13928 entries, 0 to 25770
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   rentfaster_id  13928 non-null  int64  
 1   city           13928 non-null  object 
 2   province       13928 non-null  object 
 3   address        13928 non-null  object 
 4   latitude       13928 non-null  float64
 5   longitude      13928 non-null  float64
 6   lease_term     13928 non-null  object 
 7   type           13928 non-null  object 
 8   price          13928 non-null  float64
 9   beds           13928 non-null  int64  
 10  baths          13928 non-null  int64  
 11  sq_feet        13671 non-null  float64
dtypes: float64(4), int64(3), object(5)
memory usage: 1.4+ MB


In [297]:
df['lease_term'].unique()

array(['Long Term', 'Negotiable', 'Short Term', '12 months', 'months',
       '6 months'], dtype=object)

In [298]:
# Renaming values in lease_term column
df['lease_term'] = df['lease_term'].str.replace('12 months', '1 year')
df['lease_term'] = df['lease_term'].str.replace('6 months', '0.5 year')
df['lease_term'] = df['lease_term'].str.replace('months', 'Month to Month')

df['lease_term'].unique()

array(['Long Term', 'Negotiable', 'Short Term', '1 year',
       'Month to Month', '0.5 year'], dtype=object)

In [299]:
# Final check (checking for null values again)

df.isnull().sum()

rentfaster_id      0
city               0
province           0
address            0
latitude           0
longitude          0
lease_term         0
type               0
price              0
beds               0
baths              0
sq_feet          257
dtype: int64

In [300]:
df.dtypes

rentfaster_id      int64
city              object
province          object
address           object
latitude         float64
longitude        float64
lease_term        object
type              object
price            float64
beds               int64
baths              int64
sq_feet          float64
dtype: object

In [301]:
# Checking for negative values in the price column
df[df["price"] < 0]

Unnamed: 0,rentfaster_id,city,province,address,latitude,longitude,lease_term,type,price,beds,baths,sq_feet


In [302]:
# Exporting the cleaned dataset to a new CSV file
df.to_csv('rentfaster_cleaned.csv', index=False)