In [1]:
# import the pandas library
import pandas as pd
import numpy as np

In [2]:
# Load the apartment listings file; it is semicolon-delimited and is decoded as cp1252.
df = pd.read_csv(
    "apartments_for_rent_classified_10K.csv",
    sep=";",
    encoding="cp1252",
)


In [3]:
# Show (number of samples, number of attributes) for the loaded frame.
print((len(df.index), len(df.columns)))

(10000, 22)


In [4]:
# Collect the column labels into a plain Python list and display them.
columns = df.columns.tolist()
print(columns)

['id', 'category', 'title', 'body', 'amenities', 'bathrooms', 'bedrooms', 'currency', 'fee', 'has_photo', 'pets_allowed', 'price', 'price_display', 'price_type', 'square_feet', 'address', 'cityname', 'state', 'latitude', 'longitude', 'source', 'time']


In [5]:
# Preview the first five rows of the dataset.
print(df.head(n=5))

           id                category   
0  5668626895  housing/rent/apartment  \
1  5664597177  housing/rent/apartment   
2  5668626833  housing/rent/apartment   
3  5659918074  housing/rent/apartment   
4  5668626759  housing/rent/apartment   

                                               title   
0  Studio apartment 2nd St NE, Uhland Terrace NE,...  \
1                  Studio apartment 814 Schutte Road   
2  Studio apartment N Scott St, 14th St N, Arling...   
3                     Studio apartment 1717 12th Ave   
4  Studio apartment Washington Blvd, N Cleveland ...   

                                                body amenities  bathrooms   
0  This unit is located at second St NE, Uhland T...        10        5.0  \
1  This unit is located at 814 Schutte Road, Evan...         5        5.0   
2  This unit is located at N Scott St, 14th St N,...         5        1.0   
3  This unit is located at 1717 12th Ave, Seattle...         7        1.0   
4  This unit is located at Wash

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) as produced by describe().
numeric_summary = df.describe()
print(numeric_summary)

                 id     bathrooms      bedrooms         price   square_feet   
count  1.000000e+04  10000.000000  10000.000000  10000.000000  10000.000000  \
mean   5.623396e+09      1.392850      1.746300   1486.277500    945.810500   
std    7.021025e+07      0.649489      0.945952   1076.507968    655.755736   
min    5.508654e+09      1.000000      0.000000    200.000000    101.000000   
25%    5.509248e+09      1.000000      1.000000    949.000000    649.000000   
50%    5.668610e+09      1.000000      2.000000   1270.000000    802.000000   
75%    5.668626e+09      2.000000      2.000000   1695.000000   1100.000000   
max    5.668663e+09      8.500000      9.000000  52500.000000  40000.000000   

           latitude     longitude          time  
count  10000.000000  10000.000000  1.000000e+04  
mean      37.662467    -94.552595  1.574891e+09  
std        5.589471     16.063769  3.762395e+06  
min        5.000000   -158.022100  1.568744e+09  
25%       33.679500   -101.301700  1.5

In [7]:
# Work on an independent copy of the loaded data. A plain `new_df = df` only
# binds a second name to the same DataFrame object, so every later cleaning
# step on `new_df` would also modify `df`.
new_df = df.copy()

# Number of missing values in each column.
print(new_df.isnull().sum())

# Fraction of missing values in each column (mean of the boolean null mask).
print("Missing values distribution: ")
print(new_df.isnull().mean())

id                  0
category            0
title               0
body                0
amenities           0
bathrooms           0
bedrooms            0
currency            0
fee                 0
has_photo           0
pets_allowed     2415
price               0
price_display       0
price_type          0
square_feet         0
address             0
cityname            0
state               0
latitude            0
longitude           0
source              0
time                0
dtype: int64
Missing values distribution: 
id               0.0000
category         0.0000
title            0.0000
body             0.0000
amenities        0.0000
bathrooms        0.0000
bedrooms         0.0000
currency         0.0000
fee              0.0000
has_photo        0.0000
pets_allowed     0.2415
price            0.0000
price_display    0.0000
price_type       0.0000
square_feet      0.0000
address          0.0000
cityname         0.0000
state            0.0000
latitude         0.0000
longitude        

In [8]:
# Check for fully duplicated rows: overall flag first, then the per-row mask,
# then the frame's shape for reference.
duplicate_mask = new_df.duplicated()
print(duplicate_mask.any())
print(duplicate_mask)
print(new_df.shape)

False
0       False
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Length: 10000, dtype: bool
(10000, 22)


In [9]:
# Heading line followed by the dtype of every column.
print("Column datatypes: ", new_df.dtypes, sep="\n")

Column datatypes: 
id                 int64
category          object
title             object
body              object
amenities         object
bathrooms        float64
bedrooms           int64
currency          object
fee               object
has_photo         object
pets_allowed      object
price              int64
price_display     object
price_type        object
square_feet        int64
address           object
cityname          object
state             object
latitude         float64
longitude        float64
source            object
time               int64
dtype: object


In [10]:
# Strip the currency symbol and thousands separators from "price_display".
# Series.replace() compares whole cell values and expects an old -> new
# mapping (or a to_replace/value pair); the set {'$', ','} is neither, so the
# previous call never removed these characters. .str.replace with a regex
# character class edits the text inside each cell, and assigning the result
# back to the column avoids an in-place call on a column selection.
new_df["price_display"] = new_df["price_display"].str.replace(r"[$,]", "", regex=True)
# NOTE(review): the column stays string-typed here; cast with pd.to_numeric
# only after confirming no value carries extra text besides the digits.
print(new_df.head())
print(new_df.shape)

           id                category   
0  5668626895  housing/rent/apartment  \
1  5664597177  housing/rent/apartment   
2  5668626833  housing/rent/apartment   
3  5659918074  housing/rent/apartment   
4  5668626759  housing/rent/apartment   

                                               title   
0  Studio apartment 2nd St NE, Uhland Terrace NE,...  \
1                  Studio apartment 814 Schutte Road   
2  Studio apartment N Scott St, 14th St N, Arling...   
3                     Studio apartment 1717 12th Ave   
4  Studio apartment Washington Blvd, N Cleveland ...   

                                                body amenities  bathrooms   
0  This unit is located at second St NE, Uhland T...        10        5.0  \
1  This unit is located at 814 Schutte Road, Evan...         5        5.0   
2  This unit is located at N Scott St, 14th St N,...         5        1.0   
3  This unit is located at 1717 12th Ave, Seattle...         7        1.0   
4  This unit is located at Wash

In [11]:
# List the distinct values found in the "price_type" column.
print(pd.unique(new_df["price_type"]))

['Monthly' 'Weekly' 'Monthly|Weekly']


In [12]:
# Recode the three "price_type" labels. The replacement values are strings
# ('1/12', '1/4', '2'), not integers, so the column keeps an object dtype.
# NOTE(review): the intended meaning of these codes is not shown here - confirm.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place
# form is deprecated in recent pandas and does not update the DataFrame
# under Copy-on-Write.
new_df["price_type"] = new_df["price_type"].replace(
    {'Monthly': '1/12', 'Weekly': '1/4', 'Monthly|Weekly': '2'}
)
print(new_df.head())

           id                category   
0  5668626895  housing/rent/apartment  \
1  5664597177  housing/rent/apartment   
2  5668626833  housing/rent/apartment   
3  5659918074  housing/rent/apartment   
4  5668626759  housing/rent/apartment   

                                               title   
0  Studio apartment 2nd St NE, Uhland Terrace NE,...  \
1                  Studio apartment 814 Schutte Road   
2  Studio apartment N Scott St, 14th St N, Arling...   
3                     Studio apartment 1717 12th Ave   
4  Studio apartment Washington Blvd, N Cleveland ...   

                                                body amenities  bathrooms   
0  This unit is located at second St NE, Uhland T...        10        5.0  \
1  This unit is located at 814 Schutte Road, Evan...         5        5.0   
2  This unit is located at N Scott St, 14th St N,...         5        1.0   
3  This unit is located at 1717 12th Ave, Seattle...         7        1.0   
4  This unit is located at Wash