In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
#import catboost
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [119]:
#reading df
df = pd.read_csv('no_outlier_df.csv',index_col='id')
df.shape

(29813, 23)

# Feautre Engineering

In [120]:
#setting out amenities to simplify the dataset and taking on important columns
amenity_col = list(df.filter(like='amenity').columns)
not_required_cols = ['property_description', 'property_overview', 'property_url', 'image_url']
df.drop(columns=amenity_col+not_required_cols,inplace=True,axis=1)
df.isna().sum()

area                  0
building_type         0
building_nature       0
num_bath_rooms        0
num_bed_rooms         0
price                 0
purpose               0
city                  0
locality              0
address            4680
division              1
zone                 80
dtype: int64

Now I am going to modify each feature using the insights given in task#3 EDA

### area

In [121]:
df['area'].describe()
#nothing to do here

count    29813.000000
mean      1657.573696
std       1215.077508
min         93.000000
25%       1050.000000
50%       1350.000000
75%       2000.000000
max      17000.000000
Name: area, dtype: float64

### building_type

1. Nearly 80% of our properties are `Apartment`, for a total of nearly 27000 samples. We also some `Office`, `Building`, `Shop`, `Floor`, `Residential Plot`, whose number are under 10% of the total dataset; that is to say, their numbers are under 2500. 
2. There are other types of properties, in a very negligible number.

❗ **Recommendation**:
* We are expecting our future models to perform well on `Apartment`, and to have and acceptable result for `Office`, `Building`, `Shop`, `Floor`, `Residential Plot`. They are expected to perform poorly on other types of properties.
* Types not part of (1) should be dropped in order to avoid noise in our future models.


In [122]:
req_building_type = ['Apartment','Office', 'Building', 'Shop', 'Floor', 'Residential Plot']
df = df[df['building_type'].isin(req_building_type)]

### building_nature

In [123]:
df['building_nature'].describe()
#nothing to do here

count           29637
unique              2
top       Residential
freq            23985
Name: building_nature, dtype: object

### num_bath_rooms & num_bed_rooms

In [124]:
df[['num_bath_rooms','num_bed_rooms']].describe()
#nothing to do here

Unnamed: 0,num_bath_rooms,num_bed_rooms
count,29637.0,29637.0
mean,1.660694,2.3168
std,1.55106,1.268307
min,0.0,0.0
25%,0.0,2.0
50%,2.0,3.0
75%,3.0,3.0
max,10.0,10.0


### price

In [125]:
df['price'].describe()
#non-linear relation was observed with every feature
#nothing to do here

count    2.963700e+04
mean     3.572601e+06
std      5.929573e+06
min      4.200000e+03
25%      2.500000e+04
50%      1.300000e+05
75%      6.000000e+06
max      1.200000e+08
Name: price, dtype: float64

### purpose

In [126]:
#nothing to do here
df['purpose'].describe()

count     29637
unique        2
top        Rent
freq      17727
Name: purpose, dtype: object

### city

1. Most of our properties are in `Dhaka`, for a total of nearly 28,000 properties. We also have nearly 4000 properties in `Chattogram`.     
1. A negligible amount of properties are in `Narayanganj City`, `Barishal`, `Gazipur`, each of them with a count below 500 properties ..
1. As for the other cities, their properties count is too insignificant.

❗ **Recommendation**:
* We are expecting our future models not to be perform well on cities mentioned in (2). We should consider dropping samples with those cities when building models since their low number will make it so that the models will not predict well on them. 
* Cities not part of (1) and (2) should definitively be dropped in order to avoid noise in our future models.

In [127]:
req_city = ['Dhaka','Chattogram','Narayanganj City', 'Barishal','Gazipur']
df = df[df['city'].isin(req_city)]
df.shape

(29346, 12)

### locality

In [128]:
#nothing to do here
df['locality'].describe()

count      29346
unique       160
top       Mirpur
freq        4966
Name: locality, dtype: object

### address

In [129]:
df['address'].value_counts()[:30].values
#there is error, bangladesh should not be included in a address

array([12731,   413,   212,   193,   184,   173,   164,   153,   152,
         150,   147,   141,   139,   139,   139,   127,   123,   120,
         119,   114,   111,   109,   108,   106,   105,   103,   101,
         100,    99,    99], dtype=int64)

In [130]:
df.replace('Bangladesh',np.nan, inplace=True)


In [131]:
len(df['address'].unique())

778

In [132]:
df['address'].isna().sum()

17350

In [133]:
#this feature doesn't see like a meaningfull for price prediction better droping as it may cause high dimensionality 
df.drop(columns='address', inplace=True)

### divison

In [134]:
df['division'].value_counts()
#nothing to do here

Dhaka         25780
Chattogram     3344
Barisal         222
Name: division, dtype: int64

### zone

In [135]:
df['zone'].isnull().sum()

80

In [136]:
df[df['zone'].isna()]['division'].value_counts()
#all missing zones belong to dhaka

Dhaka    80
Name: division, dtype: int64

In [137]:
#it won't be wrong to fill them with mode
df['zone'].fillna(df['zone'].mode(), inplace=True)

### Data Splitting

In [138]:
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42) 

### Handling Categorical Columns

In [139]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23476 entries, bproperty-10419 to bdhousing-162
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   area             23476 non-null  float64
 1   building_type    23476 non-null  object 
 2   building_nature  23476 non-null  object 
 3   num_bath_rooms   23476 non-null  float64
 4   num_bed_rooms    23476 non-null  float64
 5   price            23476 non-null  float64
 6   purpose          23476 non-null  object 
 7   city             23476 non-null  object 
 8   locality         23476 non-null  object 
 9   division         23476 non-null  object 
 10  zone             23409 non-null  object 
dtypes: float64(4), object(7)
memory usage: 2.1+ MB


It is observed that locality and zone have high no. of unique features.

Therefore, a distinct technique will be employed to handle features with fewer than 10 unique values as compared to those with more than 10.

=> using OneHotEncoder/get_dummies for less than 10 

=> using CatBoost encoding for greater than 10


In [140]:
#dividing the columns based on no. of unique values
cat_cols = list(train_set.select_dtypes(include=['object']).columns)
large_cat = ['zone', 'locality']
small_cat = [item for item in cat_cols if item not in large_cat]

#### OneHot_encoding/get_dummies (small features)

In [141]:
#small_cat columns encoded
encode_small_cat_df = pd.get_dummies(train_set[small_cat], drop_first=True)
encode_small_cat_df.shape

(23476, 13)

#### catboost encoding (large features)

In [142]:
#large_cat columns encoded
cat_boost_encoder = ce.CatBoostEncoder()
cat_boost_encoder.fit(train_set[large_cat],train_set['price'])

CatBoostEncoder(cols=['zone', 'locality'])

#### Creating encoded df

In [143]:
#adding cols of small encoded features
encoded_train_set = pd.concat([train_set,encode_small_cat_df],axis=1)
encoded_train_set.drop(columns=small_cat,inplace=True,axis=1)

In [144]:
#adding cols of large encoded feautures
encoded_train_set[['encoded_zone', 'encoded_locality']] = cat_boost_encoder.transform(train_set[large_cat])
encoded_train_set = encoded_train_set.drop(columns=['zone','locality'], axis=1)

In [145]:

encoded_train_set.head().T

id,bproperty-10419,pbazaar-2885,bproperty-9200,bproperty-4467,bproperty-13788
area,4500.0,2300.0,1684.0,2160.0,2600.0
num_bath_rooms,0.0,4.0,0.0,0.0,0.0
num_bed_rooms,0.0,3.0,3.0,0.0,0.0
price,55000000.0,130000.0,11000000.0,9000000.0,130000.0
building_type_Building,0.0,0.0,0.0,0.0,0.0
building_type_Floor,0.0,0.0,0.0,0.0,0.0
building_type_Office,0.0,0.0,0.0,0.0,1.0
building_type_Residential Plot,0.0,0.0,0.0,1.0,0.0
building_type_Shop,0.0,0.0,0.0,0.0,0.0
building_nature_Residential,0.0,1.0,1.0,1.0,0.0


In [146]:
encoded_train_set.shape

(23476, 19)

### Feature Scaling

In [147]:
X_train = encoded_train_set.drop(columns='price', axis=1)
y_train = encoded_train_set['price'].copy()

In [148]:
#Applying Standarization
s_scaler = StandardScaler()

s_scaler.fit(X_train)
X_train_scaled = s_scaler.transform(X_train)

### Dimensionality Reduction

In [149]:
# to be done

### Custom Transformers

In [150]:
# to be done

### Exporting ready data

In [151]:
#converting encoded data to DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled,columns=X_train.columns, index=X_train.index)

In [152]:
#new x_train
new_train_set = pd.concat([X_train_scaled_df,y_train],axis=1)
new_train_set.shape

(23476, 19)

##### this part will be done using tranformers, for now i m doing it manually

In [153]:
#encoding the test_set
encoded_small_test_set = pd.get_dummies(test_set[small_cat], drop_first=True)
encoded_test_set = pd.concat([test_set,encoded_small_test_set],axis=1)
encoded_test_set.drop(columns=small_cat,inplace=True,axis=1)
encoded_test_set[['encoded_zone', 'encoded_locality']] = cat_boost_encoder.transform(test_set[large_cat])
encoded_test_set = encoded_test_set.drop(columns=['zone','locality'], axis=1)

X_test = encoded_test_set.drop(columns='price', axis=1)
y_test = encoded_test_set['price'].copy()

X_test_scaled = s_scaler.transform(X_test)
#converting encoded data to DataFrame
X_test_scaled_df = pd.DataFrame(X_test_scaled,columns=X_test.columns, index=X_test.index)
new_test_set = pd.concat([X_test_scaled_df,y_test],axis=1)

In [154]:
ready_df = pd.concat([new_train_set,new_test_set])
print(ready_df.shape)
ready_df.to_csv('encoded_scaled_df_no_amenity.csv')

(29346, 19)


# Model Development
