## Initial understanding of the data, brainstorming for discussion at the meeting

In [2]:
# Load the core packages and print each version so the environment that
# produced the outputs below is recorded alongside them.
import sys # access to system parameters: https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd # data processing and analysis; DataFrames modeled after R with SQL-like features
print("pandas version: {}". format(pd.__version__))

import matplotlib # scientific and publication-ready visualization
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np # foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp # scientific computing and advanced mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display # pretty printing of DataFrames in the Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn # collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

# Miscellaneous standard-library helpers.
import random
import time


# Ignore warnings.
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from the libraries above; consider
# narrowing the filter once the notebook stabilizes.
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

Python version: 3.9.7 (default, Sep 16 2021, 08:50:36) 
[Clang 10.0.0 ]
pandas version: 1.4.1
matplotlib version: 3.4.3
NumPy version: 1.20.3
SciPy version: 1.7.1
IPython version: 7.29.0
scikit-learn version: 0.24.2
-------------------------


In [3]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

# Configure visualization defaults for every plot in this notebook.
# %matplotlib inline = render plots directly in the Jupyter Notebook browser output.
%matplotlib inline
# Base matplotlib style sheet.
mpl.style.use('ggplot')
# seaborn style is applied after the ggplot style sheet; presumably it
# overrides any settings the two have in common -- verify if plot styling looks off.
sns.set_style('white')
# Default figure size as (width, height) = (12, 8).
pylab.rcParams['figure.figsize'] = 12,8

In [7]:
# Input paths are named so they can be repointed in one place.
TRAIN_CSV = "dataset/train.csv"
# NOTE(review): the validation frame is read from the same file as the training
# frame, so data_val is an exact duplicate of data_raw (the null counts printed
# for both are identical). Confirm whether a separate held-out file was intended
# and, if so, change only this path.
VAL_CSV = "dataset/train.csv"

data_raw = pd.read_csv(TRAIN_CSV)
data_val = pd.read_csv(VAL_CSV)

# To experiment with the data we work on a deep copy: plain assignment would
# only bind a second name to the same DataFrame, so edits made through data1
# would also change data_raw.
# https://stackoverflow.com/questions/46327494/python-pandas-dataframe-copydeep-false-vs-copydeep-true-vs
data1 = data_raw.copy(deep=True)

# The list stores references to the two frames (not copies), so a cleaning
# step that loops over it can modify both frames in one pass.
data_cleaner = [data1, data_val]

# Preview the data. DataFrame.info() writes the summary to stdout itself and
# returns None, so it is called directly; wrapping it in print() appended a
# stray "None" line after the summary.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26048 entries, 0 to 26047
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   listing_id      26048 non-null  int64  
 1   name            26048 non-null  object 
 2   street          26048 non-null  object 
 3   type            26048 non-null  object 
 4   model           24768 non-null  object 
 5   market_segment  26048 non-null  object 
 6   type_of_area    26048 non-null  object 
 7   bedrooms        25644 non-null  object 
 8   bathrooms       24812 non-null  float64
 9   district        26048 non-null  int64  
 10  region          26048 non-null  object 
 11  planning_area   26048 non-null  object 
 12  subszone        26048 non-null  object 
 13  lat             26048 non-null  float64
 14  lng             26048 non-null  float64
 15  tenure          25761 non-null  object 
 16  built_year      16005 non-null  float64
 17  no_of_units     25006 non-null 

In [8]:
# Per-column missing-value counts for the working copy and the validation frame,
# each followed by a short divider line.
divider = "-"*10
train_null_counts = data1.isnull().sum()
val_null_counts = data_val.isnull().sum()

print('Train columns with null values:\n', train_null_counts)
print(divider)

print('Test/Validation columns with null values:\n', val_null_counts)
print(divider)

# Final expression of the cell: summary statistics for every column
# (numeric and non-numeric alike), rendered as the cell's rich output.
data_raw.describe(include='all')

Train columns with null values:
 listing_id            0
name                  0
street                0
type                  0
model              1280
market_segment        0
type_of_area          0
bedrooms            404
bathrooms          1236
district              0
region                0
planning_area         0
subszone              0
lat                   0
lng                   0
tenure              287
built_year        10043
no_of_units        1042
area_size             2
eco_category          0
accessibility         0
date_listed           0
price                 0
dtype: int64
----------
Test/Validation columns with null values:
 listing_id            0
name                  0
street                0
type                  0
model              1280
market_segment        0
type_of_area          0
bedrooms            404
bathrooms          1236
district              0
region                0
planning_area         0
subszone              0
lat                   0
lng         

Unnamed: 0,listing_id,name,street,type,model,market_segment,type_of_area,bedrooms,bathrooms,district,...,lat,lng,tenure,built_year,no_of_units,area_size,eco_category,accessibility,date_listed,price
count,26048.0,26048,26048,26048,24768,26048,26048,25644.0,24812.0,26048.0,...,26048.0,26048.0,25761,16005.0,25006.0,26046.0,26048,26048,26048,26048.0
unique,,1650,3442,2,14,1,1,17.0,,,...,,,53,,,,1,1,359,
top,,marina one residences,23 marina way,apartment,apartment,ocr,strata,3.0,,,...,,,leasehold/99 years,,,,uncategorized,guarded,2021-10-12,
freq,,668,274,13958,13306,26048,26048,7306.0,,,...,,,15300,,,,26048,26048,1728,
mean,5495573.0,,,,,,,,2.278091,12.211609,...,1.321145,103.846138,,2010.926398,462.350636,1248.902096,,,,2994669.0
std,2608064.0,,,,,,,,1.147204,7.020134,...,0.038569,0.049865,,11.915875,423.476259,1066.10202,,,,4324294.0
min,1000122.0,,,,,,,,1.0,1.0,...,1.239337,103.696215,,1799.0,1.0,226.0,,,,556600.0
25%,3229224.0,,,,,,,,1.0,7.0,...,1.296472,103.820262,,2010.0,130.0,678.0,,,,1331000.0
50%,5486808.0,,,,,,,,2.0,11.0,...,1.313384,103.843081,,2014.0,366.0,958.0,,,,1851800.0
75%,7764926.0,,,,,,,,3.0,18.0,...,1.34121,103.879673,,2016.0,646.0,1356.0,,,,2948000.0


## unique values

A variable with only one unique value carries no information, so the columns that `nunique` reports as having a single value can be discarded: market_segment, type_of_area, eco_category, accessibility.

In [9]:
# Number of distinct values in each column (by pandas' default, NaN is not
# counted). Columns that report 1 are constant and are candidates for removal.
data_raw.nunique()

listing_id        26048
name               1650
street             3442
type                  2
model                14
market_segment        1
type_of_area          1
bedrooms             17
bathrooms             9
district             27
region                5
planning_area        38
subszone            193
lat                3118
lng                3118
tenure               53
built_year           60
no_of_units         526
area_size          1178
eco_category          1
accessibility         1
date_listed         359
price              4704
dtype: int64

In [10]:
# Columns that hold a single distinct value in the training data and therefore
# carry no information for analysis or modelling.
CONSTANT_COLUMNS = ["market_segment", "type_of_area", "eco_category", "accessibility"]

# Reassign instead of dropping in place, and ignore columns that are already
# gone, so this cell can be re-run safely. The in-place version raised KeyError
# on a second run because the columns no longer existed.
# Trade-off: a misspelled name in CONSTANT_COLUMNS is skipped silently.
data_raw = data_raw.drop(columns=CONSTANT_COLUMNS, errors="ignore")

## observe the value of each variable

In [22]:
# Collect the current column labels of data_raw into a plain list and show it.
var_ls = [column_label for column_label in data_raw.columns]
var_ls

['listing_id',
 'name',
 'street',
 'type',
 'model',
 'bedrooms',
 'bathrooms',
 'district',
 'region',
 'planning_area',
 'subszone',
 'lat',
 'lng',
 'tenure',
 'built_year',
 'no_of_units',
 'area_size',
 'date_listed',
 'price']

In [28]:
# Print the distinct values of every column, one line per variable, to get a
# feel for each field's content and formatting.
line_template = "variable {0} : {1}"
for i in var_ls:
    distinct_values = data_raw[i].unique()
    print(line_template.format(i, distinct_values))

variable listing_id : [6998418 2046604 7563513 ... 2402528 8150354 8675194]
variable name : ['seascape' 'la maison' 'viva' ... 'kembangan plaza' 'the grandhill'
 'beauty world centre']
variable street : ['57 cove way ' '10 moulmein rise ' '2 suffolk walk ' ... '71 oxley rise '
 '301 jalan bukit ho swee ' '12b cairnhill rise ']
variable type : ['condominium' 'apartment']
variable model : ['condominium' 'apartment' 'executive condominium' nan 'walk-up apt'
 'townhouse' 'soho' 'penthouse' 'duplex' 'high rise'
 'residential with commercial on level 1' 'with pool' 'low rise'
 'shophouse' 'strata terrace']
variable bedrooms : ['3' '4' '2' '1' '3+1' '1+1' '2+1' nan '5' '4+1' '7' '5+1' '6' '3+2' '2+2'
 '9' '8' '4+2']
variable bathrooms : [ 4.  3.  2.  1. nan  5.  6.  7.  8. 10.]
variable district : [ 4 11 14 15 27 23 19  7  2 10 18 16  1  6  3  5  9 28 13 21 17  8 22 12
 20 25 26]
variable region : ['central region' 'east region' 'north region' 'west region'
 'north-east region']
variable plan

## Related thinking (discussed in the meeting)

1）

'type' is a nominal variable and can be one-hot encoded

'model' is a subdivision of 'type', what does the relationship look like?

There is 'nan' in it that needs to be completed

2）

'bedrooms' has values such as '3+1' and '4+1'; we need to find out what the '+1' means before converting the column to a number

3）

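'bathrooms' is of type float and needs to be changed to type int. The column contains missing values, so either fill them first or use pandas' nullable `Int64` dtype; a plain int cast fails on NaN.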

4）

'tenure' has a value like '956 years from 27/05/1928', which needs to be processed to become a value like '999 years'. Segmentation can then be performed.

5）

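'built_year' is of type float and needs to be changed to type int. It has many missing values, so either fill them first or use pandas' nullable `Int64` dtype; a plain int cast fails on NaN.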

6）

'no_of_units' describes how big the apartment development is; can it be clustered or segmented?

7）

The value '2021-05-04' in 'date_listed' needs to be changed to timestamp format

8)

For missing data, you can try to get the value through the correspondence of other variables. For example, through the name of the community, the type of apartment, and the corresponding year of construction.