In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("../../../Data/Dataset_splited.csv")

### Get features categorie

In [3]:
def categorisation(data,date_cols,target="price"):
    distance = []
    date = []
    numerique = []
    categorique = []
    for col in data.columns:
        if data[col].dtypes == np.object:
            try:
                if str(type(eval(data[col][data[col].first_valid_index()]))) == "<class 'list'>":
                    distance.append(col)
                elif col in date_cols:
                    date.append(col)
                else:
                    categorique.append(col)
            except:
                if col in date_cols:
                    date.append(col)
                else:
                    categorique.append(col)
                
            
        else :
            if col != target:
                numerique.append(col)
                
    return dict({"numerique":numerique,"categorique":categorique,"date":date,"distance":distance})

feat_cat = categorisation(data,['listing_date'])

In [4]:
feat_cat

{'numerique': ['bathrooms',
  'bedrooms',
  'erf_size',
  'floor_size',
  'garages',
  'listing_number',
  'pet_friendly',
  'garden',
  'pool',
  'secure_parking',
  'parking',
  'no_pets_allowed',
  'furnished',
  'flatlet',
  'fibre_internet',
  'furnished_(optional)',
  'adsl_internet',
  'satellite_internet',
  'fixed_wimax_internet',
  'split'],
 'categorique': ['town', 'type_of_property'],
 'date': ['listing_date'],
 'distance': []}

### Fill by 0 those features (we suppose missing == 0) except 'bathrooms' and 'bedrooms'

In [5]:
_feat = [
  'garages',
  'pet_friendly',
  'garden',
  'pool',
  'secure_parking',
  'parking',
  'no_pets_allowed',
  'furnished',
  'flatlet',
  'fibre_internet',
  'furnished_(optional)',
  'adsl_internet',
  'satellite_internet',
  'fixed_wimax_internet']

In [6]:
for c in _feat:
    print(data[c].unique())
    data[c].fillna(0,inplace=True)

[ nan  1.   2.   3.   4.   6.   7.   5.  10.  20.   8.   1.5]
[nan  1.]
[nan  1.]
[nan  1.]
[nan  2.  1.  6.  4.  8.  3. 10. 15.  5. 12.  9. 11.]
[nan  1.  3.  2.  6.  5.  4. 30.  7.  8. 10.  9. 11. 13. 12. 50.]
[nan  1.]
[nan  1.]
[nan  1.]
[nan  1.]
[nan  1.]
[nan  1.]
[nan  1.]
[nan  1.]


In [7]:
for c in ['bathrooms','bedrooms']:
    print(data[c].unique())
    data[c].fillna(1,inplace=True)

[ 1.   2.   2.5  5.   4.5  1.5  7.5  3.   4.   3.5 12.   5.5  nan  7.
  6.5  6.   0.5 30.   9.  18.  11.  15.   8.   9.5  8.5 10.  25.  13.
 14. ]
[ 2.   nan  3.   4.   1.   6.   7.   8.   5.  12.   0.5  1.5  9.  30.
 11.  16.  15.  10.   2.5  3.5 18.   4.5 13.  33.  14. ]


### target mean ordering encoding of categorical value "type_of_property" based on train dataset

In [8]:


def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = frame[[feature, 'price']].groupby(feature).mean()['price']
    #print(frame[[feature, 'a']].groupby(feature).mean()['a'])
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    #print(ordering)
    ordering = ordering['ordering'].to_dict()
    #print("Encoding results : ",ordering)
    
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o
        
    return ordering

#tmp = pd.DataFrame({"a":[1,2,5,10],"b":["1","1","o","o"]})

qual_encoded = []
train = data[data["split"]==0].copy()
for q in ['type_of_property']:  
    qual_encoded.append(encode(train, q))

##We can see it follow the distribution order from boxplot in advanced EDA

#### Then encode the entire dataset

data["type_of_property"] = data["type_of_property"].replace(qual_encoded[0])


data["type_of_property"].unique()

array([1, 2, 3])

In [9]:
data["type_of_property"] = data["type_of_property"].astype("float")

### use a simple linear regressor to fill size column

In [10]:
rest_of_num = ['erf_size','floor_size']

In [11]:
X_floor = data[data['floor_size'] > 0.][['bathrooms','bedrooms','garages','garden','pool','type_of_property']]
y_floor = data[data['floor_size'] > 0.]['floor_size']
y_floor.shape

(5841,)

In [12]:
X_erf = data[data['erf_size'] > 0.][['bathrooms','bedrooms','garages','garden','pool','type_of_property']]
y_erf = data[data['erf_size'] > 0.]['erf_size']
X_erf.shape

(3595, 6)

In [13]:
print("size of row to fill (floor and size)")
X_floor_test = data[data['floor_size'].astype("str") == "nan"][['bathrooms','bedrooms','garages','garden','pool','type_of_property']]
y_floor_test = data[data['floor_size'].astype("str") == "nan"]['floor_size']
print(y_floor_test.shape)

X_erf_test = data[data['erf_size'].astype("str") == "nan"][['bathrooms','bedrooms','garages','garden','pool','type_of_property']]
y_erf_test = data[data['erf_size'].astype("str") == "nan"]['erf_size']
print(y_erf_test.shape)

size of row to fill (floor and size)
(2005,)
(4251,)


#### floor_size prediction

In [14]:
xf_train, xf_val,  yf_train, yf_val = train_test_split(X_floor,y_floor,test_size=0.2,random_state=2)
yf_train = np.log(yf_train)
yf_val = np.log(yf_val)

mod = RidgeCV(alphas=[ 0.1       ,  0.16681005,  0.27825594,  0.46415888,  0.77426368,
        1.29154967,  2.15443469,  3.59381366,  5.9948425 , 10.        ])

### Evaluate on a val set
mod.fit(xf_train,yf_train)
yf_val = np.array(yf_val).reshape(-1, 1)
rmse = np.sqrt(mean_squared_error(yf_val, mod.predict(xf_val)))
print("The Root mean squared error (RMSE) onval set: {:.4f}".format(rmse))

###Retrain with all floor data != nan
mod.fit(X_floor,np.log(y_floor))
y_floor_test = mod.predict(X_floor_test)
print(len(y_floor_test))

idx = data[data['floor_size'].astype("str") == "nan"]['floor_size'].index
data.loc[data['floor_size'].astype("str") == "nan", 'floor_size']= np.exp(y_floor_test)

The Root mean squared error (RMSE) onval set: 0.8005
2005


#### erf_size prediction

In [15]:
xf_train, xf_val,  yf_train, yf_val = train_test_split(X_erf,y_erf,test_size=0.2,random_state=2)
yf_train = np.log(yf_train)
yf_val = np.log(yf_val)
xf_train = (xf_train-xf_train.mean())/xf_train.std()
xf_val = (xf_val-xf_val.mean())/xf_val.std()

mod = RidgeCV(alphas=[ 0.1       ,  0.16681005,  0.27825594,  0.46415888,  0.77426368,
        1.29154967,  2.15443469,  3.59381366,  5.9948425 , 10.        ])

mod.fit(xf_train,yf_train)
yf_val = np.array(yf_val).reshape(-1, 1)
rmse = np.sqrt(mean_squared_error(yf_val, mod.predict(xf_val)))
print("The Root mean squared error (RMSE) onval set: {:.4f}".format(rmse))

###Retrain with all erf size data != nan
mod.fit(X_erf,np.log(y_erf))
y_erf_test = mod.predict(X_erf_test)
print(len(y_erf_test))

idx = data[data['erf_size'].astype("str") == "nan"]['erf_size'].index
data.loc[data['erf_size'].astype("str") == "nan", 'erf_size']= np.exp(y_erf_test)

The Root mean squared error (RMSE) onval set: 1.2659
4251


### Listing date preprocessing
Get year

In [16]:
data['listing_date'] = [float(v.year) for v in pd.to_datetime(data['listing_date'])]

In [17]:
data['listing_date'].unique()

array([2020., 2019., 2018., 2017., 2016., 2015.])

### Town feature preprocess -> get only the district (quartier)

In [18]:
series = []
for v in data["town"]:
    v = v.split(',')
    v.pop()
    series.append(v.pop())
    
data["town"] = series

### We will use cat boost categorical features default handler for the town cases

#### drop listing_number 'cause just index value of property 

In [19]:
data = data.drop(['listing_number'],axis=1)

### Standard scale continous numeric features

In [20]:
to_scale = ['erf_size',  'floor_size'] 

#Get mean and deviation of train
#mean = data[data["split"]==0][to_scale].mean()
#std = data[data["split"]==0][to_scale].std()

#data[to_scale] = data[to_scale]-mean
#data[to_scale] = data[to_scale]/std

# log transform
#data[to_scale]=np.log(data[to_scale])
#data[to_scale]=data[to_scale].replace(-np.inf,-1)
# data[to_scale]

In [21]:
data.isna().sum()

bathrooms               0
bedrooms                0
erf_size                0
floor_size              0
garages                 0
listing_date            0
pet_friendly            0
price                   0
town                    0
type_of_property        0
garden                  0
pool                    0
secure_parking          0
parking                 0
no_pets_allowed         0
furnished               0
flatlet                 0
fibre_internet          0
furnished_(optional)    0
adsl_internet           0
satellite_internet      0
fixed_wimax_internet    0
split                   0
dtype: int64

### Drop outliers

### par IQR 

$$ IQR = Q3 - Q1 $$
$$ interval = [Q1 - seuil * IQR, Q3 + seuil * IQR]$$

In [22]:
def IQR(df,cols,quantiles=[0.25,0.75],seuil=1.5):
    Q1 = df[cols].quantile(quantiles[0])
    Q3 = df[cols].quantile(quantiles[1])
    IQR = Q3 - Q1

    return df[~((df[cols] < (Q1 - seuil * IQR)) |(df[cols] > (Q3 + seuil * IQR))).any(axis=1)]

In [23]:
for col in data.columns:
    print(data[col].value_counts())

1.0     3357
2.0     2145
3.0      706
2.5      343
4.0      328
3.5      259
1.5      231
5.0      139
4.5      113
6.0       67
5.5       50
7.0       25
8.0       21
6.5       15
7.5       10
9.0        8
10.0       5
12.0       4
11.0       4
15.0       3
14.0       3
0.5        2
18.0       2
8.5        2
13.0       1
9.5        1
25.0       1
30.0       1
Name: bathrooms, dtype: int64
2.0     2562
3.0     1734
1.0     1722
4.0      903
5.0      388
0.5      208
6.0      162
7.0       56
8.0       32
9.0       21
10.0      10
12.0       9
2.5        8
1.5        7
11.0       7
13.0       4
16.0       3
3.5        3
15.0       2
4.5        1
33.0       1
18.0       1
30.0       1
14.0       1
Name: bedrooms, dtype: int64
85.228615      670
95.754172      337
100.083695     296
130.748510     242
153.537565     172
              ... 
1183.000000      1
1304.000000      1
5027.000000      1
1291.000000      1
7357.000000      1
Name: erf_size, Length: 1562, dtype: int64
45.283221    

In [24]:
to_check = [
    "parking","secure_parking","town","garages","floor_size","bedrooms","erf_size","bathrooms"
    
]

In [25]:
data_out = IQR(data,to_check,quantiles=[0.0001,0.90])
data_out.shape

  return df[~((df[cols] < (Q1 - seuil * IQR)) |(df[cols] > (Q3 + seuil * IQR))).any(axis=1)]
  return df[~((df[cols] < (Q1 - seuil * IQR)) |(df[cols] > (Q3 + seuil * IQR))).any(axis=1)]


(6814, 23)

### data saving

In [26]:
data_out.to_csv("../../../Data/Dataset_splited_v2.csv",index=False)