## LogisticRegression - HDB_Database Example (resale price)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

### Load data file without null values

In [34]:
data=pd.read_csv('HDB_database.csv').dropna()
data

Unnamed: 0,month,town,flat_type,addresses,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,searchval,X,Y,lat,long,distanceWithMrt,distanceWithRaffles,distanceWithGdPri
0,20-Aug,ANG MO KIO,3 ROOM,331 ANG MO KIO AVE 1,01 TO 03,68.0,New Generation,1981,59 years 06 months,274000.0,TECK GHEE VIEW,29941.74561,38240.88157,1.362111,103.850767,881.003375,8703.096482,1041.995726
1,20-Aug,ANG MO KIO,3 ROOM,333 ANG MO KIO AVE 1,07 TO 09,68.0,New Generation,1981,59 years 06 months,315000.0,TECK GHEE VIEW,30045.48338,38155.86680,1.361343,103.851699,985.490578,8617.203081,1048.316784
2,20-Aug,ANG MO KIO,3 ROOM,307C ANG MO KIO AVE 1,16 TO 18,70.0,Model A,2012,91 years 01 month,500000.0,TECK GHEE VISTA,29318.86604,38622.31673,1.365561,103.845169,689.479121,9113.711978,1199.433818
3,20-Aug,ANG MO KIO,3 ROOM,331 ANG MO KIO AVE 1,01 TO 03,82.0,New Generation,1981,59 years 05 months,315000.0,TECK GHEE VIEW,29941.74561,38240.88157,1.362111,103.850767,881.003375,8703.096482,1041.995726
4,20-Aug,ANG MO KIO,3 ROOM,471 ANG MO KIO AVE 10,04 TO 06,67.0,New Generation,1979,58 years,235000.0,TECK GHEE HORIZON,30602.40075,38390.67192,1.363466,103.856703,1072.597799,8872.728560,1542.266971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2432,20-Aug,YISHUN,EXECUTIVE,258 YISHUN ST 22,01 TO 03,154.0,Maisonette,1985,63 years 09 months,530000.0,HDB-YISHUN,28721.75915,46317.74732,1.435156,103.839804,830.455528,16883.935510,377.031439
2433,20-Aug,YISHUN,EXECUTIVE,611 YISHUN ST 61,04 TO 06,146.0,Maisonette,1987,66 years 03 months,598000.0,NEE SOON CENTRAL MEADOWS,28315.41791,44664.11187,1.420201,103.836153,472.288614,15264.628720,530.766169
2434,20-Aug,YISHUN,EXECUTIVE,643 YISHUN ST 61,01 TO 03,142.0,Apartment,1987,66 years 01 month,510000.0,HDB-YISHUN,28458.31899,44789.53725,1.421335,103.837437,662.983861,15374.950720,373.478202
2435,20-Aug,YISHUN,EXECUTIVE,724 YISHUN ST 71,07 TO 09,142.0,Apartment,1986,64 years 11 months,555000.0,KHATIB SPRING,27621.33645,45357.85481,1.426475,103.829916,655.684638,16047.820670,1340.339325


#### Select useful columns

In [96]:
df = data[['flat_type','floor_area_sqm','lease_commence_date',
           'distanceWithMrt','distanceWithRaffles','distanceWithGdPri','resale_price']]
df

Unnamed: 0,flat_type,floor_area_sqm,lease_commence_date,distanceWithMrt,distanceWithRaffles,distanceWithGdPri,resale_price
0,3 ROOM,68.0,1981,881.003375,8703.096482,1041.995726,274000.0
1,3 ROOM,68.0,1981,985.490578,8617.203081,1048.316784,315000.0
2,3 ROOM,70.0,2012,689.479121,9113.711978,1199.433818,500000.0
3,3 ROOM,82.0,1981,881.003375,8703.096482,1041.995726,315000.0
4,3 ROOM,67.0,1979,1072.597799,8872.728560,1542.266971,235000.0
...,...,...,...,...,...,...,...
2432,EXECUTIVE,154.0,1985,830.455528,16883.935510,377.031439,530000.0
2433,EXECUTIVE,146.0,1987,472.288614,15264.628720,530.766169,598000.0
2434,EXECUTIVE,142.0,1987,662.983861,15374.950720,373.478202,510000.0
2435,EXECUTIVE,142.0,1986,655.684638,16047.820670,1340.339325,555000.0


In [97]:
# convert the flat_type to integer category
df=df.replace(['2 ROOM','3 ROOM','4 ROOM','5 ROOM','EXECUTIVE'],[2,3,4,5,0])
df

Unnamed: 0,flat_type,floor_area_sqm,lease_commence_date,distanceWithMrt,distanceWithRaffles,distanceWithGdPri,resale_price
0,3,68.0,1981,881.003375,8703.096482,1041.995726,274000.0
1,3,68.0,1981,985.490578,8617.203081,1048.316784,315000.0
2,3,70.0,2012,689.479121,9113.711978,1199.433818,500000.0
3,3,82.0,1981,881.003375,8703.096482,1041.995726,315000.0
4,3,67.0,1979,1072.597799,8872.728560,1542.266971,235000.0
...,...,...,...,...,...,...,...
2432,0,154.0,1985,830.455528,16883.935510,377.031439,530000.0
2433,0,146.0,1987,472.288614,15264.628720,530.766169,598000.0
2434,0,142.0,1987,662.983861,15374.950720,373.478202,510000.0
2435,0,142.0,1986,655.684638,16047.820670,1340.339325,555000.0


In [104]:
# divide resale_price into three categories, 0, 1 and 2
def myfunction(x):
    if x <= 375000:
        return 0
    elif x <= 500000:
        return 1
    else:
        return 2
    
df["resale_price"] = df["resale_price"].apply(myfunction)
df

Unnamed: 0,flat_type,floor_area_sqm,lease_commence_date,distanceWithMrt,distanceWithRaffles,distanceWithGdPri,resale_price
0,3,68.0,1981,881.003375,8703.096482,1041.995726,0
1,3,68.0,1981,985.490578,8617.203081,1048.316784,0
2,3,70.0,2012,689.479121,9113.711978,1199.433818,0
3,3,82.0,1981,881.003375,8703.096482,1041.995726,0
4,3,67.0,1979,1072.597799,8872.728560,1542.266971,0
...,...,...,...,...,...,...,...
2432,0,154.0,1985,830.455528,16883.935510,377.031439,0
2433,0,146.0,1987,472.288614,15264.628720,530.766169,0
2434,0,142.0,1987,662.983861,15374.950720,373.478202,0
2435,0,142.0,1986,655.684638,16047.820670,1340.339325,0


In [99]:
X = df.iloc[:,0:5]
y = df['resale_price']
X.head()

Unnamed: 0,flat_type,floor_area_sqm,lease_commence_date,distanceWithMrt,distanceWithRaffles
0,3,68.0,1981,881.003375,8703.096482
1,3,68.0,1981,985.490578,8617.203081
2,3,70.0,2012,689.479121,9113.711978
3,3,82.0,1981,881.003375,8703.096482
4,3,67.0,1979,1072.597799,8872.72856


### Train our model

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,flat_type,floor_area_sqm,lease_commence_date,distanceWithMrt,distanceWithRaffles
670,3,68.0,1980,172.354354,10194.05193
886,5,112.0,2014,1035.30944,11626.01961
2035,0,146.0,1988,532.40337,12335.79835
1221,4,104.0,1996,1698.886212,13781.13196
1078,5,110.0,2001,761.49783,18151.46116


In [101]:
lr = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', random_state = 42)
result = lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [102]:
y_pred = lr.predict(X_test)
y_pred

array([1, 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 1, 1, 2, 2, 0,
       1, 1, 2, 2, 0, 0, 1, 2, 2, 2, 0, 0, 2, 0, 0, 1, 2, 2, 2, 1, 2, 1,
       0, 0, 2, 2, 2, 2, 0, 0, 1, 2, 1, 0, 2, 2, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 2, 1, 1, 2, 2, 0, 0, 1, 1, 2, 1, 1,
       0, 1, 0, 0, 2, 2, 1, 0, 1, 0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 1, 0, 2,
       2, 0, 0, 0, 1, 0, 2, 2, 1, 1, 1, 1, 1, 0, 2, 0, 2, 2, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 1, 1, 0, 1, 1, 1, 0,
       2, 0, 1, 0, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 0, 2, 1, 2,
       1, 1, 0, 1, 0, 0, 1, 0, 2, 2, 2, 1, 0, 1, 1, 0, 0, 1, 2, 1, 0, 1,
       1, 0, 2, 0, 0, 0, 2, 0, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 0,
       0, 0, 2, 0, 0, 1, 1, 0, 1, 1, 1, 2, 1, 1, 0, 1, 0, 1, 2, 1, 2, 2,
       1, 0, 1, 1, 2, 0, 1, 0, 2, 0, 1, 2, 1, 2, 0, 0, 0, 0, 2, 1, 0, 2,
       1, 0, 2, 1, 2, 2, 0, 2, 1, 0, 1, 0, 1, 0, 2, 2, 1, 1, 0, 0, 2, 0,
       1, 2, 2, 2, 0, 2, 0, 0, 1, 0, 1, 0, 2, 0, 1,

In [103]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.6475409836065574


In [95]:
print(lr.predict([[4,150,2005,1000,10000,800]]))

ValueError: X has 6 features per sample; expecting 5

### Experimentation - choosing only two features