In [82]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest


In [83]:
def processData(path: str) -> pd.DataFrame:
    """Load a resale-transactions CSV and add the engineered modelling columns.

    The file is read with ``pd.read_csv`` and must contain the columns
    ``month``, ``lease_commence_date``, ``flat_model`` and ``storey_range``.

    Columns added:
        age: whole years between the sale year and ``lease_commence_date``,
            floored at 0.
        general_model: ``flat_model`` collapsed into coarser groups.
        storey: integer midpoint of the two bounds in ``storey_range``.
        sold_year: the sale year as a 4-digit string.

    After the columns are added, every column that contains at least one
    missing value is dropped.

    Args:
        path: Location of the CSV file (anything ``pd.read_csv`` accepts).

    Returns:
        A new DataFrame with ``month`` parsed to datetimes, the columns above
        added, and NaN-containing columns removed.
    """

    def classifyStorey(storey_range):
        # Expects text such as "04 TO 06": the first two and the last two
        # characters are the lower and upper storey; return their midpoint.
        start = int(storey_range[:2])
        end = int(storey_range[-2:])
        return (start + end)//2

    def classifyModel(model):
        # 'Maisonette' and 'Generation' names share one group; after that the
        # first matching keyword wins. Unmatched names pass through unchanged.
        if 'Maisonette' in model or 'Generation' in model:
            return 'Maisonette'
        for keyword in ('Type', 'Model', 'Apartment'):
            if keyword in model:
                return keyword
        return model

    df = pd.read_csv(path)
    df['month'] = pd.to_datetime(df['month'])

    # Age of the flat at the time of sale, in whole years and never negative.
    # Computed column-wise: a row-wise df.apply(axis=1) makes one Python call
    # per row, which is needlessly slow on large files.
    years_since_lease = df['month'].dt.year - df['lease_commence_date']
    df['age'] = years_since_lease.clip(lower=0).astype('int64')

    df['general_model'] = df['flat_model'].apply(classifyModel)
    df['storey'] = df['storey_range'].apply(classifyStorey)

    # Kept as a string (not an int), so dtype-based column selectors treat the
    # sale year as a categorical feature.
    df['sold_year'] = df['month'].dt.strftime('%Y')

    # Drop every column (axis=1) that has at least one missing value.
    df = df.dropna(axis=1)
    return df.copy()


In [84]:
# Load the training file and run the shared feature engineering on it.
train_df = processData('./train.csv')
# Preview the first rows (displayed as the cell's output).
train_df.head()

Unnamed: 0,town,month,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,age,general_model,storey,sold_year
0,GEYLANG,2006-01-01,4 ROOM,328,UBI AVE 1,04 TO 06,84.0,Simplified,1985.0,175000.0,21,Simplified,5,2006
1,JURONG EAST,2003-02-01,5 ROOM,271,TOH GUAN RD,10 TO 12,120.0,Improved,1999.0,353000.0,4,Improved,11,2003
2,TOA PAYOH,2005-09-01,3 ROOM,205,TOA PAYOH NTH,10 TO 12,65.0,Improved,1973.0,215000.0,32,Improved,11,2005
3,PASIR RIS,2001-03-01,EXECUTIVE,508,PASIR RIS ST 52,04 TO 06,146.0,Apartment,1993.0,482000.0,8,Apartment,5,2001
4,JURONG WEST,2014-07-01,EXECUTIVE,656B,JURONG WEST ST 61,10 TO 12,133.0,Apartment,2001.0,613000.0,13,Apartment,11,2014


In [85]:
# Columns used as model inputs, in the order they are passed to the
# preprocessing step.
features = [
    'town',
    'general_model',
    'flat_type',
    'storey',
    'sold_year',
    'age',
]


In [86]:
# Predictors: the engineered columns named in `features`.
X = train_df.loc[:, features]

# Response: resale price per square metre of floor area.
Y = train_df['resale_price'].div(train_df['floor_area_sqm'])

# Earlier approach, kept for reference:
# X = pd.get_dummies(data=X, drop_first=True)

# Preprocessing, routed by column dtype:
#   * numeric columns -> StandardScaler, then Normalizer
#   * object columns  -> one-hot encoding with the first level dropped
# NOTE(review): Normalizer rescales each row (sample), not each column, to unit
# norm — confirm that is intended after the per-column StandardScaler.
ct = make_column_transformer(
    (
        Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('normalize', Normalizer()),
        ]),
        make_column_selector(dtype_include=np.number),
    ),
    (
        OneHotEncoder(drop='first'),
        make_column_selector(dtype_include=object),
    ),
)

# NOTE(review): the transformer is fitted on all of `train_df` before the
# train/test split in a later cell, so held-out rows influence the fitted
# scaler and encoder — confirm this is acceptable for the evaluation.
X = ct.fit(X).transform(X)


In [87]:
# Convert categorical variables to dummy variables (superseded by the OneHotEncoder in the column transformer above; kept for reference)
# X = pd.get_dummies(data=X,drop_first=True)

In [88]:
# Hold out 33% of the rows for evaluation; random_state=42 keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)


In [89]:
# Fit ordinary least squares on the log of the target (price per sqm).
# has_constant='add' always prepends the intercept column. The default
# ('skip') silently omits it when the design matrix already contains a
# non-zero constant column, which would make the number of fitted
# parameters depend on the data.
est = sm.OLS(
    np.log(y_train),
    sm.add_constant(X_train.toarray(), has_constant='add'),
).fit()
print(est.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.859
Model:                            OLS   Adj. R-squared:                  0.859
Method:                 Least Squares   F-statistic:                 2.909e+04
Date:                Wed, 20 Apr 2022   Prob (F-statistic):               0.00
Time:                        21:03:29   Log-Likelihood:             1.8691e+05
No. Observations:              309885   AIC:                        -3.737e+05
Df Residuals:                  309819   BIC:                        -3.730e+05
Df Model:                          65                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.8304      0.039    199.467      0.0

In [90]:
# Predict log(price per sqm) for the held-out rows. has_constant='add' forces
# the intercept column to be present: with the default ('skip') it is dropped
# whenever some column of this matrix happens to be a non-zero constant, and
# the column count would then no longer match the fitted model.
y_pred = est.predict(
    sm.add_constant(X_test.toarray(), has_constant='add'))

In [91]:
# Archived experiment (entirely commented out): compared LinearRegression,
# DecisionTreeRegressor (max_depth 2 / 5 / 8) and RandomForestRegressor
# (max_depth 2 / 5 / 8, 1000 trees) by test-set MSE and R^2 on the
# untransformed target.

# from sklearn.tree import DecisionTreeRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor

# Linear_Model = LinearRegression()
# Linear_Model.fit(X_train, y_train)
# y_pred_lin = Linear_Model.predict(X_test)

# regr1 = DecisionTreeRegressor(max_depth=2, random_state=42, criterion='friedman_mse')
# regr2 = DecisionTreeRegressor(max_depth=5, random_state=42, criterion='friedman_mse')
# regr3 = DecisionTreeRegressor(max_depth=8, random_state=42, criterion='friedman_mse')

# regr4 = RandomForestRegressor(max_depth=2, random_state=42,n_estimators=1000)
# regr5 = RandomForestRegressor(max_depth=5, random_state=42, n_estimators=1000)
# regr6 = RandomForestRegressor(max_depth=8, random_state=42,n_estimators=1000)


# regr1.fit(X_train, y_train)
# regr2.fit(X_train, y_train)
# regr3.fit(X_train, y_train)

# regr4.fit(X_train, y_train)
# regr5.fit(X_train, y_train)
# regr6.fit(X_train, y_train)
# # decision tree
# y_1 = regr1.predict(X_test)
# y_2 = regr2.predict(X_test)
# y_3 = regr3.predict(X_test)
# # random forest
# y_4 = regr4.predict(X_test)
# y_5 = regr5.predict(X_test)
# y_6 = regr6.predict(X_test)

# print('linear model')
# print('-'*20)
# print(mean_squared_error(y_test, y_pred_lin))  # instead of y_true and y_pred?
# print(Linear_Model.score(X_test, y_test))
# print('descision model')
# print('-'*20)
# print(mean_squared_error(y_test, y_1))
# print(r2_score(y_test, y_1))

# print(mean_squared_error(y_test, y_2))
# print(r2_score(y_test, y_2))

# print(mean_squared_error(y_test, y_3))
# print(r2_score(y_test, y_3))
# print('random forest model')
# print('-'*20)
# print(mean_squared_error(y_test, y_4))
# print(r2_score(y_test, y_4))

# print(mean_squared_error(y_test, y_5))
# print(r2_score(y_test, y_5))

# print(mean_squared_error(y_test, y_6))
# print(r2_score(y_test, y_6))

# The two string literals below appear to be pasted outputs of earlier runs of
# the code above (alternating MSE and R^2 values) — presumably kept as a
# record; TODO confirm. They are bare expression statements; the second one,
# being the last expression in the cell, is echoed as the cell's output.
'''
249307.3995155182
0.8578557474755346
1456782.7736734557
0.16940572619690164
1103550.5746500914
0.3708033862556195
855868.7588298644
0.5120208015512975
1452489.8983405593
0.17185333728490415
1088995.8077608538
0.3791018822656015
830615.3229060769
0.5264192140333861
'''
'''
249307.3995155182
0.8578557474755346
855868.7588298641
0.5120208015512977
855870.7604444271
0.512019660317598
1662117.5614166406
0.0523327473052021
1452489.8983405593
0.17185333728490415
1088995.8077608538
0.3791018822656015
830615.3229060769
0.5264192140333861
'''


'\n249307.3995155182\n0.8578557474755346\n855868.7588298641\n0.5120208015512977\n855870.7604444271\n0.512019660317598\n1662117.5614166406\n0.0523327473052021\n1452489.8983405593\n0.17185333728490415\n1088995.8077608538\n0.3791018822656015\n830615.3229060769\n0.5264192140333861\n'

In [92]:



# Held-out error measured on the log scale, matching the log-transformed
# target the model was fitted on. The (MSE, R^2) pair is the cell's output.
(
    mean_squared_error(y_true=np.log(y_test), y_pred=y_pred),
    r2_score(y_true=np.log(y_test), y_pred=y_pred),
)

# Results noted from earlier runs of this cell:
# (249307.3995155186, 0.8578557474755344)
# (249307.39951551866, 0.8578557474755344)
# (264329.5240520152, 0.8492907844310387)
# (359681.76312715403, 0.7949250785746986)
# (326728.343355769, 0.8137136874592772)
# (0.015303960806279139, 0.8771978030267437)


(0.017524576156837772, 0.8593791188878472)

350993    2744.565217
293572    2260.000000
77014     2835.820896
37231     2058.823529
79166     3300.970874
             ...     
357468    5806.451613
11901     3290.174757
106690    2663.043478
151471    2971.631206
46672     3687.022901
Length: 152630, dtype: float64

Because the F-statistic is very large and Prob (F-statistic) is below 0.05, we reject the null hypothesis that all slope coefficients are zero: taken together, the feature variables have a statistically significant linear relationship with the (log-transformed) target variable.

In [93]:
# Run the same feature engineering on the test file.
test_df = processData('./test.csv')

# NOTE(review): this rebinds X_test, which until now held the held-out part of
# the training split; earlier evaluation cells cannot be re-run after this
# without re-running the split first.
X_test = test_df.loc[:, features]
X_test.head()


Unnamed: 0,town,general_model,flat_type,storey,sold_year,age
0,YISHUN,Improved,5 ROOM,5,2003,16
1,PUNGGOL,Apartment,4 ROOM,11,2019,5
2,ANG MO KIO,Maisonette,3 ROOM,5,2013,33
3,SERANGOON,Improved,5 ROOM,11,2014,15
4,YISHUN,Maisonette,3 ROOM,8,2005,20


In [94]:


# Earlier approach, kept for reference:
# X_test=pd.get_dummies(X_test,drop_first=True)
# Apply the column transformer fitted earlier on the training features
# (transform only, no re-fitting). This overwrites X_test with its
# transformed version, so re-run the previous cell before re-running this one.
X_test = ct.transform(X_test)


In [95]:

# Predict log(price per sqm) for the test file. has_constant='add' forces the
# intercept column to be present: with the default ('skip') it is dropped
# whenever some column of this matrix happens to be a non-zero constant, and
# the column count would then no longer match the fitted model.
y_pred = est.predict(
    sm.add_constant(X_test.toarray(), has_constant='add'))


In [106]:
# y_pred is on the log(price per sqm) scale: undo the log and multiply by the
# floor area to get the resale price.
# The product is taken positionally on NumPy arrays rather than between two
# Series, so it no longer depends on test_df's index lining up with a freshly
# built 0..n-1 index (misaligned labels would silently produce NaN rows), and
# a length mismatch now raises instead.
# The Series is deliberately left unnamed, like the original product of two
# differently named Series, so wrapping it with
# pd.DataFrame(..., columns=['Resale_price']) behaves as before.
submission = pd.Series(
    test_df['floor_area_sqm'].to_numpy() * np.exp(np.asarray(y_pred)),
    index=test_df.index,
)


In [107]:
# Shape the predictions into the two-column submission layout:
# a 1-based running 'Index' followed by the predicted 'Resale_price'.
submission = (
    pd.DataFrame(submission, columns=['Resale_price'])
    .assign(Index=lambda frame: np.arange(1, len(frame) + 1))
    .loc[:, ['Index', 'Resale_price']]
)


In [108]:
# Write the submission to the working directory; index=False leaves out the
# DataFrame's own index because the file carries an explicit 'Index' column.
submission.to_csv('submission.csv', index=False)


In [109]:
# Show the final submission table as the cell's output.
submission

Unnamed: 0,Index,Resale_price
0,1,256784.488587
1,2,442700.387688
2,3,375981.008473
3,4,644562.219288
4,5,146680.299788
...,...,...
115624,115625,213266.928849
115625,115626,173199.862528
115626,115627,406425.075274
115627,115628,173718.539413
