In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc

# Load the dataset
df = pd.read_csv("HDBresale_subset24.csv")
df

Unnamed: 0,Height,Remaining_lease,district_C,district_N,district_E,district_S,district_W,Improved,Model_A,New_Gen,Premium,Simplified,Rel_value
0,3,52.000000,1,0,0,0,0,1,0,0,0,0,1
1,3,59.666667,1,0,0,0,0,1,0,0,0,0,0
2,3,55.666667,0,0,1,0,0,1,0,0,0,0,0
3,3,53.916667,0,0,1,0,0,1,0,0,0,0,0
4,3,55.416667,0,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29664,24,79.166667,0,0,0,0,1,0,0,0,1,0,1
29665,24,80.500000,0,0,0,0,1,0,0,0,1,0,0
29666,24,79.000000,0,0,0,0,1,0,0,0,1,0,1
29667,24,80.333333,0,0,0,0,1,0,0,0,1,0,0


In [2]:
# Define feature columns and target variable
target = "Rel_value"
features = [col for col in df.columns if col != target]

# Apply MinMaxScaler to features only
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

# Split into training and testing sets
Train, Test = train_test_split(df, test_size=0.25, random_state=0)

# Separate features and target
X_train, y_train = Train[features], Train[target]
X_test, y_test = Test[features], Test[target]

# GradientBoostingClassifier
gb_model = GradientBoostingClassifier(n_estimators=500, random_state=1, 
                                      max_features="sqrt", learning_rate=0.01, max_depth=None)
gb_model.fit(X_train, y_train)
gb_predictions = gb_model.predict(X_test)
gb_proba = gb_model.predict_proba(X_test)[:, 1]

# Evaluate GradientBoostingClassifier
gb_accuracy = accuracy_score(y_test, gb_predictions)
gb_fpr, gb_tpr, _ = roc_curve(y_test, gb_proba)
gb_auc = auc(gb_fpr, gb_tpr)
print("Gradient Boosting Classifier Accuracy:", gb_accuracy)
print("Gradient Boosting Classifier AUC:", gb_auc)
print("Gradient Boosting Classifier Report:")
print(classification_report(y_test, gb_predictions))

# RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=1, 
                                  max_features="sqrt", max_depth=None)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_proba = rf_model.predict_proba(X_test)[:, 1]

# Evaluate RandomForestClassifier
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_proba)
rf_auc = auc(rf_fpr, rf_tpr)
print("Random Forest Classifier Accuracy:", rf_accuracy)
print("Random Forest Classifier AUC:", rf_auc)
print("Random Forest Classifier Report:")
print(classification_report(y_test, rf_predictions))

# 1. District with highest average remaining lease
average_lease_by_district = df.groupby(['district_C', 'district_N', 'district_E', 'district_S', 'district_W'])['Remaining_lease'].mean()
highest_avg_lease_district = average_lease_by_district.idxmax()
print("District with the highest average remaining lease:", highest_avg_lease_district)

# 3. Most important features in RandomForestClassifier
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
top_two_features = feature_importance_df.head(2)
print("Top two features by importance (Random Forest):")
print(top_two_features)

# 4. Predicted probability for specified inputs using RandomForestClassifier
# Given input: Premium flat in Eastern district, 24th storey, 90 years remaining lease
min_lease, max_lease = 45.75, 97.16667
scaled_remaining_lease = (90 - min_lease) / (max_lease - min_lease)

input_data = pd.DataFrame({
    'Height': [1],  # Scaled value for 24th storey
    'Remaining_lease': [scaled_remaining_lease],
    'district_C': [0],
    'district_N': [0],
    'district_E': [1],  # Eastern district
    'district_S': [0],
    'district_W': [0],
    'Improved': [0],
    'Model_A': [0],
    'New_Gen': [0],
    'Premium': [1],  # Premium flat
    'Simplified': [0]
})

predicted_proba = rf_model.predict_proba(input_data)[:, 1][0]
print(f"Predicted probability of Y=1 for the given input: {predicted_proba:.4f}")

Gradient Boosting Classifier Accuracy: 0.8964680506875169
Gradient Boosting Classifier AUC: 0.8454622132253711
Gradient Boosting Classifier Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      6240
           1       0.74      0.54      0.62      1178

    accuracy                           0.90      7418
   macro avg       0.83      0.75      0.78      7418
weighted avg       0.89      0.90      0.89      7418

Random Forest Classifier Accuracy: 0.8944459423025074
Random Forest Classifier AUC: 0.8734692655957512
Random Forest Classifier Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      6240
           1       0.72      0.55      0.62      1178

    accuracy                           0.89      7418
   macro avg       0.82      0.76      0.78      7418
weighted avg       0.89      0.89      0.89      7418

District with the highest average remaining lease: (0.0, 1.0, 