# Machine Learning Modeling
- Name: Minh T. Nguyen
- Date: 11/24/2023
- About:
    - Interest level prediction with classical ML model on the dataset (without image feature extraction)
    - For the sake of simplicity, we will only split the dataset into train/test. No need for grid-search, validation set, or k-fold validation for this project. For model comparison, we will only check for test accuracy for simplicity.
    - Models to consider:
        - KNN
        - Neural Networks
        - SVM
        - Decision Tree
        - Random Forests

In [1]:
# list the contents of the data directory (IPython shell escape)
!ls ../data

final_dataset_image.json     sentimental_extraction_kaggle.csv
final_dataset_no_image.json  sentimental_extraction_kaggle.json
images_sample		     sentimental_extraction_sample.csv
Kaggle-renthop.torrent	     train.json


**Note:** The datasets can be found [here](https://www.kaggle.com/competitions/two-sigma-connect-rental-listing-inquiries/data?select=train.json.zip).
- train.json: the training set.
- images_sample.zip: listing images organized by listing_id (a sample of 100 listings)
- Kaggle-renthop.7z: listing images organized by listing_id. Total size: 78.5 GB compressed.

In [2]:
# import libraries
import numpy as np
import pandas as pd
from collections import Counter
import re
import os
import joblib

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## 1. Import dataset

In [3]:
# load the dataset without image features (outliers have already been removed from this file)
df = pd.read_json(path_or_buf="../data/final_dataset_no_image.json")
# preview the first 100 rows
df.head(n=100)

Unnamed: 0,bathrooms,bedrooms,price,sentiment_label,feature_laundry in building,feature_dishwasher,feature_hardwood floors,feature_dogs allowed,feature_cats allowed,feature_doorman,feature_elevator,feature_no fee,feature_fitness center,interest_level
4,1.0,1,2400,1,1,1,1,1,1,0,0,0,0,0
6,1.0,2,3800,1,1,1,1,0,0,1,1,1,0,-1
9,1.0,2,3495,1,1,1,1,0,0,1,1,0,0,0
10,1.5,3,3000,0,0,0,0,0,0,0,0,0,0,0
15,1.0,0,2795,0,1,0,0,0,0,1,1,0,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,1.0,0,2795,0,1,1,1,0,0,1,1,0,0,-1
289,1.0,1,1875,0,0,1,1,1,1,0,1,0,0,-1
291,1.0,1,3450,0,0,0,0,1,1,1,0,0,1,-1
292,1.0,2,2659,1,1,0,0,1,1,0,1,0,0,-1


In [4]:
# inspect the column names (the features plus the 'interest_level' target)
df.columns

Index(['bathrooms', 'bedrooms', 'price', 'sentiment_label',
       'feature_laundry in building', 'feature_dishwasher',
       'feature_hardwood floors', 'feature_dogs allowed',
       'feature_cats allowed', 'feature_doorman', 'feature_elevator',
       'feature_no fee', 'feature_fitness center', 'interest_level'],
      dtype='object')

In [5]:
# per-column maximum, as a sanity check on the value ranges
df.max()

bathrooms                         10.0
bedrooms                           8.0
price                          13000.0
sentiment_label                    1.0
feature_laundry in building        1.0
feature_dishwasher                 1.0
feature_hardwood floors            1.0
feature_dogs allowed               1.0
feature_cats allowed               1.0
feature_doorman                    1.0
feature_elevator                   1.0
feature_no fee                     1.0
feature_fitness center             1.0
interest_level                     1.0
dtype: float64

In [6]:
# per-column minimum, as a sanity check on the value ranges
df.min()

bathrooms                       0.0
bedrooms                        0.0
price                          43.0
sentiment_label                 0.0
feature_laundry in building     0.0
feature_dishwasher              0.0
feature_hardwood floors         0.0
feature_dogs allowed            0.0
feature_cats allowed            0.0
feature_doorman                 0.0
feature_elevator                0.0
feature_no fee                  0.0
feature_fitness center          0.0
interest_level                 -1.0
dtype: float64

In [7]:
# report the number of rows in the dataset
print(f"There are {len(df)} samples.")

There are 48871 samples.


## 2. Data Processing

In [8]:
# split dataset into training set and test set
# X holds every column except the target; y is the 'interest_level' target column
X = df.drop('interest_level', axis=1)
y = df['interest_level'] 
# random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)  # 85% training and 15% test

In [None]:
# # feature scaling
# NOTE(review): scaling is currently disabled, so every model in this notebook is
# trained on raw features (price spans 43-13000 while most other columns are 0/1).
# Distance- and gradient-based models (KNN, SVM, MLP) are sensitive to this.
# If re-enabled, the scaler is fit on the training split only and then applied to
# the test split, which avoids leaking test statistics into training.
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

## 3. KNN

In [9]:
# build a 5-nearest-neighbours classifier and fit it on the training split
# (fit() returns the estimator itself, so construction and training are chained)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# label the held-out test samples
y_pred = knn.predict(X_test)

# report overall accuracy, then the per-class precision/recall/F1 table
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:", classification_report(y_test, y_pred), sep="\n")

Test Accuracy: 0.688719137907516
Full Report:
              precision    recall  f1-score   support

          -1       0.74      0.90      0.81      5082
           0       0.40      0.23      0.29      1690
           1       0.44      0.12      0.18       559

    accuracy                           0.69      7331
   macro avg       0.53      0.42      0.43      7331
weighted avg       0.64      0.69      0.65      7331



In [10]:
# save model
# joblib.dump does not create missing directories, so make sure 'models/' exists
# first; otherwise this cell fails on a fresh checkout
os.makedirs('models', exist_ok=True)
joblib.dump(knn, 'models/knn_model_generic_cls_not_scale.pkl')

['models/knn_model_generic_cls_not_scale.pkl']

In [None]:
# quick test: reload the saved KNN model from disk and re-evaluate it on the
# test split as a save/load round-trip check
knn_loaded = joblib.load('models/knn_model_generic_cls_not_scale.pkl')
y_pred = knn_loaded.predict(X_test)  # overwrites y_pred from the training cell
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## 4. Decision Trees

In [None]:
# create Decision Tree classifier
# random_state is fixed so the fitted tree and the reported metrics are
# reproducible across runs (same seed as the train/test split)
dt = DecisionTreeClassifier(random_state=1)

# train the classifier
dt.fit(X_train, y_train)

# predict the response for test dataset
y_pred = dt.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

In [None]:
# save model
# joblib.dump does not create missing directories, so make sure 'models/' exists
# first; otherwise this cell fails on a fresh checkout
os.makedirs('models', exist_ok=True)
joblib.dump(dt, 'models/dt_model_generic_cls.pkl')

In [None]:
# quick test: reload the saved Decision Tree model from disk and re-evaluate it
# on the test split as a save/load round-trip check
dt_loaded = joblib.load('models/dt_model_generic_cls.pkl')
y_pred = dt_loaded.predict(X_test)  # overwrites y_pred from the training cell
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## 5. SVM

In [None]:
# build a support-vector classifier with default settings and fit it on the
# training split (fit() returns the estimator, so the two steps are chained)
svm = SVC().fit(X_train, y_train)

# label the held-out test samples
y_pred = svm.predict(X_test)

# report overall accuracy, then the per-class precision/recall/F1 table
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# save model
# joblib.dump does not create missing directories, so make sure 'models/' exists
# first; otherwise this cell fails on a fresh checkout
os.makedirs('models', exist_ok=True)
joblib.dump(svm, 'models/svm_model_generic_cls.pkl')

In [None]:
# quick test: reload the saved SVM model from disk and re-evaluate it on the
# test split as a save/load round-trip check
svm_loaded = joblib.load('models/svm_model_generic_cls.pkl')
y_pred = svm_loaded.predict(X_test)  # overwrites y_pred from the training cell
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## 6. Multi-layer Perceptron

In [None]:
# build a small multi-layer perceptron (two hidden layers of 5 and 2 units,
# L-BFGS solver, fixed seed) and fit it on the training split;
# fit() returns the estimator, so construction and training are chained
mlp = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
).fit(X_train, y_train)

# label the held-out test samples
y_pred = mlp.predict(X_test)

# report overall accuracy, then the per-class precision/recall/F1 table
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# save model
# joblib.dump does not create missing directories, so make sure 'models/' exists
# first; otherwise this cell fails on a fresh checkout
os.makedirs('models', exist_ok=True)
joblib.dump(mlp, 'models/mlp_model_generic_cls.pkl')

In [None]:
# quick test: reload the saved MLP model from disk and re-evaluate it on the
# test split as a save/load round-trip check
mlp_loaded = joblib.load('models/mlp_model_generic_cls.pkl')
y_pred = mlp_loaded.predict(X_test)  # overwrites y_pred from the training cell
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## 7. Random Forest

In [None]:
# create Random Forest classifier
# random_state is fixed so the bootstrap samples / feature subsets, and hence the
# reported metrics, are reproducible across runs (same seed as the train/test split)
rf = RandomForestClassifier(n_estimators=10, random_state=1)

# train the classifier
rf.fit(X_train, y_train)

# predict the response for test dataset
y_pred = rf.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

In [None]:
# save model
# joblib.dump does not create missing directories, so make sure 'models/' exists
# first; otherwise this cell fails on a fresh checkout
os.makedirs('models', exist_ok=True)
# persist the Random Forest itself (previously the MLP was written to this path,
# so the reloaded "rf" model was actually the MLP)
joblib.dump(rf, 'models/rf_model_generic_cls.pkl')

In [None]:
# quick test: reload the model stored at the Random Forest path and re-evaluate
# it on the test split as a save/load round-trip check
rf_loaded = joblib.load('models/rf_model_generic_cls.pkl')
y_pred = rf_loaded.predict(X_test)  # overwrites y_pred from the training cell
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))