In [0]:
#R: Comments beginning with "R:" are added by me for the purpose of the assignment.
# download data (-q is the quiet mode)
! wget -q https://www.dropbox.com/s/lhb1awpi769bfdr/test.csv?dl=1 -O test.csv
! wget -q https://www.dropbox.com/s/gudb5eunj700s7j/train.csv?dl=1 -O train.csv
#R: Brings both the train and test data sets into our local folder for use.

In [0]:
import pandas as pd

#Using pandas assign the training data to a variable.
Xy_train = pd.read_csv('train.csv', engine='python')

In [0]:
#Plot a correlation heat map to see features and their correlation with price_rating.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
corrmat = Xy_train.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
#A point to note is that the correlation heatmap doesn't provide information for categorical attributes.

In [0]:
#Plot individual correlation plots for features I deemed well-correlated with price_rating
#Compares accommodates feature & price_rating
fig, ax = plt.subplots()
ax.scatter(Xy_train['accommodates'], Xy_train['price_rating'])
plt.ylabel('price_rating', fontsize=13)
plt.xlabel('accommodates', fontsize=13)
plt.show()

In [0]:
#Plot individual correlation plots for features I deemed well-correlated with price_rating
#Compares bedrooms feature & price_rating
fig, ax = plt.subplots()
ax.scatter(Xy_train['bedrooms'], Xy_train['price_rating'])
plt.ylabel('price_rating', fontsize=13)
plt.xlabel('bedrooms', fontsize=13)
plt.show()

In [0]:
#Plot individual correlation plots for features I deemed well-correlated with price_rating
#Compares beds feature & price_rating
fig, ax = plt.subplots()
ax.scatter(Xy_train['beds'], Xy_train['price_rating'])
plt.ylabel('price_rating', fontsize=13)
plt.xlabel('beds', fontsize=13)
plt.show()

In [0]:
#Plot individual correlation plots for features I deemed well-correlated with price_rating
#Compares square_feet feature & price_rating
fig, ax = plt.subplots()
ax.scatter(Xy_train['square_feet'], Xy_train['price_rating'])
plt.ylabel('price_rating', fontsize=13)
plt.xlabel('square_feet', fontsize=13)
plt.show()

In [0]:
#From the square_feet plot, it seems as though this column has many missing values...
nonMissing = 0
for i in range(0,len(Xy_train['square_feet'])):
  if (Xy_train['square_feet'][i] >= 0):
    nonMissing += 1
print("The number of missing entries in the square_feet feature is:",len(Xy_train['square_feet']) - nonMissing)\
#With how many missing values, it may not be worthwhile to interpret as the majority would all have
#the same value depending on interpretation method. This wouldn't give my model much information gain.

In [0]:
#Plot individual correlation plots for features I deemed well-correlated with price_rating
#Compares guests_included feature & price_rating
fig, ax = plt.subplots()
ax.scatter(Xy_train['guests_included'], Xy_train['price_rating'])
plt.ylabel('price_rating', fontsize=13)
plt.xlabel('guests_included', fontsize=13)
plt.show()

In [0]:
#Plot individual correlation plots for features I deemed well-correlated with price_rating
#Compares review_scores_cleanliness feature & price_rating
fig, ax = plt.subplots()
ax.scatter(Xy_train['review_scores_cleanliness'], Xy_train['price_rating'])
plt.ylabel('price_rating', fontsize=13)
plt.xlabel('review_scores_cleanliness', fontsize=13)
plt.show()

In [0]:
#Plot individual correlation plots for features I deemed well-correlated with price_rating
#Compares availability_30 feature & price_rating
fig, ax = plt.subplots()
ax.scatter(Xy_train['availability_30'], Xy_train['price_rating'])
plt.ylabel('price_rating', fontsize=13)
plt.xlabel('availability_30', fontsize=13)
plt.show()

In [0]:
#Drop outliers and fill in missing values for the features we care about.
import numpy as np
#Based on the correlation figure for the beds and bedrooms there was one extreme outlier.
#We drop this so our model can 'train' better.
Xy_train = Xy_train[Xy_train.beds != 20]
Xy_train = Xy_train[Xy_train.bedrooms != 20]
#We also drop the square_feet feature based on the previous investigations into it.
Xy_train = Xy_train.drop(columns=['square_feet'])

#Partition the training and testing data into separate variables as done in the sample script.
X_train = Xy_train.drop(columns=['price_rating'])
y_train = Xy_train[['price_rating']]
X_test = pd.read_csv('test.csv', engine='python')
testing_ids = X_test.Id

In [0]:
# model training and tuning
#This script attempts to accomplish the classification problem at hand using a Support Vector Machine.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC as svc

#Based on the scatterplots, these were the decided upon numeric features. Only three were used in order to not overfit the model.
numeric_features = ['bedrooms', 'accommodates', 'beds']
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors = 3, weights = 'uniform')),
    ('scaler', StandardScaler())])

#Similarly, I did not want to overfit the model with categorical features and so selected the two which intuitively could make the most
#sense in explaining the price level of an AirBnB listing.
categorical_features = ['property_type', 'room_type']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value="missing")),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

#A standard preprocessor variable for applying transformations to different types of data.
preprocessor = ColumnTransformer(
               transformers = [
                  ('num', numeric_transformer, numeric_features),
                  ('cat', categorical_transformer, categorical_features)])

#Define that the classifier model to be used is an SVM for classification. i.e. svc.
classif = Pipeline(
          steps = [
              ('preprocessor', preprocessor),
              ('classifier', svc())])

#Partitions the training and testing set by the selected attributes.
X_train = X_train[[*numeric_features, *categorical_features]]
X_test = X_test[[*numeric_features, *categorical_features]]

#Possible hyperparameters include C, gamma and the kernel to be used.
param_grid = {
    'classifier__C': [10,50,80],
    'classifier__gamma': [0.001,0.01,0.1],
    'classifier__kernel': ['rbf'],
    'preprocessor__num__imputer__n_neighbors': [3,4,5]
}

#Using the above "param_grid" we can search all possible combinations selecting the best set of hyperparameters
#for our model to use. (Final Model: C = 80 / gamma = 0.1 / kernel = rbf)
grid_search = GridSearchCV(
   classif, param_grid, cv=5, verbose = 3, n_jobs = 3, scoring='accuracy')
grid_search.fit(X_train, y_train)

print('best score {}'.format(grid_search.best_score_))
print('best hyperparameter set{}'.format(grid_search.best_params_))

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:   40.9s
[Parallel(n_jobs=3)]: Done 122 tasks      | elapsed:  5.3min
[Parallel(n_jobs=3)]: Done 135 out of 135 | elapsed:  7.2min finished
  y = column_or_1d(y, warn=True)


best score 0.7002218163848484
best hyperparameter set{'classifier__C': 80, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf', 'preprocessor__num__imputer__n_neighbors': 3}


In [0]:
# Prediction & generating the submission file
#Predict the price rating for each test record. Create a csv file containing the test records Id and the predicted
#price rating.
y_pred = grid_search.predict(X_test)
pd.DataFrame(
    {'Id': testing_ids, 'price_rating':y_pred}).to_csv('a1submission1.csv', index=False)