In [0]:
#R: Comments beginning with "R:" are added by me for the purpose of the assignment.
#R: Brings both the train and test data sets into our "local" folder for use.
# download data (-q is the quiet mode)
! wget -q https://www.dropbox.com/s/lhb1awpi769bfdr/test.csv?dl=1 -O test.csv
! wget -q https://www.dropbox.com/s/gudb5eunj700s7j/train.csv?dl=1 -O train.csv

In [0]:
#R: Import pandas, a Python library used in data manipulation and analysis.
import pandas as pd

#R: Read the full training table with pandas, then split it in two:
#   the descriptive attributes (X_train --> bedrooms, accommodates, etc.)
#   and the attribute to be predicted (y_train --> price_rating, kept as a
#   one-column DataFrame).
Xy_train = pd.read_csv('train.csv', engine='python')
y_train = Xy_train.loc[:, ['price_rating']]
X_train = Xy_train.drop('price_rating', axis=1)

#R: Report how many training records we have.
print('training', X_train.shape[0])
#R: Draw a histogram of the 'price_rating' values in the training set,
#   i.e. the number of records under each target class label.
Xy_train['price_rating'].hist()

#R: Read the records of the testing set. These are the listings whose
#   price_rating we attempt to predict.
X_test = pd.read_csv('test.csv', engine='python')

#R: Keep the Id column of the test listings in its own variable.
testing_ids = X_test['Id']

#R: Report how many test records we have.
print('testing', X_test.shape[0])


In [0]:
# model training and tuning
#R: Import the libraries used for data manipulation and analysis, and for building
#   the machine learning models that predict the price_rating.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost.sklearn import XGBClassifier

#R: Seed NumPy's global random number generator so that repeated runs of the
#   script start from the same random state.
np.random.seed(0)

#R: Numeric attributes of the training set that the model is trained with.
numeric_features = [
    'bedrooms',
    'review_scores_location',
    'accommodates',
    'beds',
]

#R: Categorical attributes of the training set that the model is trained with.
categorical_features = [
    'property_type',
    'is_business_travel_ready',
    'room_type',
]

#R: Preprocessing applied to the numeric columns. A Pipeline runs its steps in order:
#   - SimpleImputer fills missing values with the median of the attribute's known values
#     (the hyperparameter grid defined later in this cell sets this strategy to 'mean').
#   - StandardScaler standardizes each attribute to zero mean and unit variance.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

#R: Preprocessing applied to the categorical columns:
#   - SimpleImputer fills missing values with the constant string 'missing'.
#   - OneHotEncoder turns each category into a one-hot numeric array; categories
#     not seen during fitting are ignored instead of raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

#R: Combine both transformers, each one applied only to its own list of columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

#R: Chain the preprocessor with the machine learning model we would like to use.
#   In this sample script the model is XGBoost for classification (the names "regr" and
#   "regressor" are kept as they are, but the estimator is a classifier).
#   Two hyperparameters are set explicitly: the objective and the random seed.
#R: The seed is passed as `random_state`; the `seed` keyword is a deprecated
#   alias in XGBoost's scikit-learn wrapper.
regr = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', XGBClassifier(
                          objective='multi:softmax', random_state=1))])

#R: From the full training and test datasets, keep only the columns which pertain to the
#   attributes we use for training and classification.
feature_columns = [*numeric_features, *categorical_features]
X_train = X_train[feature_columns]
X_test = X_test[feature_columns]

# `__` denotes attribute
# (e.g. regressor__n_estimators means the `n_estimators` param for `regressor`
#  which is our xgb)
#R: Create a grid of possible hyperparameter values. Each combination in this grid will be evaluated.
#   Note: the only imputer strategy listed is 'mean', so it replaces the 'median'
#   strategy that the numeric transformer was constructed with.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean'],
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth':[10, 20]
}
#R: Using the above hyperparameter grid, evaluate each combination. Use 5-fold cross validation.
#   The scoring metric used is accuracy.
grid_search = GridSearchCV(
    regr, param_grid, cv=5, verbose=3, n_jobs=2,
    scoring='accuracy')
#R: y_train is a one-column DataFrame. It is flattened to a 1-D array here because
#   estimators expect a 1-D target; a column vector can trigger a
#   DataConversionWarning on every fit.
grid_search.fit(X_train, np.ravel(y_train))

#R: Prints the best score achieved from our hyperparameter search.
print('best score {}'.format(grid_search.best_score_))

In [0]:
# Prediction & generating the submission file
#R: Predict a label for every test record, pair each prediction with the Id of
#   its record, and write the result to a csv file (without the index column).
y_pred = grid_search.predict(X_test)
submission = pd.DataFrame({'Id': testing_ids, 'price_rating': y_pred})
submission.to_csv('sample_submission.csv', index=False)