## Data Preprocessing

In [9]:
import pandas as pd
import numpy as np
import csv
import datetime

## Feature Selection 

In [10]:
train = pd.read_csv("train_clean.csv")
test = pd.read_csv('test_clean.csv')
sample = pd.read_csv("Sample Submission DAC.csv")


In [11]:
sample.head()

Unnamed: 0,id,regency_cluster
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [12]:
print(train.shape)
print(test.shape)

(343583, 21)
(21018, 20)


In [13]:
test.drop(['Unnamed: 0'], axis = 1, inplace=True)


In [14]:
test.isnull().sum()

site                 0
continent_id         0
buyer_country        0
buyer_region         0
buyer_city           0
distance             0
mobile               0
package              0
channel_id           0
adults               0
children             0
room                 0
destination_id       0
destination_type     0
regency_continent    0
regency_country      0
regency_market       0
Number of days       0
cnt                  0
dtype: int64

In [29]:
del test['site']
del test['mobile']
del test['package']


In [10]:
train.head(10)

Unnamed: 0,site,continent_id,buyer_country,buyer_region,buyer_city,distance,buyer_id,mobile,package,channel_id,...,children,room,destination_id,destination_type,regency_continent,regency_country,regency_market,cnt,regency_cluster,Number of days
0,2,3,66,348,48862,2234.2641,12,0,1,9,...,0,1,8250,1,3,2,50,628,1,4
1,2,3,66,348,48862,2234.2641,12,0,1,9,...,0,1,8250,1,1,2,50,628,1,4
2,2,3,66,348,48862,2234.2641,12,0,0,9,...,0,1,8250,1,1,2,50,628,1,4
3,2,3,66,442,35390,913.1932,93,0,0,3,...,0,1,14984,1,1,2,50,1457,80,5
4,2,3,66,442,35390,913.6259,93,0,0,3,...,0,1,14984,1,1,2,50,1457,21,5
5,2,3,66,442,35390,911.5142,93,0,0,3,...,0,1,14984,1,1,2,50,1457,92,5
6,2,3,66,189,10067,599.5329,501,0,0,2,...,0,1,8267,1,2,2,50,675,41,1
7,2,3,66,189,10067,599.5329,501,0,1,2,...,0,1,8267,1,1,2,50,675,41,1
8,2,3,66,189,10067,599.5329,501,0,0,2,...,0,1,8267,1,1,2,50,675,69,1
9,2,3,66,189,10067,599.5329,501,0,0,2,...,0,1,8267,1,1,2,50,675,70,1


In [15]:
main_df = train.copy()

In [16]:
from sklearn.model_selection import train_test_split

## Split into train and test set
df_target = train['regency_cluster']
df_feature = train.drop(['regency_cluster','buyer_id','site','mobile','package'], axis = 1)
## distance, 

In [17]:
print(df_target.shape)
print(df_feature.shape)

(343583,)
(343583, 16)


In [18]:
x_train, x_test, y_train, y_test = train_test_split(df_feature, df_target, test_size=0.2)


In [23]:
print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

(274866, 16)
(274866,)
(68717, 16)
(68717,)


## Random Forest


In [19]:
from sklearn.ensemble import RandomForestClassifier
classifier2 = RandomForestClassifier(n_estimators = 50, random_state = 42)
classifier2.fit(df_feature, df_target)

RandomForestClassifier(n_estimators=50, random_state=42)

In [20]:
feature_importances_3 = pd.DataFrame({'features':df_feature.columns,'feature_importance':classifier2.feature_importances_})
print(feature_importances_3.sort_values('feature_importance',ascending=False))

             features  feature_importance
4            distance            0.216061
3          buyer_city            0.115184
11  regency_continent            0.103244
15     Number of days            0.087287
2        buyer_region            0.085496
5          channel_id            0.078810
9      destination_id            0.074417
14                cnt            0.063460
6              adults            0.040117
7            children            0.028989
1       buyer_country            0.028285
13     regency_market            0.023622
10   destination_type            0.017671
8                room            0.014826
0        continent_id            0.012838
12    regency_country            0.009693


## Evaluation

In [25]:
print(classifier2.score(x_train,y_train))
print(classifier2.score(x_test,y_test))

0.7972939541449288
0.7969643610751342


In [30]:
y_pred = classifier2.predict(test)

In [31]:
y_pred

array([11,  3,  3, ..., 89, 25, 25], dtype=int64)

In [32]:
def formatNumber(num):
    return int(num)
    

In [34]:
#assuming id for test dataset is equal to the id of submission dataset
pred = pd.DataFrame(y_pred)
sub_df = pd.read_csv('Sample Submission DAC.csv')
datasets=pd.concat([sub_df['id'],pred],axis=1)
datasets.columns=['id','regency_cluster']
datasets.to_csv('sample_submission_test.csv',index=False)
final_sub_df = pd.read_csv('sample_submission_test.csv')
final_sub_df = final_sub_df.head(10001)
final_sub_df['id'] = final_sub_df['id'].astype(np.int64)
final_sub_df.to_csv('SA-21-0191_Exponentials_Asia Pacific University.csv',index=False)

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import pprint as pp
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import uniform, truncnorm, randint

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(5, 30, num = 10)]



# Create the random grid
random_grid = {'n_estimators': n_estimators,
#                'max_depth': max_depth,       
#                'max_features': truncnorm(a=0, b=1, loc=0.25, scale=0.1),
#                'min_samples_split': uniform(0.01, 0.199)
               }
pp.pprint(random_grid)

# Random Search Training

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=1, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(df_feature, df_target)


In [None]:
rf_random.best_params_