In [1]:
# CMPT459 2020 spring
# group name: Salt

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
%matplotlib inline
from sklearn import model_selection
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RepeatedKFold 
from sklearn import tree
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_score, recall_score

def outlier_fd(data, threshold=0.5):
    low, high = np.percentile(data,1), np.percentile(data, 99)
    dist = high - low
    cut_off = dist * threshold
    lower, upper = low-cut_off, high+cut_off
    return [lower, upper]

def photos_number_counter(x):
    return len(x)

def valid_des(s):
    valid_s = " "
    for i in s.split():
        if i.isalnum():
            valid_s = " ".join([valid_s, i])
            
    return valid_s.strip()

In [2]:
def dataforclf(train):
    #Find the outlier of price, latitude and longitude using bound given by outlier_fd
    [price_lower, price_higher] = outlier_fd(train['price'])
    outlier_price = train[(train['price']<price_lower)|(train['price']>price_higher)]
    [latitude_lower, latitude_higher] = outlier_fd(train['latitude'],1)
    outlier_latitude = train[(train['latitude']<latitude_lower)|(train['latitude']>latitude_higher)]
    [longitude_lower, longitude_higher] = outlier_fd(train['longitude'],1)
    outlier_longitude = train[(train['longitude']<longitude_lower)|(train['longitude']>longitude_higher)]
    #deal with outliers, set the column for modified data and plot again
    train['price_modify'] = train['price']
    train['price_modify'].loc[train['price_modify']<price_lower] = price_lower
    train['price_modify'].loc[train['price_modify']>price_higher] = price_higher
    #deal with outliers, set the column for modified data and plot again
    train['latitude_modify'] = train['latitude']
    train['latitude_modify'].loc[train['latitude_modify']<latitude_lower] = latitude_lower
    train['latitude_modify'].loc[train['latitude_modify']>latitude_higher] = latitude_higher
    #deal with outliers, set the column for modified data and plot again
    train['longitude_modify'] = train['longitude']
    train['longitude_modify'].loc[train['longitude']<longitude_lower] = longitude_lower
    train['longitude_modify'].loc[train['longitude']>longitude_higher] = longitude_higher
    #get the number of photos
    train['photos_number'] = train['photos'].apply(photos_number_counter)
    #set all feature words in tf_idf as features for train for feature_modify
    train['features_modify'] = train['features'].apply(lambda x:" ".join(["_".join(i.split(" "))for i in x]))
    tfidf_f = TfidfVectorizer(stop_words=['central_a', 'pre', 'war','features'], max_features=98)
    train_sparse_f = tfidf_f.fit_transform(train['features_modify'])
    feature_list = tfidf_f.get_feature_names()
    array1 = train_sparse_f.toarray()
    for i, feature_str in enumerate(feature_list):
        list_all = []
        [rows, cols] = array1.shape
        for row in range(rows):
            list_all.append(array1[row][i])
        feature_str = feature_str + '(f)'
        feature_v = pd.Series(list_all,train.index, name = feature_str)
        train[feature_str] = feature_v
    #set all feature words in tf_idf as features for train for description_modify
    train['description_modify'] = train['description'].apply(lambda x:valid_des(x))
    tfidf_d = TfidfVectorizer(stop_words=['features'], max_features=100)
    train_sparse_d = tfidf_d.fit_transform(train['description_modify'])
    description_list = tfidf_d.get_feature_names()
    array2 = train_sparse_d.toarray()
    for i, description_str in enumerate(description_list):
        list_all = []
        [rows, cols] = array2.shape
        for row in range(rows):
            list_all.append(array2[row][i])
        description_str_ = description_str + '(d)'
        description_v = pd.Series(list_all,train.index, name = description_str_)
        train[description_str_] = description_v
    ## /*add something to change the created to year month day*/
    train['created']=pd.to_datetime(train["created"])
    #extract pieces
    train['year']=train['created'].dt.year
    train['month']=train['created'].dt.month
    train['day']=train['created'].dt.day
    ## /*add something to do feature selection*/
    train = train.drop(["building_id", "created", "description", "display_address", "features", 
                         "manager_id", "photos", "street_address", "price", "longitude",
                        "latitude", "features_modify", "description_modify"], axis=1)
    #drop stop words
    train = train.drop(['an(d)','and(d)','are(d)','as(d)','at(d)','be(d)','by(d)','can(d)','for(d)','from(d)','in(d)','is(d)','it(d)','just(d)','me(d)','more(d)','natural(d)','new(d)','or(d)','park(d)','that(d)','the(d)','this(d)','to(d)','will(d)','with(d)','you(d)','your(d)'], axis = 1)
    return train

In [3]:
#main function
train = pd.read_json('train.json')
test = pd.read_json('test.json')

In [4]:
train['test']=False
test['test']=True

cdf = train.append(test).reset_index(drop=True)
cdf = dataforclf(cdf)
train = cdf[cdf.test == False]
test = cdf[cdf.test == True]
train = train.drop(["test"],axis=1)
test = test.drop(["interest_level","test"],axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
test

Unnamed: 0,bathrooms,bedrooms,listing_id,price_modify,latitude_modify,longitude_modify,photos_number,_photos(f),actual_apt(f),balcony(f),...,supports(d),text(d),two(d),unit(d),west(d),windows(d),york(d),year,month,day
49352,1.0,1,7142618,2950.0,40.7185,-73.9865,8,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2016,6,11
49353,1.0,2,7210040,2850.0,40.7278,-74.0000,3,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2016,6,24
49354,1.0,0,7174566,2295.0,40.7260,-74.0026,1,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2016,6,17
49355,1.0,2,7191391,2900.0,40.7321,-74.0028,4,0.0,0.0,0.0,...,0.330474,0.000000,0.0,0.000000,0.0,0.0,0.292075,2016,6,21
49356,1.0,1,7171695,3254.0,40.7054,-74.0095,6,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2016,6,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124006,1.0,1,6928108,1700.0,40.7925,-73.9454,10,0.0,0.0,0.0,...,0.000000,0.273799,0.0,0.000000,0.0,0.0,0.000000,2016,4,26
124007,1.0,2,6906674,4195.0,40.7456,-73.9797,4,0.0,0.0,0.0,...,0.000000,0.188096,0.0,0.000000,0.0,0.0,0.000000,2016,4,21
124008,1.0,0,6897967,2400.0,40.7416,-73.9829,0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.135853,0.0,0.0,0.000000,2016,4,20
124009,2.0,2,6842183,6895.0,40.7485,-73.9800,8,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.193046,2016,4,8


In [6]:
train

Unnamed: 0,bathrooms,bedrooms,interest_level,listing_id,price_modify,latitude_modify,longitude_modify,photos_number,_photos(f),actual_apt(f),...,supports(d),text(d),two(d),unit(d),west(d),windows(d),york(d),year,month,day
0,1.0,1,medium,7170325,2400.0,40.7108,-73.9539,12,0.0,0.0,...,0.000000,0.277420,0.0,0.000000,0.0,0.0,0.000000,2016,6,16
1,1.0,2,low,7092344,3800.0,40.7513,-73.9722,6,0.0,0.0,...,0.000000,0.158305,0.0,0.000000,0.0,0.0,0.000000,2016,6,1
2,1.0,2,medium,7158677,3495.0,40.7575,-73.9625,6,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2016,6,14
3,1.5,3,medium,7211212,3000.0,40.7145,-73.9425,5,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2016,6,24
4,1.0,0,low,7225292,2795.0,40.7439,-73.9743,4,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2016,6,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49347,1.0,3,low,6824800,2800.0,40.8433,-73.9396,5,0.0,0.0,...,0.125354,0.000000,0.0,0.000000,0.0,0.0,0.110788,2016,4,5
49348,1.0,2,medium,6813268,2395.0,40.8198,-73.9578,5,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.228953,2016,4,2
49349,1.0,1,medium,6927093,1850.0,40.5765,-73.9554,3,0.0,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,2016,4,26
49350,1.0,2,medium,6892816,4195.0,40.7448,-74.0017,5,0.0,0.0,...,0.000000,0.000000,0.0,0.163351,0.0,0.0,0.000000,2016,4,19


In [7]:
#For training
x_train = train.drop(["listing_id","interest_level","photos_number","day"],axis=1)
#For testing
x_test = test.drop(["listing_id","photos_number","day"],axis=1)

y = pd.get_dummies(train['interest_level'])
#X_train,X_test,Y_train,Y_test = train_test_split(x_train,y,test_size=0.33,random_state=0)

# Frist Version under final choice of features

In [8]:
Best_validation_score = 0
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=None)
best_loss=999
for train_index, test_index in kf.split(x_train):
    x_train['id'] = list(range(0,49352))
    x_train.set_index("id", inplace=True)
    y['id'] = list(range(0,49352))
    y.set_index("id", inplace=True)
    X_train, X_test = x_train.loc[train_index,:], x_train.loc[test_index,:]
    Y_train, Y_test = y.loc[train_index,:], y.loc[test_index,:]
    Tree = OneVsRestClassifier(tree.DecisionTreeClassifier())
    Tree.fit(X_train,Y_train)
    print("Current training accuracy: %0.5f" % Tree.score(X_train,Y_train))
    Validation_score = Tree.score(X_test,Y_test)
    if Validation_score > Best_validation_score:
        Best_validation_score = Validation_score
        Best_Tree = Tree
    Y_labeled = Tree.predict_proba(X_test)
    loss = log_loss(Y_test,Y_labeled)
    if best_loss > loss:
        best_loss = loss
    print ("Current validation accuracy: %0.5f" % Validation_score)
print ("Best validation accuracy: %0.5f" % Best_validation_score)
print ("Best log loss: %0.5f" % best_loss)

Current training accuracy: 0.97334
Current validation accuracy: 0.53241
Current training accuracy: 0.97373
Current validation accuracy: 0.54113
Current training accuracy: 0.97278
Current validation accuracy: 0.55218
Current training accuracy: 0.97341
Current validation accuracy: 0.53212
Current training accuracy: 0.97442
Current validation accuracy: 0.53171
Current training accuracy: 0.97382
Current validation accuracy: 0.53556
Current training accuracy: 0.97438
Current validation accuracy: 0.54225
Current training accuracy: 0.97249
Current validation accuracy: 0.54853
Current training accuracy: 0.97321
Current validation accuracy: 0.55502
Current training accuracy: 0.97384
Current validation accuracy: 0.53698
Current training accuracy: 0.97267
Current validation accuracy: 0.53890
Current training accuracy: 0.97339
Current validation accuracy: 0.52938
Current training accuracy: 0.97373
Current validation accuracy: 0.54610
Current training accuracy: 0.97283
Current validation accuracy: 

In [9]:
Y_Prediction = Best_Tree.predict_proba(x_test)
test['listing_id'] = test['listing_id'].astype(int)
ids = test['listing_id']
labels = pd.DataFrame(Y_Prediction)
labels.reset_index(drop=True, inplace=True)
ids.reset_index(drop=True, inplace=True)
out = pd.concat([ids, labels], axis=1, ignore_index=True)
out.columns = ['listing_id','high','low','medium']
out

Unnamed: 0,listing_id,high,low,medium
0,7142618,0.0,1.0,0.0
1,7210040,1.0,1.0,0.0
2,7174566,0.0,1.0,0.0
3,7191391,0.0,0.0,0.0
4,7171695,0.0,1.0,0.0
...,...,...,...,...
74654,6928108,0.0,0.0,0.0
74655,6906674,0.0,0.0,1.0
74656,6897967,0.0,1.0,0.0
74657,6842183,0.0,1.0,0.0


In [10]:
out.to_csv('submission.csv',index = False)

# Improvement on parameters of the decision tree and k-fold cross validation

In [8]:
#Choose the best k for k-fold cross validation.
best_loss = 999
for i in range(2,11):
    kf = RepeatedKFold(n_splits=i, n_repeats=1, random_state=None)
    for train_index, test_index in kf.split(x_train):
        x_train['id'] = list(range(0,49352))
        x_train.set_index("id", inplace=True)
        y['id'] = list(range(0,49352))
        y.set_index("id", inplace=True)
        X_train, X_test = x_train.loc[train_index,:], x_train.loc[test_index,:]
        Y_train, Y_test = y.loc[train_index,:], y.loc[test_index,:]
        Tree = OneVsRestClassifier(tree.DecisionTreeClassifier(criterion='gini',max_depth=4,random_state=0))
        Tree.fit(X_train,Y_train)
        Y_labeled = Tree.predict_proba(X_test)
        loss = log_loss(Y_test,Y_labeled)
        if best_loss > loss:
            best_loss = loss
            res = i
print ("Best result: %d" % res)

Best result: 10


In [8]:
#Choose the best max_depth.
best_loss=999
kf = RepeatedKFold(n_splits=8, n_repeats=1, random_state=None)
for train_index, test_index in kf.split(x_train):
    x_train['id']=list(range(0,49352))
    x_train.set_index("id", inplace=True)
    y['id']=list(range(0,49352))
    y.set_index("id", inplace=True)
    X_train, X_test = x_train.loc[train_index,:], x_train.loc[test_index,:]
    Y_train, Y_test = y.loc[train_index,:], y.loc[test_index,:]
    for i in range(2,15):
        Tree=OneVsRestClassifier(tree.DecisionTreeClassifier(criterion='gini',max_depth=i,random_state=0))
        Tree.fit(X_train,Y_train)
        #validation_score = Tree.score(X_test,Y_test)
        Y_labeled=Tree.predict_proba(X_test)
        loss=log_loss(Y_test,Y_labeled)
        if best_loss>loss:
            best_loss=loss
            best_depth = i
print ("Best result: %d" % best_depth)

Best result: 6


In [8]:
#Choose the best max_leaf_nodes.
best_loss = 999
kf = RepeatedKFold(n_splits=8, n_repeats=1, random_state=None)
for i in range (15,30):
    for train_index, test_index in kf.split(x_train):
        x_train['id'] = list(range(0,49352))
        x_train.set_index("id", inplace=True)
        y['id'] = list(range(0,49352))
        y.set_index("id", inplace=True)
        X_train, X_test = x_train.loc[train_index,:], x_train.loc[test_index,:]
        Y_train, Y_test = y.loc[train_index,:], y.loc[test_index,:]
        Tree = OneVsRestClassifier(tree.DecisionTreeClassifier(criterion='gini',max_depth=5,random_state=0,max_leaf_nodes=i))
        Tree.fit(X_train,Y_train)
        #validation_score = Tree.score(X_test,Y_test)
        Y_labeled = Tree.predict_proba(X_test)
        loss = log_loss(Y_test,Y_labeled)
        if best_loss > loss:
            best_loss = loss
            best_res = i
print ("Best result: %d" % best_res)

Best result: 23


In [8]:
#Choose the best min_samples_leaf.
best_loss=999
kf = RepeatedKFold(n_splits=8, n_repeats=1, random_state=None)
for i in range (2,20):
    for train_index, test_index in kf.split(x_train):
        x_train['id']=list(range(0,49352))
        x_train.set_index("id", inplace=True)
        y['id']=list(range(0,49352))
        y.set_index("id", inplace=True)
        X_train, X_test = x_train.loc[train_index,:], x_train.loc[test_index,:]
        Y_train, Y_test = y.loc[train_index,:], y.loc[test_index,:]
        Tree=OneVsRestClassifier(tree.DecisionTreeClassifier(criterion='gini',max_depth=5,random_state=0,max_leaf_nodes=23,min_samples_leaf=i))
        Tree.fit(X_train,Y_train)
        #validation_score = Tree.score(X_test,Y_test)
        Y_labeled=Tree.predict_proba(X_test)
        loss=log_loss(Y_test,Y_labeled)
        if best_loss>loss:
            best_loss=loss
            best_res = i
print ("Best result: %d" % best_res)

Best result: 6


In [13]:
#Choose the best criterion. The result shows the two criterion has similar effects.
kf = RepeatedKFold(n_splits=8, n_repeats=1, random_state=None)
for train_index, test_index in kf.split(x_train):
    x_train['id'] = list(range(0,49352))
    x_train.set_index("id", inplace=True)
    y['id'] = list(range(0,49352))
    y.set_index("id", inplace=True)
    X_train, X_test = x_train.loc[train_index,:], x_train.loc[test_index,:]
    Y_train, Y_test = y.loc[train_index,:], y.loc[test_index,:]
    Tree = OneVsRestClassifier(tree.DecisionTreeClassifier(criterion='entropy',max_depth=6,random_state=0)).fit(X_train,Y_train)
    validation_score1 = Tree.score(X_test,Y_test)
    Y_labeled = Tree.predict_proba(X_test)
    loss1 = log_loss(Y_test,Y_labeled)
    Tree = OneVsRestClassifier(tree.DecisionTreeClassifier(criterion='gini',max_depth=6,random_state=0)).fit(X_train,Y_train)
    validation_score2 = Tree.score(X_test,Y_test)
    Y_labeled = Tree.predict_proba(X_test)
    loss2 = log_loss(Y_test,Y_labeled)
    if validation_score1 >= validation_score2:
        print('Validation accuray: Entropy')
    else:
        print('Validation accuracy: Gini index')
    if loss1 <= loss2:
        print('Log loss: Entropy')
    else:
        print('Log loss: Gini index')


Validation accuracy: Gini index
Log loss: Gini index
Validation accuray: Entropy
Log loss: Gini index
Validation accuracy: Gini index
Log loss: Entropy
Validation accuracy: Gini index
Log loss: Gini index
Validation accuracy: Gini index
Log loss: Gini index
Validation accuray: Entropy
Log loss: Gini index
Validation accuracy: Gini index
Log loss: Gini index
Validation accuray: Entropy
Log loss: Gini index


# Final version

In [8]:
best_loss = 999
Best_validation_score = 0
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=None)
best_precision=0
precision=[]
recall=[]
for train_index, test_index in kf.split(x_train):
    x_train['id'] = list(range(0,49352))
    x_train.set_index("id", inplace=True)
    y['id'] = list(range(0,49352))
    y.set_index("id", inplace=True)
    X_train, X_test = x_train.loc[train_index,:], x_train.loc[test_index,:]
    Y_train, Y_test = y.loc[train_index,:], y.loc[test_index,:]
    Tree = OneVsRestClassifier(tree.DecisionTreeClassifier(criterion='gini',max_depth=6,random_state=0,splitter='best',max_leaf_nodes=23,min_samples_leaf=6))
    Tree.fit(X_train,Y_train)
    print("Current training accuracy: %0.5f" % Tree.score(X_train,Y_train))
    #accuracy
    Validation_score = Tree.score(X_test,Y_test)
    if Validation_score > Best_validation_score:
        Best_validation_score = Validation_score
        #Best_Tree = Tree   
    #log loss
    Y_labeled = Tree.predict_proba(X_test)
    loss = log_loss(Y_test,Y_labeled)
    if best_loss > loss:
        best_loss = loss
        #Best_Tree = Tree  
    #precision
    Y_Pred = Tree.predict(X_test)
    print(multilabel_confusion_matrix(Y_test, Y_Pred))
    if best_precision < precision_score(Y_test, Y_Pred, average='macro'):
        best_precision = precision_score(Y_test, Y_Pred, average='macro')
        Best_Tree = Tree  
    precision.append(precision_score(Y_test, Y_Pred, average='macro'))
    recall.append(recall_score(Y_test, Y_Pred, average='macro'))
    print ("Current validation accuracy: %0.5f" % Validation_score)
print ("Best validation accuracy: %0.5f" % Best_validation_score)
print ("Best log loss: %0.5f" % best_loss)

#Y_pred = Best_Tree.predict(X_test)
#multilabel_confusion_matrix(Y_test, Y_pred)   
print(precision)
print(recall)


Current training accuracy: 0.64387
[[[4528   23]
  [ 362   23]]

 [[ 519 1013]
  [ 314 3090]]

 [[3773   16]
  [1130   17]]]
Current validation accuracy: 0.63331
Current training accuracy: 0.64409
[[[4524   42]
  [ 319   51]]

 [[ 478  983]
  [ 315 3160]]

 [[3825   20]
  [1076   15]]]
Current validation accuracy: 0.65032
Current training accuracy: 0.60398
[[[4523   20]
  [ 370   22]]

 [[ 646  899]
  [ 511 2879]]

 [[3782    0]
  [1153    0]]]
Current validation accuracy: 0.58744


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.61141
[[[4534   32]
  [ 325   44]]

 [[ 621  896]
  [ 429 2989]]

 [[3785    2]
  [1148    0]]]
Current validation accuracy: 0.61398
Current training accuracy: 0.64786
[[[4552   19]
  [ 340   24]]

 [[ 439 1016]
  [ 269 3211]]

 [[3843    1]
  [1090    1]]]
Current validation accuracy: 0.65471
Current training accuracy: 0.63088
[[[4521   21]
  [ 362   31]]

 [[ 517  989]
  [ 366 3063]]

 [[3792   30]
  [1095   18]]]
Current validation accuracy: 0.62533
Current training accuracy: 0.63350
[[[4516   23]
  [ 351   45]]

 [[ 583  973]
  [ 327 3052]]

 [[3775    0]
  [1160    0]]]
Current validation accuracy: 0.62756


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.61031
[[[4511   32]
  [ 343   49]]

 [[ 594  881]
  [ 476 2984]]

 [[3852    0]
  [1083    0]]]
Current validation accuracy: 0.61378


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.60344
[[[4517   38]
  [ 343   37]]

 [[ 658  867]
  [ 516 2894]]

 [[3790    0]
  [1145    0]]]
Current validation accuracy: 0.59291


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.62546
[[[4498   39]
  [ 359   39]]

 [[ 546  950]
  [ 398 3041]]

 [[3837    0]
  [1098    0]]]
Current validation accuracy: 0.62391


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.64324
[[[4493   41]
  [ 368   34]]

 [[ 492 1015]
  [ 295 3134]]

 [[3823    8]
  [1101    4]]]
Current validation accuracy: 0.64040
Current training accuracy: 0.61662
[[[4537   30]
  [ 331   38]]

 [[ 651  889]
  [ 435 2961]]

 [[3765    0]
  [1171    0]]]
Current validation accuracy: 0.60737


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.60639
[[[4503   31]
  [ 355   46]]

 [[ 642  871]
  [ 494 2928]]

 [[3820    3]
  [1111    1]]]
Current validation accuracy: 0.60142
Current training accuracy: 0.64126
[[[4535   31]
  [ 331   38]]

 [[ 479 1014]
  [ 281 3161]]

 [[3809    2]
  [1122    2]]]
Current validation accuracy: 0.64762
Current training accuracy: 0.61848
[[[4534   23]
  [ 349   29]]

 [[ 626  876]
  [ 438 2995]]

 [[3811    0]
  [1124    0]]]
Current validation accuracy: 0.61216


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.64282
[[[4495   26]
  [ 368   46]]

 [[ 506  969]
  [ 310 3150]]

 [[3835   39]
  [1039   22]]]
Current validation accuracy: 0.64620
Current training accuracy: 0.64640
[[[4541   34]
  [ 329   31]]

 [[ 463 1075]
  [ 297 3100]]

 [[3757    0]
  [1178    0]]]
Current validation accuracy: 0.63343


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.63966
[[[4522   35]
  [ 344   34]]

 [[ 510  989]
  [ 336 3100]]

 [[3786   28]
  [1107   14]]]
Current validation accuracy: 0.63749
Current training accuracy: 0.59786
[[[4521   16]
  [ 367   31]]

 [[ 694  852]
  [ 546 2843]]

 [[3781    6]
  [1143    5]]]
Current validation accuracy: 0.58278
Current training accuracy: 0.61988
[[[4534   31]
  [ 322   48]]

 [[ 601  854]
  [ 416 3064]]

 [[3835   15]
  [1070   15]]]
Current validation accuracy: 0.63283
Current training accuracy: 0.64526
[[[4533   35]
  [ 330   38]]

 [[ 485  997]
  [ 315 3139]]

 [[3822    0]
  [1114    0]]]
Current validation accuracy: 0.64344


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.59343
[[[4492   41]
  [ 357   46]]

 [[ 724  792]
  [ 589 2831]]

 [[3823    0]
  [1113    0]]]
Current validation accuracy: 0.58225


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.60513
[[[4556   23]
  [ 319   37]]

 [[ 684  796]
  [ 510 2945]]

 [[3793   18]
  [1110   14]]]
Current validation accuracy: 0.60628
Current training accuracy: 0.64286
[[[4523   16]
  [ 367   29]]

 [[ 483 1059]
  [ 270 3123]]

 [[3785    4]
  [1146    0]]]
Current validation accuracy: 0.63749
Current training accuracy: 0.64104
[[[4496   31]
  [ 367   41]]

 [[ 544 1035]
  [ 325 3031]]

 [[3764    0]
  [1171    0]]]
Current validation accuracy: 0.62209


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.59903
[[[4519   40]
  [ 314   62]]

 [[ 679  827]
  [ 531 2898]]

 [[3800    5]
  [1130    0]]]
Current validation accuracy: 0.59635
Current training accuracy: 0.65795
[[[4528   28]
  [ 350   29]]

 [[ 394 1073]
  [ 200 3268]]

 [[3840    7]
  [1084    4]]]
Current validation accuracy: 0.66687
Current training accuracy: 0.64111
[[[4508   53]
  [ 327   47]]

 [[ 513  950]
  [ 298 3174]]

 [[3840    6]
  [1086    3]]]
Current validation accuracy: 0.65046
Current training accuracy: 0.64239
[[[4503   35]
  [ 350   47]]

 [[ 546 1012]
  [ 320 3057]]

 [[3754   20]
  [1139   22]]]
Current validation accuracy: 0.63161
Current training accuracy: 0.60209
[[[4510   43]
  [ 334   48]]

 [[ 645  830]
  [ 529 2931]]

 [[3837    5]
  [1090    3]]]
Current validation accuracy: 0.60304
Current training accuracy: 0.64792
[[[4507   26]
  [ 365   38]]

 [[ 516 1087]
  [ 282 3051]]

 [[3726   10]
  [1188   12]]]
Current validation accuracy: 0.62743
Current training accuracy: 0

  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.63874
[[[4509   52]
  [ 324   50]]

 [[ 481  987]
  [ 343 3124]]

 [[3836    5]
  [1091    3]]]
Current validation accuracy: 0.64195
Current training accuracy: 0.59504
[[[4501   34]
  [ 362   38]]

 [[ 698  795]
  [ 535 2907]]

 [[3842    0]
  [1093    0]]]
Current validation accuracy: 0.59615


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.64791
[[[4523   45]
  [ 318   49]]

 [[ 467 1051]
  [ 289 3128]]

 [[3779    5]
  [1147    4]]]
Current validation accuracy: 0.63951
Current training accuracy: 0.66087
[[[4525   37]
  [ 349   25]]

 [[ 424 1069]
  [ 234 3209]]

 [[3817    0]
  [1119    0]]]
Current validation accuracy: 0.65458


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.64380
[[[4528   19]
  [ 355   34]]

 [[ 478 1051]
  [ 282 3125]]

 [[3778   18]
  [1115   25]]]
Current validation accuracy: 0.64344
Current training accuracy: 0.61454
[[[4508   37]
  [ 344   46]]

 [[ 606  901]
  [ 455 2973]]

 [[3792   26]
  [1101   16]]]
Current validation accuracy: 0.61155
Current training accuracy: 0.64955
[[[4537   41]
  [ 324   33]]

 [[ 496 1064]
  [ 328 3047]]

 [[3732    0]
  [1203    0]]]
Current validation accuracy: 0.62290


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.63575
[[[4551   23]
  [ 326   35]]

 [[ 529  962]
  [ 323 3121]]

 [[3787   18]
  [1112   18]]]
Current validation accuracy: 0.64255
Current training accuracy: 0.62953
[[[4505   16]
  [ 394   20]]

 [[ 531  958]
  [ 365 3081]]

 [[3858    2]
  [1074    1]]]
Current validation accuracy: 0.62837
Current training accuracy: 0.59124
[[[4515   19]
  [ 371   30]]

 [[ 711  768]
  [ 513 2943]]

 [[3857    0]
  [1078    0]]]
Current validation accuracy: 0.60223


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.65088
[[[4535   36]
  [ 309   55]]

 [[ 452 1045]
  [ 267 3171]]

 [[3802    0]
  [1133    0]]]
Current validation accuracy: 0.65289


  'precision', 'predicted', average, warn_for)


Current training accuracy: 0.64646
[[[4503   26]
  [ 356   50]]

 [[ 474 1030]
  [ 306 3125]]

 [[3819   18]
  [1084   14]]]
Current validation accuracy: 0.64316
Current training accuracy: 0.63021
[[[4526   26]
  [ 360   23]]

 [[ 557  962]
  [ 362 3054]]

 [[3799    0]
  [1136    0]]]
Current validation accuracy: 0.62310
Best validation accuracy: 0.66687
Best log loss: 0.66905
[0.5894196658271725, 0.5798969483069364, 0.4286176443402489, 0.4494389125968074, 0.6059266472638978, 0.5756922823803376, 0.47334185848252347, 0.45899839756387123, 0.42093651215693223, 0.420654806648292, 0.5140098015586084, 0.4426381461675579, 0.5393772455499226, 0.6026167953947178, 0.4437980645033086, 0.5880977840041993, 0.40647934899431903, 0.5280728430111186, 0.6278460184140661, 0.6298755285030897, 0.4264979288660431, 0.43671075493626815, 0.6137964448008554, 0.46373877464264845, 0.4382981727423439, 0.46194323814537, 0.5417434079639004, 0.5243248194848583, 0.6160901662732878, 0.560595513234439, 0.62550575230311

  'precision', 'predicted', average, warn_for)


In [9]:
Y_Prediction=Best_Tree.predict_proba(x_test)
test['listing_id'] = test['listing_id'].astype(int)
ids = test['listing_id']
labels = pd.DataFrame(Y_Prediction)
labels.reset_index(drop=True, inplace=True)
ids.reset_index(drop=True, inplace=True)
out = pd.concat([ids, labels], axis=1, ignore_index=True)
out.columns=['listing_id','high','low','medium']
out

Unnamed: 0,listing_id,high,low,medium
0,7142618,0.019887,0.848758,0.234660
1,7210040,0.165278,0.482436,0.226322
2,7174566,0.054809,0.666523,0.190476
3,7191391,0.165278,0.482436,0.504732
4,7171695,0.019887,0.848758,0.224175
...,...,...,...,...
74654,6928108,0.413793,0.304584,0.290961
74655,6906674,0.105691,0.561784,0.274501
74656,6897967,0.054809,0.666523,0.190476
74657,6842183,0.049593,0.877449,0.043655


In [10]:
out.to_csv('submission.csv',index = False)

In [11]:
out['listing_id'].dtype

dtype('int32')