In [2]:
import math
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neural_network import MLPClassifier
%matplotlib inline

In [3]:
df_train = pd.read_csv('../Kaggle_Datasets/Facebook/train.csv')
df_test = pd.read_csv('../Kaggle_Datasets/Facebook/test.csv')

In [4]:
mini_df = df_train[(df_train.x > 1.0) & (df_train.x < 1.5) & (df_train.y > 2.0) & (df_train.y < 2.25)]

In [5]:
def apply_accu_class(x):
    rise_ranges = ((50, 90), (130, 160), (220, 310), (380, 600))
    fall_ranges = ((0, 40), (110, 120), (190, 210), (320, 370), )
    peak_ranges = ((90, 110), (160, 190), (310, 320), )
    trough_ranges = ((40, 50), (120, 130), (210, 220), (370, 380))
    class_dict = {rise_ranges: 1, fall_ranges: -1, peak_ranges: 2, trough_ranges: -2}
    for rang in [rise_ranges, fall_ranges, peak_ranges, trough_ranges]:
        for a, b in rang:
            if a <= x < b:
                return class_dict[rang]

In [6]:
def mod_df(mini_df):
    mini_df.loc[:, 'hour'] = mini_df.time // 60 % 24 + 1
    mini_df.loc[:, 'day'] = mini_df.time // (60*24) % 7 + 1
    mini_df.loc[:, 'week'] = mini_df.time // (60*24*7) % 52 + 1
    mini_df.loc[:, 'month'] = mini_df.time // (60*24*30) % 12 + 1
    mini_df.loc[:, 'year'] = mini_df.time // (60*24*365) + 1

    mini_df.loc[:, 'hours'] = mini_df.time // 60 + 1
    mini_df.loc[:, 'days'] = mini_df.time // (60*24) + 1
    mini_df.loc[:, 'weeks'] = mini_df.time // (60*24*7) + 1
    mini_df.loc[:, 'months'] = mini_df.time // (60*24*30) + 1
    mini_df.loc[:, 'years'] = mini_df.time // (60*24*365) + 1
    
    mini_df.loc[:, 'hour_float'] = mini_df.time / 60 % 24 + 1
    mini_df.loc[:, 'day_float'] = mini_df.time / (60*24) % 7 + 1
    mini_df.loc[:, 'week_float'] = mini_df.time / (60*24*7) % 52 + 1
    mini_df.loc[:, 'month_float'] = mini_df.time / (60*24*30) % 12 + 1
    mini_df.loc[:, 'year_float'] = mini_df.time / (60*24*365) + 1

    mini_df.loc[:, 'accu_class'] = mini_df.days.apply(apply_accu_class)
    mini_df.loc[:, 'log2_accuracy'] = np.log2(mini_df.accuracy) * 10
    mini_df.loc[:, 'log10_accuracy'] = np.log10(mini_df.accuracy) * 10
    mini_df.loc[:, 'log2_accuracy_int'] = (np.log2(mini_df.accuracy) * 10).astype(int)
    mini_df.loc[:, 'log10_accuracy_int'] = (np.log10(mini_df.accuracy) * 10).astype(int)
    
    return mini_df

In [7]:
mini_df = mod_df(mini_df)
mini_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour,day,week,month,...,hour_float,day_float,week_float,month_float,year_float,accu_class,log2_accuracy,log10_accuracy,log2_accuracy_int,log10_accuracy_int
1420,1420,1.2804,2.2015,11,315689,7316184998,6,3,32,8,...,6.483333,3.228472,32.318353,8.307616,1.600626,1,34.594316,10.413927,34,10
1753,1753,1.2712,2.1926,56,470758,1110401599,22,5,47,11,...,22.966667,5.915278,47.702183,11.897176,1.895658,-1,58.073549,17.48188,58,17
2243,2243,1.3021,2.0379,66,366243,6235551797,9,3,37,9,...,9.05,3.335417,37.333631,9.477847,1.696809,1,60.443941,18.195439,60,18
2254,2254,1.4721,2.1028,50,316873,7031210677,2,4,32,8,...,2.216667,4.050694,32.435813,8.335023,1.602879,1,56.438562,16.9897,56,16
2527,2527,1.3758,2.0215,73,216724,6696283507,13,4,22,6,...,13.066667,4.502778,22.500397,6.016759,1.412336,1,61.898246,18.633229,61,18


In [8]:
mini_df.columns

Index([u'row_id', u'x', u'y', u'accuracy', u'time', u'place_id', u'hour',
       u'day', u'week', u'month', u'year', u'hours', u'days', u'weeks',
       u'months', u'years', u'hour_float', u'day_float', u'week_float',
       u'month_float', u'year_float', u'accu_class', u'log2_accuracy',
       u'log10_accuracy', u'log2_accuracy_int', u'log10_accuracy_int'],
      dtype='object')

In [9]:
len(mini_df.place_id.unique())

1215

In [10]:
train, test = train_test_split(mini_df, random_state=88)

In [11]:
attributes = [u'x', u'y', u'accuracy', u'time', u'hour', u'day', u'week', u'month',
       u'year', u'log2_accuracy', u'log10_accuracy',]

In [12]:
model = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [13]:
model.fit(train.loc[:, attributes], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [14]:
model.score(test.loc[:, attributes], test.place_id)

0.53307538691962053

In [15]:
# Using hours, days... instead of hour, day...
attributes_2 = [u'x', u'y', u'accuracy', u'time', u'hours', u'days', u'weeks', u'months',
       u'year', u'log2_accuracy', u'log10_accuracy',]

In [16]:
model_2 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [17]:
model_2.fit(train.loc[:, attributes_2], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [18]:
model_2.score(test.loc[:, attributes_2], test.place_id)

0.43609585621567648

In [22]:
# Removing log2 and log10 accuracy
attributes_3 = [u'x', u'y', u'time', u'hour', u'day', u'week', u'month',
       u'year', u'accuracy',]

In [23]:
model_3 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [24]:
model_3.fit(train.loc[:, attributes_3], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [25]:
model_3.score(test.loc[:, attributes_3], test.place_id)

0.54418372441337992

In [26]:
# Using only log10 instead of accuracy
attributes_4 = [u'x', u'y', u'time', u'hour', u'day', u'week', u'month',
       u'year', u'log10_accuracy',]

In [27]:
model_4 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [28]:
model_4.fit(train.loc[:, attributes_4], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [29]:
model_4.score(test.loc[:, attributes_4], test.place_id)

0.54430853719420869

In [30]:
# Using only log2 instead of accuracy
attributes_4a = [u'x', u'y', u'time', u'hour', u'day', u'week', u'month',
       u'year', u'log2_accuracy',]

In [31]:
model_4a = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [32]:
model_4a.fit(train.loc[:, attributes_4a], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [33]:
model_4a.score(test.loc[:, attributes_4a], test.place_id)

0.54430853719420869

In [34]:
# Removing time
attributes_5 = [u'x', u'y', u'hour', u'day', u'week', u'month',
       u'year', u'log10_accuracy',]

In [35]:
model_5 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [36]:
model_5.fit(train.loc[:, attributes_5], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [37]:
model_5.score(test.loc[:, attributes_5], test.place_id)

0.53132800798801794

In [28]:
# Using float day, week and month instead of rounded ints
attributes_6 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', u'log2_accuracy',]

In [39]:
model_6 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [40]:
model_6.fit(train.loc[:, attributes_6], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [41]:
model_6.score(test.loc[:, attributes_6], test.place_id)

0.54655516724912634

In [46]:
# Using int accuracy
attributes_7 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', u'log2_accuracy_int',]

In [47]:
model_7 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [48]:
model_7.fit(train.loc[:, attributes_7], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [49]:
model_7.score(test.loc[:, attributes_7], test.place_id)

0.54530703944083869

In [21]:
# Using all three accuracies
attributes_8 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', u'accuracy', 'log2_accuracy', 'log10_accuracy']

In [22]:
model_8 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [23]:
model_8.fit(train.loc[:, attributes_8], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [24]:
model_8.score(test.loc[:, attributes_8], test.place_id)

0.54068896655017473

In [25]:
# Using float and integer values for day, hour, time e.t.c
attributes_9 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', 'log2_accuracy', 'hour', 'day', 'week', 'month']

In [60]:
model_9 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [61]:
model_9.fit(train.loc[:, attributes_9], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [63]:
model_9.score(test.loc[:, attributes_9], test.place_id)

0.532825761357963

In [27]:
# Adding accu_class
attributes_10 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', u'log2_accuracy', 'accu_class']

In [29]:
model_10 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [30]:
model_10.fit(train.loc[:, attributes_10], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [32]:
model_10.score(test.loc[:, attributes_10], test.place_id)

0.54106340489266103