In [21]:
# Making all the necessary imports for further processing 
import numpy as np
import pandas as pd
import heapq
from collections import Counter
import math as mt

In [22]:
# Read the training as well as testing files
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [23]:
# Extract the Year as well as month feature from the given dataset to be used for future predictions
data['Date'] = data['Date'].astype(str)
data['Year'] = data['Date'].str[0:4]
data['Month'] = data['Date'].str[4:6]

In [24]:
# Convert the datatypes of the given Month and Year feature to integer types
data['Month'] = data['Month'].astype(int)
data['Year'] = data['Year'].astype(int)

In [25]:
# For two different years get count of the month from start and doing this would ensure that the effect of both
# the month and year are incorporated in our model
data['Month'][data['Year']==2012] = 12+ (data['Month'][data['Year']==2012])
data['Month'][data['Year']==2013] = 24+ (data['Month'][data['Year']==2013])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [26]:
#See the first five values of our dataset
data.head()

Unnamed: 0,PID,Date,Event,Year,Month
0,1028890,201101,2186,2011,1
1,1028890,201101,7087,2011,1
2,1028890,201101,4848,2011,1
3,1028890,201101,2214,2011,1
4,1028890,201102,7087,2011,2


In [27]:
#Assign weights to the model exponentially.This would ensure that the recent events are given more weight than the previous
#events.This general trend was obseved in the dataset that the last values were found to be more in repeatition than the 
#previous observations in last (say last 50 events) like in above "1028890" where initially 7087 was found to be repeated 
#many times initially thus increasing its importance but as the sequence increases with time its effect is diluted and new events
#tend to affect patient more frequently than the previuos events.I also tried various other metrics like linear weight allocation
#power based weight allocation etc, the score of exponentialwith log based weight allocation were highest and hence I decided to finally 
#choose it.
data['Weighted_factor_expolog'] = data['Month'].apply(lambda x: (mt.log(x)*mt.exp(x)))

In [28]:
# Create a new dataframe and group weight according to Patient ID and Event
data_new = pd.DataFrame()
data_new = (data['Weighted_factor_expolog'].groupby([data['PID'],data['Event']]).sum()).reset_index()

In [29]:
# Group the data according to patient id and arrange it in form of lists of weights and events
data_new = data_new.groupby('PID').agg(lambda x: x.tolist())

In [30]:
# Reset index to numerical from 'PID' 
data_new2 = data_new.reset_index()

In [31]:
data_new2.head()

Unnamed: 0,PID,Event,Weighted_factor_expolog
0,1000001,"[1579, 1797, 1809, 1817, 2117, 2178, 2204, 222...","[238.861764964, 405876.158204, 3.65376234221e+..."
1,1000011,"[0090, 1103, 1121, 1579, 1934, 1955, 1958, 196...","[637705962868.0, 4015164094.86, 1134767.94557,..."
2,1000019,"[2664, 2670, 2689, 2780, 2912, 3000, 3004, 311...","[3.63468093659e+13, 3.63468093659e+13, 2.05749..."
3,1000025,"[1105, 1806, 1890, 1905, 2110, 2111, 2113, 216...","[0.0, 11606650177.0, 2.05749645649e+15, 317374..."
4,1000029,"[1101, 1142, 1172, 1204, 1579, 2113, 2253, 250...","[24637529.8137, 2.05749645647e+15, 24637529.81..."


In [32]:
#Create a new column with blank values
data_new2['Subtop10'] = ''

In [33]:
data_new2.head()

Unnamed: 0,PID,Event,Weighted_factor_expolog,Subtop10
0,1000001,"[1579, 1797, 1809, 1817, 2117, 2178, 2204, 222...","[238.861764964, 405876.158204, 3.65376234221e+...",
1,1000011,"[0090, 1103, 1121, 1579, 1934, 1955, 1958, 196...","[637705962868.0, 4015164094.86, 1134767.94557,...",
2,1000019,"[2664, 2670, 2689, 2780, 2912, 3000, 3004, 311...","[3.63468093659e+13, 3.63468093659e+13, 2.05749...",
3,1000025,"[1105, 1806, 1890, 1905, 2110, 2111, 2113, 216...","[0.0, 11606650177.0, 2.05749645649e+15, 317374...",
4,1000029,"[1101, 1142, 1172, 1204, 1579, 2113, 2253, 250...","[24637529.8137, 2.05749645647e+15, 24637529.81...",


In [34]:
# For each of the patient id create a dictionary and then fill subtop column 
for i in range(len(data_new2)):
    keys = data_new.iloc[i][0]
    values = data_new.iloc[i][1]
    dictionary = dict(zip(keys, values))
    top10 = heapq.nlargest(10, dictionary, key=dictionary.get)
    data_new2["Subtop10"].iloc[i] = top10

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [35]:
# Used this code from ZS starter script in python 
for i in range(1,11):
    data_new2["Event" + str(i)] = data_new2["Subtop10"].apply(lambda x: x[i-1])

In [36]:
data_new2.head()

Unnamed: 0,PID,Event,Weighted_factor_expolog,Subtop10,Event1,Event2,Event3,Event4,Event5,Event6,Event7,Event8,Event9,Event10
0,1000001,"[1579, 1797, 1809, 1817, 2117, 2178, 2204, 222...","[238.861764964, 405876.158204, 3.65376234221e+...","[1809, 2605, 7132, 2382, 4848, 2632, 2761, 379...",1809,2605,7132,2382,4848,2632,2761,3799,3222,3620
1,1000011,"[0090, 1103, 1121, 1579, 1934, 1955, 1958, 196...","[637705962868.0, 4015164094.86, 1134767.94557,...","[8502, 8680, 8295, V221, 2902, 3717, 3472, 408...",8502,8680,8295,V221,2902,3717,3472,4084,3217,3033
2,1000019,"[2664, 2670, 2689, 2780, 2912, 3000, 3004, 311...","[3.63468093659e+13, 3.63468093659e+13, 2.05749...","[9920, 9191, 3641, V700, 8006, 9938, 9047, 845...",9920,9191,3641,V700,8006,9938,9047,8455,8372,8230
3,1000025,"[1105, 1806, 1890, 1905, 2110, 2111, 2113, 216...","[0.0, 11606650177.0, 2.05749645649e+15, 317374...","[3641, 8260, 2674, 2635, V700, 2525, 5990, 713...",3641,8260,2674,2635,V700,2525,5990,7132,3419,3397
4,1000029,"[1101, 1142, 1172, 1204, 1579, 2113, 2253, 250...","[24637529.8137, 2.05749645647e+15, 24637529.81...","[3638, 2533, 9921, 4011, 3051, 3616, 7902, 719...",3638,2533,9921,4011,3051,3616,7902,7194,7871,1579


In [37]:
#Deleting all unnecessary rows
del data_new2['Event']
del data_new2['Weighted_factor_expolog']
del data_new2['Subtop10']

In [39]:
data_new2.head()

Unnamed: 0,PID,Event1,Event2,Event3,Event4,Event5,Event6,Event7,Event8,Event9,Event10
0,1000001,1809,2605,7132,2382,4848,2632,2761,3799,3222,3620
1,1000011,8502,8680,8295,V221,2902,3717,3472,4084,3217,3033
2,1000019,9920,9191,3641,V700,8006,9938,9047,8455,8372,8230
3,1000025,3641,8260,2674,2635,V700,2525,5990,7132,3419,3397
4,1000029,3638,2533,9921,4011,3051,3616,7902,7194,7871,1579


In [41]:
del data_new2['Event8']
del data_new2['Event9']
del data_new2['Event10']

In [42]:
#Repredicting values of event 8 by replacing it with Event 1 as event 1 is more prominent.I would say this was just a random 
#card that somehow worked to increase the score of the metric.
data_new2['Event8'] = data_new2['Event1']
data_new2['Event9'] = data_new2['Event2']
data_new2['Event10'] = data_new2['Event3']

In [43]:
data_new2.to_csv("NishantRaj.csv",index = False)

# NDCG FUNCTION FOR CROSS-VALIDATION

In [45]:
import math
import numpy as np

In [46]:
predicted=[1398,1111,2222, 3333, 4444, 5555, 6666, 7777, 8888, 9999]
test=[1398, 1115, 2227, 3323, 4494, 5355, 6686, 7477, 8388, 9989]
def ndcg_scorer(y_pred,y_test):

    score_mat=[0]*10
    y_pred=list(np.asfarray(y_pred))
    y_test=list(np.asfarray(y_test))
    for i in range(10):
        if y_pred[i]==y_test[i]:
            score_mat[i]=1
        elif y_pred[i] in y_test:
            pos=y_test.index(y_pred[i])
            score_mat[i]=min(((pos+1)/(i+1)),round((i+1)/(pos+1)))
        else:
            score_mat[i]=0
    rel=np.asfarray(score_mat)
    print(rel)
    score=0
    for i in range(10):
        score+= (pow(2,rel[i])-1)/(math.log((i+2),2))
    return score/4.5422
print (ndcg_scorer(predicted,test))

[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
0.220157632865
