In [1]:
# Used to read the Parquet data
import pyarrow.parquet as parquet
# Used to train the baseline model
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

from sklearn.utils import shuffle

import pandas as pd


import os
# redirect output 
# sys.stdout = open("out.txt", "w")


# Load data

In [2]:

# Where the downloaded data are
input_path = '/home/faraon/data/collabTrain/'
# Where to store results
output_path = './'


In [3]:
# Read all day to train model on as Pandas dataframe

dirs = ["date=2018-02-01"]

data = pd.DataFrame({})
for file in dirs: 
    temp_data = parquet.read_table(input_path + file).to_pandas()
    data = pd.concat([data, temp_data])

In [4]:
data.head()

Unnamed: 0,instanceId_userId,instanceId_objectType,instanceId_objectId,audit_pos,audit_clientType,audit_timestamp,audit_timePassed,audit_experiment,audit_resourceType,metadata_ownerId,...,auditweights_userOwner_USER_INTERNAL_UNLIKE,auditweights_userOwner_USER_PRESENT_SEND,auditweights_userOwner_USER_PROFILE_VIEW,auditweights_userOwner_USER_SEND_MESSAGE,auditweights_userOwner_USER_STATUS_COMMENT_CREATE,auditweights_userOwner_VIDEO,auditweights_userOwner_VOTE_POLL,auditweights_x_ActorsRelations,auditweights_likersSvd_spark_hyper,auditweights_source_PROMO
0,138,Post,16788021,1,API,1517512273619,3651351,XPRM-5386_q2x3,8,2154,...,,,,,,,,,,
1,222,Post,23852723,33,WEB,1517495744978,3937907,XPRM-5386_G2,8,23080,...,,,,,,,,,,
2,384,Post,12454221,15,WEB,1517508395567,1517508395567,XPRM-5386_q2x2,8,39056,...,,,,,,,,,,
3,666,Post,18335103,13,WEB,1517486842005,1517486842005,XPRM-5386_G0,8,69277,...,,,,,,,,,,
4,1209,Post,22920031,2,MOB,1517498414790,2031754,XPRM-5386_G1,8,23941,...,,,,,,,,1024.0,,


In [5]:

def feedback_to_float(x):
    res = []
    feeddict = {
        "Commented": 0,
        "ReShared": 0,
        "Liked": 1,
        "Clicked": 0,
        "Ignored": 0,
        "Unliked": 0,
        "Complaint": 0,
        "Disliked": 0,  
        "Viewed": 0
    }
    for feed in x:
        res.append(feeddict[feed])

    return np.array(res).mean().astype(int)



# Construct the label (liked objects)
data['liked'] = data['feedback'].apply(feedback_to_float)


In [6]:
# features = data.columns.values

# Select some features 

In [7]:

# instanceId_userId
# liked 
selected_features = ['userOwnerCounters_USER_FEED_REMOVE', 
                     'userOwnerCounters_CREATE_IMAGE', 'userOwnerCounters_VIDEO',  'user_is_active', 
                     'auditweights_feedOwner_RECOMMENDED_GROUP', 'auditweights_svd_prelaunch',
                     'auditweights_ctr_high', 'auditweights_friendLikes', 'auditweights_ctr_gender'
                    ]

In [8]:
exetended_selected_features = selected_features.copy()
exetended_selected_features.append('liked')
exetended_selected_features.append('instanceId_userId')

selected_data = data[exetended_selected_features]

In [9]:

selected_data.head(20)

Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE,userOwnerCounters_CREATE_IMAGE,userOwnerCounters_VIDEO,user_is_active,auditweights_feedOwner_RECOMMENDED_GROUP,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,liked,instanceId_userId
0,0.0,0.0,0.0,1.0,,0.60316,0.065857,,0.009836,0,138
1,0.0,0.0,0.0,1.0,,0.675362,0.135539,,0.023984,0,222
2,0.0,0.0,0.0,1.0,,0.761416,0.060081,,0.009776,0,384
3,,,,1.0,,0.851362,0.038051,,0.00266,0,666
4,,,,1.0,,0.887637,0.194643,,0.005013,0,1209
5,,,,1.0,,0.702343,0.172163,1.0,0.016191,0,3012
6,0.0,0.0,0.0,1.0,,0.690506,0.146267,1.0,0.00574,0,3273
7,0.0,0.0,0.0,1.0,,0.761575,0.127646,1.0,0.022038,1,4092
8,0.0,0.0,0.0,1.0,,0.82062,0.086106,1.0,0.013726,0,4092
9,0.0,0.0,0.0,1.0,,0.245339,0.054828,,0.004651,0,4395


# Transform data into pairs

In [10]:
def transform_data(data):
    users_data = data.groupby('instanceId_userId')
    
    df_results = pd.DataFrame({})

    for user_data in users_data:
        user_data = user_data[1]

        user_data_liked = user_data[user_data['liked'] == 1]
        user_data_disliked = user_data[user_data['liked'] == 0]

        if user_data_liked.shape[0] == 0 or user_data_disliked.shape[0] == 0:
             continue

        # generate
        data_liked = user_data_liked[selected_features]
        data_disliked = user_data_disliked[selected_features]
        
        data_liked['key'] = 0
        data_disliked['key'] = 0
        
        temp_data = pd.merge(data_liked, data_disliked, on='key', how='inner')
        temp_data.drop('key',1, inplace=True)
        
        df_results =  pd.concat([df_results, temp_data])
        
                
    return df_results


# Binarize pairs

In [11]:
def binarize(data, features):
    result_frame = pd.DataFrame({})
    
    for feature in features:
        a = data[feature + "_x"] >= data[feature + "_y"]
        a = a.to_frame()
        a.colunds = [feature]
        result_frame = pd.concat([result_frame, a], axis=1, sort=False)    
    return result_frame

    

# Transform and save transformed data

In [12]:
tranformed_data = transform_data(selected_data)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
tranformed_data = tranformed_data.fillna(0.0)

In [15]:
tranformed_data

Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE_x,userOwnerCounters_CREATE_IMAGE_x,userOwnerCounters_VIDEO_x,user_is_active_x,auditweights_feedOwner_RECOMMENDED_GROUP_x,auditweights_svd_prelaunch_x,auditweights_ctr_high_x,auditweights_friendLikes_x,auditweights_ctr_gender_x,userOwnerCounters_USER_FEED_REMOVE_y,userOwnerCounters_CREATE_IMAGE_y,userOwnerCounters_VIDEO_y,user_is_active_y,auditweights_feedOwner_RECOMMENDED_GROUP_y,auditweights_svd_prelaunch_y,auditweights_ctr_high_y,auditweights_friendLikes_y,auditweights_ctr_gender_y
0,0.000000,0.0,0.000000,1.0,0.0,0.847771,0.142697,1.0,0.019438,0.000000,0.0,0.0,1.0,0.0,0.856120,0.083528,1.0,0.003679
1,0.000000,0.0,0.000000,1.0,0.0,0.847771,0.142697,1.0,0.019438,0.000000,0.0,0.0,1.0,0.0,0.719209,0.071965,1.0,0.003981
2,0.000000,0.0,0.000000,1.0,0.0,0.847771,0.142697,1.0,0.019438,0.000000,0.0,0.0,1.0,0.0,0.912177,0.139855,1.0,0.017423
0,0.000000,0.0,0.000000,1.0,0.0,0.761575,0.127646,1.0,0.022038,0.000000,0.0,0.0,1.0,0.0,0.820620,0.086106,1.0,0.013726
0,0.000000,0.0,0.000000,1.0,0.0,0.846524,0.116270,1.0,0.010827,0.000000,0.0,0.0,1.0,0.0,0.605460,0.045281,0.0,0.006447
0,0.000000,0.0,0.000000,1.0,0.0,0.941127,0.120096,1.0,0.021854,0.000000,0.0,0.0,1.0,0.0,0.188079,0.002853,0.0,0.000232
0,0.000000,0.0,0.000000,1.0,0.0,0.591711,0.130765,0.0,0.024730,0.000000,0.0,0.0,1.0,0.0,0.905515,0.072718,0.0,0.013827
1,0.000000,0.0,0.000000,1.0,0.0,0.591711,0.130765,0.0,0.024730,0.000000,0.0,0.0,1.0,0.0,0.878543,0.071014,0.0,0.009759
2,0.000000,0.0,0.000000,1.0,0.0,0.591711,0.130765,0.0,0.024730,0.000000,0.0,0.0,1.0,0.0,0.666558,0.137661,0.0,0.025557
0,0.000000,0.0,0.000000,1.0,0.0,0.730241,0.069272,1.0,0.012253,0.000000,0.0,0.0,1.0,0.0,0.785698,0.088225,1.0,0.002783


In [16]:
binarized_tranformed_data = binarize(tranformed_data, selected_features)

binarized_tranformed_data

  import sys


Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8
0,True,True,True,True,True,False,True,True,True
1,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,False,True,True,True
0,True,True,True,True,True,False,True,True,True
0,True,True,True,True,True,True,True,True,True
0,True,True,True,True,True,True,True,True,True
0,True,True,True,True,True,False,True,True,True
1,True,True,True,True,True,False,True,True,True
2,True,True,True,True,True,False,False,True,False
0,True,True,True,True,True,False,False,True,True


In [17]:
mask  = np.random.randint(2, size=binarized_tranformed_data.shape[0]).astype(bool)
# reverted_mask = np.logical_not(mask)
print(mask)

binarized_tranformed_data['order'] = True
# binarized_tranformed_data[reverted_mask]['order'] = 0

binarized_tranformed_data[mask] = ~binarized_tranformed_data[mask]

[ True  True False ... False  True  True]


In [18]:
binarized_tranformed_data = binarized_tranformed_data.astype(int)
selected_features

['userOwnerCounters_USER_FEED_REMOVE',
 'userOwnerCounters_CREATE_IMAGE',
 'userOwnerCounters_VIDEO',
 'user_is_active',
 'auditweights_feedOwner_RECOMMENDED_GROUP',
 'auditweights_svd_prelaunch',
 'auditweights_ctr_high',
 'auditweights_friendLikes',
 'auditweights_ctr_gender']

In [23]:
binarized_tranformed_data.columns = [*selected_features, "order"]

binarized_tranformed_data.to_csv("pairs.csv", index=False)
binarized_tranformed_data

Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE,userOwnerCounters_CREATE_IMAGE,userOwnerCounters_VIDEO,user_is_active,auditweights_feedOwner_RECOMMENDED_GROUP,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,order
0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1,1
0,0,0,0,0,0,1,0,0,0,0
0,1,1,1,1,1,1,1,1,1,1
0,1,1,1,1,1,1,1,1,1,1
0,1,1,1,1,1,0,1,1,1,1
1,0,0,0,0,0,1,0,0,0,0
2,1,1,1,1,1,0,0,1,0,1
0,0,0,0,0,0,1,1,0,0,0


 # Read transformed data

In [8]:

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from scipy.stats import spearmanr

tranformed_data = pd.read_csv("pairs.csv")

X = tranformed_data
Y = tranformed_data['order']
X.drop('order',1, inplace=True)

X



Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE,userOwnerCounters_CREATE_IMAGE,userOwnerCounters_VIDEO,user_is_active,auditweights_feedOwner_RECOMMENDED_GROUP,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender
0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,1,1,1
3,0,0,0,0,0,1,0,0,0
4,1,1,1,1,1,1,1,1,1
5,1,1,1,1,1,1,1,1,1
6,1,1,1,1,1,0,1,1,1
7,0,0,0,0,0,1,0,0,0
8,1,1,1,1,1,0,0,1,0
9,0,0,0,0,0,1,1,0,0


In [9]:
X = X.values
Y = Y.values


c = list(zip(X, Y))

shuffle(c)

X, Y = zip(*c)


In [10]:
X = np.array(X)

In [11]:
print(X.shape)
X

(24094, 9)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 1]])

In [12]:
Y = np.array(Y)
print(Y.shape)
Y

(24094,)


array([0, 0, 1, ..., 1, 0, 0])

# Test sklearn  decision tree

In [13]:
from sklearn.utils import shuffle

def split_data(X, Y, p, k):
    N = X.shape[0]
    assert(p <= k)
    assert(X.shape[0] == Y.shape[0])

    b1, b2 = int((N/k) * p), int((N/k) * (p + 1) )
    X_train = np.concatenate((X[0:b1,], X[b2:N,]), axis=0)
    y_train = np.concatenate((Y[0:b1], Y[b2:N]), axis=0)

    X_test = X[b1:b2]
    y_test = Y[b1:b2]

    return X_train, y_train, X_test, y_test

k = 5


In [14]:
import numpy as  np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

for p in range(0, 5):
    X_train, y_train, X_test, y_test = split_data(X, Y, p, k)
     
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    model = DecisionTreeClassifier(criterion="entropy")
    model.fit(X_train, y_train)
    y_train_sc = model.predict_proba(X_train)
    y_train_sc = np.argmax(y_train_sc, axis=1)
    
    y_score = model.predict_proba(X_test)
    y_score = np.argmax(y_score, axis=1)
    
    print(p)
    print("train spearmanr: ", spearmanr(y_train, y_train_sc))
    print("train MSE", mean_squared_error(y_train, y_train_sc))
    print("test spearmanr", spearmanr(y_test, y_score))
    print("test MSE", mean_squared_error(y_test, y_score))
    print("--------------------------------------------")



0
train spearmanr:  SpearmanrResult(correlation=1.0, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=1.0, pvalue=0.0)
test MSE 0.0
--------------------------------------------
1
train spearmanr:  SpearmanrResult(correlation=0.9999999999999998, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=0.9999999999999998, pvalue=0.0)
test MSE 0.0
--------------------------------------------
2
train spearmanr:  SpearmanrResult(correlation=0.9999999999999998, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=1.0, pvalue=0.0)
test MSE 0.0
--------------------------------------------
3
train spearmanr:  SpearmanrResult(correlation=1.0, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=1.0, pvalue=0.0)
test MSE 0.0
--------------------------------------------
4
train spearmanr:  SpearmanrResult(correlation=1.0, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=1.0, pvalue=0.0)
test MSE 0.0
---------------

# Test custom  ID3

In [15]:
import pandas as pd
from ID3 import ID3


In [16]:
def split_dataframe(X, Y, p, k):
    N = X.shape[0]
    assert(p <= k)
    assert(X.shape[0] == Y.shape[0])

    b1, b2 = int((N/k) * p), int((N/k) * (p + 1) )
    
    if p not in [0,k]:
        X_train = pd.concat((X[0:b1], X[b2:N]), axis=0)
    elif p == 0:
        X_train = X[b2:N]
    elif p == k - 1:
        X_train = X[0:b1]
        
    y_train = np.concatenate((Y[0:b1], Y[b2:N]), axis=0)

    X_test = X[b1:b2]
    y_test = Y[b1:b2]

    return X_train, y_train, X_test, y_test

k = 5


In [35]:
for p in range(0, 5):
    X_train, y_train, X_test, y_test = split_dataframe(tranformed_data, Y, p, k)
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    root =  ID3(X_train, y_train)
    
    y_train_sc = root.predict_mul(X_train)
    y_score = root.predict_mul(X_test)
    
    print(p)
    print("train spearmanr: ", spearmanr(y_train, y_train_sc))
    print("train MSE", mean_squared_error(y_train, y_train_sc))
    print("test spearmanr", spearmanr(y_test, y_score))
    print("test MSE", mean_squared_error(y_test, y_score))
    print("--------------------------------------------")


0
train spearmanr:  SpearmanrResult(correlation=1.0, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=1.0, pvalue=0.0)
test MSE 0.0
--------------------------------------------
1
train spearmanr:  SpearmanrResult(correlation=0.9999999999999998, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=0.9999999999999998, pvalue=0.0)
test MSE 0.0
--------------------------------------------
2
train spearmanr:  SpearmanrResult(correlation=0.9999999999999998, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=1.0, pvalue=0.0)
test MSE 0.0
--------------------------------------------
3
train spearmanr:  SpearmanrResult(correlation=1.0, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=1.0, pvalue=0.0)
test MSE 0.0
--------------------------------------------
4
train spearmanr:  SpearmanrResult(correlation=1.0, pvalue=0.0)
train MSE 0.0
test spearmanr SpearmanrResult(correlation=1.0, pvalue=0.0)
test MSE 0.0
---------------

# Calc hackathon metric

In [36]:
# Used to read the Parquet data
import pyarrow.parquet as parquet
# Used to train the baseline model
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

from sklearn.utils import shuffle

import pandas as pd
import os




# Where the downloaded data are
input_path = '/home/faraon/data/collabTrain/'
# Where to store results
output_path = './'

dirs = ["date=2018-03-21"]

data = pd.DataFrame({})
for file in dirs: 
    temp_data = parquet.read_table(input_path + file).to_pandas()
    data = pd.concat([data, temp_data])

data['label'] = data['feedback'].apply(lambda x: 1.0 if("Liked" in x) else 0.0)

data = data[ [*selected_features, "instanceId_userId", "instanceId_objectId", "label"] ] 
data

Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE,userOwnerCounters_CREATE_IMAGE,userOwnerCounters_VIDEO,user_is_active,auditweights_feedOwner_RECOMMENDED_GROUP,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,instanceId_userId,instanceId_objectId,label
0,,,,1.0,,,0.048260,1.0,0.000644,441,20596240,0.0
1,0.0,0.000000,0.000000,1.0,,,0.086790,,0.013699,1428,8442021,1.0
2,0.0,0.000000,0.000000,1.0,,,0.146731,,0.018617,2433,34839619,1.0
3,0.0,0.000000,0.000000,1.0,,,0.069550,1.0,0.006115,2694,38571608,1.0
4,0.0,0.000000,0.000000,1.0,,,0.130680,,0.016702,2706,36159634,0.0
5,,,,1.0,,,0.185955,1.0,0.023838,3237,38249708,1.0
6,,,,1.0,,0.415182,0.112180,1.0,0.007865,4212,37935941,0.0
7,,,,1.0,,,0.149359,1.0,0.018663,4974,22545330,0.0
8,,,,1.0,,,0.119953,1.0,0.008056,4974,39073211,0.0
9,,,,1.0,,,0.026249,1.0,0.003599,5028,33540929,0.0


In [37]:
data = data.fillna(0.0)
data


Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE,userOwnerCounters_CREATE_IMAGE,userOwnerCounters_VIDEO,user_is_active,auditweights_feedOwner_RECOMMENDED_GROUP,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,instanceId_userId,instanceId_objectId,label
0,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.048260,1.0,0.000644,441,20596240,0.0
1,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.086790,0.0,0.013699,1428,8442021,1.0
2,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.146731,0.0,0.018617,2433,34839619,1.0
3,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.069550,1.0,0.006115,2694,38571608,1.0
4,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.130680,0.0,0.016702,2706,36159634,0.0
5,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.185955,1.0,0.023838,3237,38249708,1.0
6,0.0,0.000000,0.000000,1.0,0.0,0.415182,0.112180,1.0,0.007865,4212,37935941,0.0
7,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.149359,1.0,0.018663,4974,22545330,0.0
8,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.119953,1.0,0.008056,4974,39073211,0.0
9,0.0,0.000000,0.000000,1.0,0.0,0.000000,0.026249,1.0,0.003599,5028,33540929,0.0


In [38]:
root =  ID3(tranformed_data, Y)

In [40]:
def calc_score(data):
    scores = []
        
    for row1 in data.iterrows():
        count = 0
        for row2 in data.iterrows():
            count += root.predict(row1[1][selected_features] >= row2[1][selected_features])
        scores.append(count)
    data["score"] = np.array(scores)
    
    return data


In [41]:
data.head()
data = data.groupby("instanceId_userId")\
    .apply(calc_score)

In [42]:
data.head()



Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE,userOwnerCounters_CREATE_IMAGE,userOwnerCounters_VIDEO,user_is_active,auditweights_feedOwner_RECOMMENDED_GROUP,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,instanceId_userId,instanceId_objectId,label,score
0,0.0,0.0,0.0,1.0,0.0,0.0,0.04826,1.0,0.000644,441,20596240,0.0,1
1,0.0,0.0,0.0,1.0,0.0,0.0,0.08679,0.0,0.013699,1428,8442021,1.0,1
2,0.0,0.0,0.0,1.0,0.0,0.0,0.146731,0.0,0.018617,2433,34839619,1.0,1
3,0.0,0.0,0.0,1.0,0.0,0.0,0.06955,1.0,0.006115,2694,38571608,1.0,1
4,0.0,0.0,0.0,1.0,0.0,0.0,0.13068,0.0,0.016702,2706,36159634,0.0,1


In [43]:

# Used to calculate metrics
from sklearn.metrics import roc_auc_score

data = data[["instanceId_userId", "score", "label"]]

def auc(labels, scores):
    # This is important! AUC can be computed only when both positive and negative examples are
    # available
    if len(labels) > sum(labels) > 0:
        return roc_auc_score(labels, scores)

    return float('NaN')

In [44]:
%%time
data.groupby("instanceId_userId")\
    .apply(lambda y: auc(y.label.values, y.score.values))\
    .dropna().mean()


CPU times: user 34.2 s, sys: 3.99 ms, total: 34.2 s
Wall time: 34.4 s


0.5