In [1]:
# Used to read the Parquet data
import pyarrow.parquet as parquet
# Used to train the baseline model
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

from sklearn.utils import shuffle

import pandas as pd


import os
# redirect output 
# sys.stdout = open("out.txt", "w")


# Load data

In [2]:

# Where the downloaded data are
input_path = '/home/skutukov/Documents/collabTrain/'
# Where to store results
output_path = './'


In [3]:
# Read all day to train model on as Pandas dataframe

dirs = ["date=2018-02-01"]

data = pd.DataFrame({})
for file in dirs: 
    temp_data = parquet.read_table(input_path + file).to_pandas()
    data = pd.concat([data, temp_data])

In [4]:
data.head()

Unnamed: 0,instanceId_userId,instanceId_objectType,instanceId_objectId,audit_pos,audit_clientType,audit_timestamp,audit_timePassed,audit_experiment,audit_resourceType,metadata_ownerId,...,auditweights_userOwner_USER_INTERNAL_UNLIKE,auditweights_userOwner_USER_PRESENT_SEND,auditweights_userOwner_USER_PROFILE_VIEW,auditweights_userOwner_USER_SEND_MESSAGE,auditweights_userOwner_USER_STATUS_COMMENT_CREATE,auditweights_userOwner_VIDEO,auditweights_userOwner_VOTE_POLL,auditweights_x_ActorsRelations,auditweights_likersSvd_spark_hyper,auditweights_source_PROMO
0,138,Post,16788021,1,API,1517512273619,3651351,XPRM-5386_q2x3,8,2154,...,,,,,,,,,,
1,222,Post,23852723,33,WEB,1517495744978,3937907,XPRM-5386_G2,8,23080,...,,,,,,,,,,
2,384,Post,12454221,15,WEB,1517508395567,1517508395567,XPRM-5386_q2x2,8,39056,...,,,,,,,,,,
3,666,Post,18335103,13,WEB,1517486842005,1517486842005,XPRM-5386_G0,8,69277,...,,,,,,,,,,
4,1209,Post,22920031,2,MOB,1517498414790,2031754,XPRM-5386_G1,8,23941,...,,,,,,,,1024.0,,


In [5]:

def feedback_to_float(x):
    res = []
    feeddict = {
        "Commented": 0,
        "ReShared": 0,
        "Liked": 1,
        "Clicked": 0,
        "Ignored": 0,
        "Unliked": 0,
        "Complaint": 0,
        "Disliked": 0,  
        "Viewed": 0
    }
    for feed in x:
        res.append(feeddict[feed])

    return np.array(res).mean().astype(int)



# Construct the label (liked objects)
data['liked'] = data['feedback'].apply(feedback_to_float)


In [6]:
data

Unnamed: 0,instanceId_userId,instanceId_objectType,instanceId_objectId,audit_pos,audit_clientType,audit_timestamp,audit_timePassed,audit_experiment,audit_resourceType,metadata_ownerId,...,auditweights_userOwner_USER_PRESENT_SEND,auditweights_userOwner_USER_PROFILE_VIEW,auditweights_userOwner_USER_SEND_MESSAGE,auditweights_userOwner_USER_STATUS_COMMENT_CREATE,auditweights_userOwner_VIDEO,auditweights_userOwner_VOTE_POLL,auditweights_x_ActorsRelations,auditweights_likersSvd_spark_hyper,auditweights_source_PROMO,liked
0,138,Post,16788021,1,API,1517512273619,3651351,XPRM-5386_q2x3,8,2154,...,,,,,,,,,,0
1,222,Post,23852723,33,WEB,1517495744978,3937907,XPRM-5386_G2,8,23080,...,,,,,,,,,,0
2,384,Post,12454221,15,WEB,1517508395567,1517508395567,XPRM-5386_q2x2,8,39056,...,,,,,,,,,,0
3,666,Post,18335103,13,WEB,1517486842005,1517486842005,XPRM-5386_G0,8,69277,...,,,,,,,,,,0
4,1209,Post,22920031,2,MOB,1517498414790,2031754,XPRM-5386_G1,8,23941,...,,,,,,,1024.0,,,0
5,3012,Post,11058330,9,WEB,1517454139475,42463232,XPRM-5386_q4x3,8,17170,...,,,,,,,,,,0
6,3273,Video,629704,2,MOB,1517477144035,3681895,XPRM-5386_q4x1,6,19481,...,,,,,,,,,,0
7,4092,Post,12501606,2,API,1517506625604,3123396,XPRM-5386_q2x3,8,32544,...,,,,,,,,,,1
8,4092,Post,31434608,3,WEB,1517514400257,952489,XPRM-5386_q2x3,8,14285,...,,,,,,,,,,0
9,4395,Photo,5119828,9,API,1517506256767,1067143,XPRM-5386_q4x2,3,67929,...,,,,,,,,,,0


# Select some features 

In [7]:
# instanceId_userId
# liked 
selected_features = ['auditweights_svd_prelaunch',
                     'auditweights_ctr_high', 'auditweights_friendLikes', 'auditweights_ctr_gender'
                    ]

In [8]:
selected_data = data[[*selected_features, 'liked', 'instanceId_userId'] ]

In [9]:

selected_data.head(20)

Unnamed: 0,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,liked,instanceId_userId
0,0.60316,0.065857,,0.009836,0,138
1,0.675362,0.135539,,0.023984,0,222
2,0.761416,0.060081,,0.009776,0,384
3,0.851362,0.038051,,0.00266,0,666
4,0.887637,0.194643,,0.005013,0,1209
5,0.702343,0.172163,1.0,0.016191,0,3012
6,0.690506,0.146267,1.0,0.00574,0,3273
7,0.761575,0.127646,1.0,0.022038,1,4092
8,0.82062,0.086106,1.0,0.013726,0,4092
9,0.245339,0.054828,,0.004651,0,4395


# Transform data into pairs

In [10]:
def transform_data(data):
    users_data = data.groupby('instanceId_userId')
    
    df_results = pd.DataFrame({})

    for user_data in users_data:
        user_data = user_data[1]

        user_data_liked = user_data[user_data['liked'] == 1]
        user_data_disliked = user_data[user_data['liked'] == 0]

        if user_data_liked.shape[0] == 0 or user_data_disliked.shape[0] == 0:
             continue

        # generate
        data_liked = user_data_liked[selected_features]
        data_disliked = user_data_disliked[selected_features]
        
        data_liked['key'] = 0
        data_disliked['key'] = 0
        
        temp_data = pd.merge(data_liked, data_disliked, on='key', how='inner')
        temp_data.drop('key',1, inplace=True)
        
        df_results =  pd.concat([df_results, temp_data])
        
                
    return df_results


# Binarize pairs

In [11]:
def binarize(data, features):
    result_frame = pd.DataFrame({})
    
    for feature in features:
        a = data[feature + "_x"] >= data[feature + "_y"]
        a = a.to_frame()
        a.colunds = [feature]
        result_frame = pd.concat([result_frame, a], axis=1, sort=False)    
    return result_frame

    

# Transform and save transformed data

In [12]:
tranformed_data = transform_data(selected_data)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
tranformed_data = tranformed_data.fillna(0.0)

In [14]:
tranformed_data.head()

Unnamed: 0,auditweights_svd_prelaunch_x,auditweights_ctr_high_x,auditweights_friendLikes_x,auditweights_ctr_gender_x,auditweights_svd_prelaunch_y,auditweights_ctr_high_y,auditweights_friendLikes_y,auditweights_ctr_gender_y
0,0.847771,0.142697,1.0,0.019438,0.85612,0.083528,1.0,0.003679
1,0.847771,0.142697,1.0,0.019438,0.719209,0.071965,1.0,0.003981
2,0.847771,0.142697,1.0,0.019438,0.912177,0.139855,1.0,0.017423
0,0.761575,0.127646,1.0,0.022038,0.82062,0.086106,1.0,0.013726
0,0.846524,0.11627,1.0,0.010827,0.60546,0.045281,0.0,0.006447


In [15]:
binarized_tranformed_data = binarize(tranformed_data, selected_features)

binarized_tranformed_data

  import sys


Unnamed: 0,0,0.1,0.2,0.3
0,False,True,True,True
1,True,True,True,True
2,False,True,True,True
0,False,True,True,True
0,True,True,True,True
0,True,True,True,True
0,False,True,True,True
1,False,True,True,True
2,False,False,True,False
0,False,False,True,True


In [16]:
mask  = np.random.randint(2, size=binarized_tranformed_data.shape[0]).astype(bool)
# reverted_mask = np.logical_not(mask)
print(mask)

binarized_tranformed_data['order'] = True
# binarized_tranformed_data[reverted_mask]['order'] = 0

binarized_tranformed_data[mask] = ~binarized_tranformed_data[mask]

[False False False ... False  True False]


In [18]:
binarized_tranformed_data = binarized_tranformed_data.astype(int)
binarized_tranformed_data.head()

Unnamed: 0,0,0.1,0.2,0.3,order
0,0,1,1,1,1
1,1,1,1,1,1
2,0,1,1,1,1
0,1,0,0,0,0
0,0,0,0,0,0


In [19]:
binarized_tranformed_data.columns = [*selected_features, "order"]

binarized_tranformed_data.to_csv("pairs.csv", index=False)
binarized_tranformed_data

Unnamed: 0,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,order
0,0,1,1,1,1
1,1,1,1,1,1
2,0,1,1,1,1
0,1,0,0,0,0
0,0,0,0,0,0
0,0,0,0,0,0
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,1,0,1
0,1,1,0,0,0


 # Read transformed data

In [21]:

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from scipy.stats import spearmanr

tranformed_data = pd.read_csv("pairs.csv")

tranformed_data.head()

Unnamed: 0,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,order
0,0,1,1,1,1
1,1,1,1,1,1
2,0,1,1,1,1
3,1,0,0,0,0
4,0,0,0,0,0


In [22]:
X = tranformed_data
Y = tranformed_data['order']
X.drop('order',1, inplace=True)

X.head()

Unnamed: 0,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender
0,0,1,1,1
1,1,1,1,1
2,0,1,1,1
3,1,0,0,0
4,0,0,0,0


In [23]:
Y.head()

0    1
1    1
2    1
3    0
4    0
Name: order, dtype: int64

In [24]:
X = X.values
Y = Y.values


c = list(zip(X, Y))

shuffle(c)

X, Y = zip(*c)


In [25]:
X = np.array(X)

In [26]:
print(X.shape)
X

(24094, 4)


array([[0, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 1, 1, 1],
       ...,
       [1, 1, 1, 1],
       [0, 1, 0, 1],
       [0, 0, 1, 0]])

In [27]:
Y = np.array(Y)
print(Y.shape)
Y

(24094,)


array([1, 1, 1, ..., 1, 0, 1])

# Test sklearn  decision tree

In [28]:
from sklearn.utils import shuffle

def split_data(X, Y, p, k):
    N = X.shape[0]
    assert(p <= k)
    assert(X.shape[0] == Y.shape[0])

    b1, b2 = int((N/k) * p), int((N/k) * (p + 1) )
    X_train = np.concatenate((X[0:b1,], X[b2:N,]), axis=0)
    y_train = np.concatenate((Y[0:b1], Y[b2:N]), axis=0)

    X_test = X[b1:b2]
    y_test = Y[b1:b2]

    return X_train, y_train, X_test, y_test

k = 5


In [29]:
import numpy as  np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

for p in range(0, 5):
    X_train, y_train, X_test, y_test = split_data(X, Y, p, k)
     
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    model = DecisionTreeClassifier(criterion="entropy")
    model.fit(X_train, y_train)
    y_train_sc = model.predict_proba(X_train)
    y_train_sc = np.argmax(y_train_sc, axis=1)
    
    y_score = model.predict_proba(X_test)
    y_score = np.argmax(y_score, axis=1)
    
    print(p)
    print("train spearmanr: ", spearmanr(y_train, y_train_sc))
    print("train MSE", mean_squared_error(y_train, y_train_sc))
    print("test spearmanr", spearmanr(y_test, y_score))
    print("test MSE", mean_squared_error(y_test, y_score))
    print("--------------------------------------------")



0
train spearmanr:  SpearmanrResult(correlation=0.829322992708502, pvalue=0.0)
train MSE 0.0853392820087155
test spearmanr SpearmanrResult(correlation=0.8061383303204391, pvalue=0.0)
test MSE 0.09692818596928186
--------------------------------------------
1
train spearmanr:  SpearmanrResult(correlation=0.8248472005864803, pvalue=0.0)
train MSE 0.0875745784695201
test spearmanr SpearmanrResult(correlation=0.8240606866465946, pvalue=0.0)
test MSE 0.0879850591409006
--------------------------------------------
2
train spearmanr:  SpearmanrResult(correlation=0.8214383648624205, pvalue=0.0)
train MSE 0.08928664072632944
test spearmanr SpearmanrResult(correlation=0.8377752872375919, pvalue=0.0)
test MSE 0.08113716538700975
--------------------------------------------
3
train spearmanr:  SpearmanrResult(correlation=0.8240266064688537, pvalue=0.0)
train MSE 0.08798962386511025
test spearmanr SpearmanrResult(correlation=0.8273262293467247, pvalue=0.0)
test MSE 0.08632496368541191
-------------

# Test custom  ID3

In [30]:
import pandas as pd
from ID3 import ID3


In [31]:
def split_dataframe(X, Y, p, k):
    N = X.shape[0]
    assert(p <= k)
    assert(X.shape[0] == Y.shape[0])

    b1, b2 = int((N/k) * p), int((N/k) * (p + 1) )
    
    if p not in [0,k]:
        X_train = pd.concat((X[0:b1], X[b2:N]), axis=0)
    elif p == 0:
        X_train = X[b2:N]
    elif p == k - 1:
        X_train = X[0:b1]
        
    y_train = np.concatenate((Y[0:b1], Y[b2:N]), axis=0)

    X_test = X[b1:b2]
    y_test = Y[b1:b2]

    return X_train, y_train, X_test, y_test

k = 5


In [32]:
for p in range(0, 5):
    X_train, y_train, X_test, y_test = split_dataframe(tranformed_data, Y, p, k)
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    root =  ID3(X_train, y_train)
    
    y_train_sc = root.predict_mul(X_train)
    y_score = root.predict_mul(X_test)
    
    print(p)
    print("train spearmanr: ", spearmanr(y_train, y_train_sc))
    print("train MSE", mean_squared_error(y_train, y_train_sc))
    print("test spearmanr", spearmanr(y_test, y_score))
    print("test MSE", mean_squared_error(y_test, y_score))
    print("--------------------------------------------")


0
train spearmanr:  SpearmanrResult(correlation=0.829322992708502, pvalue=0.0)
train MSE 0.0853392820087155
test spearmanr SpearmanrResult(correlation=0.8061383303204391, pvalue=0.0)
test MSE 0.09692818596928186
--------------------------------------------
1
train spearmanr:  SpearmanrResult(correlation=0.8248472005864803, pvalue=0.0)
train MSE 0.0875745784695201
test spearmanr SpearmanrResult(correlation=0.8240606866465946, pvalue=0.0)
test MSE 0.0879850591409006
--------------------------------------------
2
train spearmanr:  SpearmanrResult(correlation=0.8214383648624205, pvalue=0.0)
train MSE 0.08928664072632944
test spearmanr SpearmanrResult(correlation=0.8377752872375919, pvalue=0.0)
test MSE 0.08113716538700975
--------------------------------------------
3
train spearmanr:  SpearmanrResult(correlation=0.8240266064688537, pvalue=0.0)
train MSE 0.08798962386511025
test spearmanr SpearmanrResult(correlation=0.8273262293467247, pvalue=0.0)
test MSE 0.08632496368541191
-------------

# Calc hackathon metric on same day

In [42]:
# Used to read the Parquet data
import pyarrow.parquet as parquet
# Used to train the baseline model
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr

from sklearn.utils import shuffle

import pandas as pd
import os




# Where the downloaded data are
input_path = '/home/skutukov/Documents/collabTrain/'
# Where to store results
output_path = './'

dirs = ["date=2018-02-01"]

data = pd.DataFrame({})
for file in dirs: 
    temp_data = parquet.read_table(input_path + file).to_pandas()
    data = pd.concat([data, temp_data])

data['label'] = data['feedback'].apply(lambda x: 1.0 if("Liked" in x) else 0.0)

data = data[ [*selected_features, "instanceId_userId", "instanceId_objectId", "label"] ] 
data.head()

Unnamed: 0,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,instanceId_userId,instanceId_objectId,label
0,0.60316,0.065857,,0.009836,138,16788021,0.0
1,0.675362,0.135539,,0.023984,222,23852723,0.0
2,0.761416,0.060081,,0.009776,384,12454221,0.0
3,0.851362,0.038051,,0.00266,666,18335103,0.0
4,0.887637,0.194643,,0.005013,1209,22920031,0.0


In [43]:
data = data.fillna(0.0)
data.head()


Unnamed: 0,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,instanceId_userId,instanceId_objectId,label
0,0.60316,0.065857,0.0,0.009836,138,16788021,0.0
1,0.675362,0.135539,0.0,0.023984,222,23852723,0.0
2,0.761416,0.060081,0.0,0.009776,384,12454221,0.0
3,0.851362,0.038051,0.0,0.00266,666,18335103,0.0
4,0.887637,0.194643,0.0,0.005013,1209,22920031,0.0


In [35]:
root =  ID3(tranformed_data, Y)

In [44]:
def calc_score(data):
    scores = []
        
    for row1 in data.iterrows():
        count = 0
        for row2 in data.iterrows():
            count += root.predict(row1[1][selected_features] >= row2[1][selected_features])
        scores.append(count)
    data["score"] = np.array(scores)
    
    return data


In [45]:
data.head()


Unnamed: 0,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,instanceId_userId,instanceId_objectId,label
0,0.60316,0.065857,0.0,0.009836,138,16788021,0.0
1,0.675362,0.135539,0.0,0.023984,222,23852723,0.0
2,0.761416,0.060081,0.0,0.009776,384,12454221,0.0
3,0.851362,0.038051,0.0,0.00266,666,18335103,0.0
4,0.887637,0.194643,0.0,0.005013,1209,22920031,0.0


In [46]:
data = data.groupby("instanceId_userId")\
    .apply(calc_score)
data.head()

Unnamed: 0,auditweights_svd_prelaunch,auditweights_ctr_high,auditweights_friendLikes,auditweights_ctr_gender,instanceId_userId,instanceId_objectId,label,score
0,0.60316,0.065857,0.0,0.009836,138,16788021,0.0,1
1,0.675362,0.135539,0.0,0.023984,222,23852723,0.0,1
2,0.761416,0.060081,0.0,0.009776,384,12454221,0.0,1
3,0.851362,0.038051,0.0,0.00266,666,18335103,0.0,1
4,0.887637,0.194643,0.0,0.005013,1209,22920031,0.0,1


In [47]:
# Used to calculate metrics
from sklearn.metrics import roc_auc_score

data = data[["instanceId_userId", "score", "label"]]
data.head()



Unnamed: 0,instanceId_userId,score,label
0,138,1,0.0
1,222,1,0.0
2,384,1,0.0
3,666,1,0.0
4,1209,1,0.0


In [48]:
def auc(labels, scores):
    # This is important! AUC can be computed only when both positive and negative examples are
    # available
    if len(labels) > sum(labels) > 0:
        return roc_auc_score(labels, scores)

    return float('NaN')

In [49]:
%%time
data.groupby("instanceId_userId")\
    .apply(lambda y: auc(y.label.values, y.score.values))\
    .dropna().mean()


CPU times: user 24.6 s, sys: 3.91 ms, total: 24.6 s
Wall time: 24.6 s


0.5222475554999821