In [6]:
# Used to read the Parquet data
import pyarrow.parquet as parquet
# Used to train the baseline model
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

from sklearn.utils import shuffle

import pandas as pd

import os
# redirect output 
# sys.stdout = open("out.txt", "w")


# Load data

In [2]:

# Where the downloaded data are
input_path = '/home/faraon/data/collabTrain/'
# Where to store results
output_path = './'


In [3]:
# Read all day to train model on as Pandas dataframe

dirs = ["date=2018-02-01"]

data = pd.DataFrame({})
for file in dirs: 
    temp_data = parquet.read_table(input_path + file).to_pandas()
    data = pd.concat([data, temp_data])

In [4]:
data.head()

Unnamed: 0,instanceId_userId,instanceId_objectType,instanceId_objectId,audit_pos,audit_clientType,audit_timestamp,audit_timePassed,audit_experiment,audit_resourceType,metadata_ownerId,...,auditweights_userOwner_USER_INTERNAL_UNLIKE,auditweights_userOwner_USER_PRESENT_SEND,auditweights_userOwner_USER_PROFILE_VIEW,auditweights_userOwner_USER_SEND_MESSAGE,auditweights_userOwner_USER_STATUS_COMMENT_CREATE,auditweights_userOwner_VIDEO,auditweights_userOwner_VOTE_POLL,auditweights_x_ActorsRelations,auditweights_likersSvd_spark_hyper,auditweights_source_PROMO
0,138,Post,16788021,1,API,1517512273619,3651351,XPRM-5386_q2x3,8,2154,...,,,,,,,,,,
1,222,Post,23852723,33,WEB,1517495744978,3937907,XPRM-5386_G2,8,23080,...,,,,,,,,,,
2,384,Post,12454221,15,WEB,1517508395567,1517508395567,XPRM-5386_q2x2,8,39056,...,,,,,,,,,,
3,666,Post,18335103,13,WEB,1517486842005,1517486842005,XPRM-5386_G0,8,69277,...,,,,,,,,,,
4,1209,Post,22920031,2,MOB,1517498414790,2031754,XPRM-5386_G1,8,23941,...,,,,,,,,1024.0,,


In [5]:

def feedback_to_float(x):
    res = []
    feeddict = {
        "Commented": 0,
        "ReShared": 0,
        "Liked": 1,
        "Clicked": 0,
        "Ignored": 0,
        "Unliked": 0,
        "Complaint": 0,
        "Disliked": 0,  
        "Viewed": 0
    }
    for feed in x:
        res.append(feeddict[feed])

    return np.array(res).mean().astype(int)



# Construct the label (liked objects)
data['liked'] = data['feedback'].apply(feedback_to_float)


In [6]:
# features = data.columns.values

# Select some features 

In [8]:

# instanceId_userId
# liked 
selected_features = ['userOwnerCounters_USER_FEED_REMOVE', 
                     'userOwnerCounters_CREATE_IMAGE', 'userOwnerCounters_VIDEO',  'user_is_active', 
                     'auditweights_feedOwner_RECOMMENDED_GROUP', 'auditweights_svd_prelaunch',
                    ]

# Binarize features

In [9]:
def binary_features(data):
    for feature in selected_features:
        print(feature)
        data[feature] = (data[feature] > data[feature].unique().mean()).astype(int)
    
    return  data

data = binary_features(data.fillna(0.0))


userOwnerCounters_USER_FEED_REMOVE
userOwnerCounters_CREATE_IMAGE
userOwnerCounters_VIDEO
user_is_active
auditweights_feedOwner_RECOMMENDED_GROUP
auditweights_svd_prelaunch


In [10]:
exetended_selected_features = selected_features.copy()
exetended_selected_features.append('liked')
exetended_selected_features.append('instanceId_userId')

selected_data = data[exetended_selected_features]

In [11]:

selected_data.head(20)

Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE,userOwnerCounters_CREATE_IMAGE,userOwnerCounters_VIDEO,user_is_active,auditweights_feedOwner_RECOMMENDED_GROUP,auditweights_svd_prelaunch,liked,instanceId_userId
0,0,0,0,1,0,0,0,138
1,0,0,0,1,0,1,0,222
2,0,0,0,1,0,1,0,384
3,0,0,0,1,0,1,0,666
4,0,0,0,1,0,1,0,1209
5,0,0,0,1,0,1,0,3012
6,0,0,0,1,0,1,0,3273
7,0,0,0,1,0,1,1,4092
8,0,0,0,1,0,1,0,4092
9,0,0,0,1,0,0,0,4395


# Transform data into pairs

In [12]:


def transform_data(data):
    users_data = data.groupby('instanceId_userId')
    
    df_positive = pd.DataFrame({})
    df_negative =  pd.DataFrame({})

    for user_data in users_data:
        user_data = user_data[1]

        user_data_liked = user_data[user_data['liked'] == 1]
        user_data_disliked = user_data[user_data['liked'] == 0]

        user_data_liked = shuffle(user_data_liked)
        user_data_disliked = shuffle(user_data_disliked)

        if user_data_liked.shape[0] == 0 or user_data_disliked.shape[0] == 0:
             continue

        liked_mask  = np.random.randint(2, size=user_data_liked.shape[0]).astype(bool)
        disliked_mask = np.random.randint(2, size=user_data_disliked.shape[0]).astype(bool)

        liked_mask[0] = True
        disliked_mask[0] = True
        
        # generate positive 
        user_data_liked_masked = user_data_liked[liked_mask]
        user_data_disliked_masked = user_data_disliked[disliked_mask]
    
        data_liked = user_data_liked_masked[selected_features]
        data_disliked = user_data_disliked_masked[selected_features]
        
        data_liked['key'] = 0
        data_disliked['key'] = 0
        
        temp_data = pd.merge(data_liked, data_disliked, on='key', how='inner')
        temp_data.drop('key',1, inplace=True)
        
        df_positive =  pd.concat([df_positive, temp_data])
        
         # generate negative
        revert_liked_mask = np.logical_not(liked_mask)
        revert_disliked_mask = np.logical_not(disliked_mask)
        
        user_data_liked_masked = user_data_liked[revert_liked_mask]
        user_data_disliked_masked = user_data_disliked[revert_disliked_mask]
    
        data_liked = user_data_liked_masked[selected_features]
        data_disliked = user_data_disliked_masked[selected_features]
        
        data_liked['key'] = 0
        data_disliked['key'] = 0
        
        temp_data = pd.merge(data_disliked, data_liked, on='key', how='inner')
        temp_data.drop('key',1, inplace=True)
        df_negative =  pd.concat([df_negative, temp_data])
                
    return df_positive, df_negative


# Transform and save transformed data

In [13]:

tranformed_data_positive, tranformed_data_negative  = transform_data(selected_data)

tranformed_data_positive['order'] = 1
tranformed_data_negative['order'] = 0


tranformed_data = pd.concat([tranformed_data_positive, tranformed_data_negative])

tranformed_data.to_csv("pairs.csv", index=False)
tranformed_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE_x,userOwnerCounters_CREATE_IMAGE_x,userOwnerCounters_VIDEO_x,user_is_active_x,auditweights_feedOwner_RECOMMENDED_GROUP_x,auditweights_svd_prelaunch_x,userOwnerCounters_USER_FEED_REMOVE_y,userOwnerCounters_CREATE_IMAGE_y,userOwnerCounters_VIDEO_y,user_is_active_y,auditweights_feedOwner_RECOMMENDED_GROUP_y,auditweights_svd_prelaunch_y,order
0,0,0,0,1,0,1,0,0,0,1,0,1,1
1,0,0,0,1,0,1,0,0,0,1,0,1,1
2,0,0,0,1,0,1,0,0,0,1,0,1,1
0,0,0,0,1,0,1,0,0,0,1,0,1,1
0,0,0,0,1,0,1,0,0,0,1,0,0,1
0,0,0,0,1,0,1,0,0,0,1,0,0,1
0,0,0,0,1,0,0,0,0,0,1,0,1,1
0,0,0,0,1,0,1,0,0,0,1,0,1,1
1,0,0,0,1,0,1,0,0,0,1,0,1,1
0,0,0,0,1,0,1,0,0,0,1,0,1,1


 # Read transformed data

In [1]:

import pandas as pd

tranformed_data = pd.read_csv("pairs.csv")
X = tranformed_data
Y = tranformed_data['order']
X.drop('order',1, inplace=True)
X = X.values
Y = Y.values

In [2]:
tranformed_data.head()


Unnamed: 0,userOwnerCounters_USER_FEED_REMOVE_x,userOwnerCounters_CREATE_IMAGE_x,userOwnerCounters_VIDEO_x,user_is_active_x,auditweights_feedOwner_RECOMMENDED_GROUP_x,auditweights_svd_prelaunch_x,userOwnerCounters_USER_FEED_REMOVE_y,userOwnerCounters_CREATE_IMAGE_y,userOwnerCounters_VIDEO_y,user_is_active_y,auditweights_feedOwner_RECOMMENDED_GROUP_y,auditweights_svd_prelaunch_y
0,0,0,0,1,0,1,0,0,0,1,0,1
1,0,0,0,1,0,1,0,0,0,1,0,1
2,0,0,0,1,0,1,0,0,0,1,0,1
3,0,0,0,1,0,1,0,0,0,1,0,1
4,0,0,0,1,0,1,0,0,0,1,0,0


In [3]:

print(X.shape)
X

(18065, 12)


array([[0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [4]:
print(Y.shape)
Y

(18065,)


array([1, 1, 1, ..., 0, 0, 0])

# Test sklearn  decision tree

In [5]:
from sklearn.utils import shuffle
X = shuffle(X)
Y = shuffle(Y)

def split_data(X, Y, p, k):
    N = X.shape[0]
    assert(p <= k)
    assert(X.shape[0] == Y.shape[0])

    b1, b2 = int((N/k) * p), int((N/k) * (p + 1) )
    X_train = np.concatenate((X[0:b1,], X[b2:N,]), axis=0)
    y_train = np.concatenate((Y[0:b1], Y[b2:N]), axis=0)

    X_test = X[b1:b2]
    y_test = Y[b1:b2]

    return X_train, y_train, X_test, y_test

k = 5


In [8]:
import numpy as  np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

for p in range(0, 5):
    X_train, y_train, X_test, y_test = split_data(X, Y, p, k)
    X_train = shuffle(X_train)
    y_train = shuffle(y_train)
    X_test = shuffle(X_test)
    y_test = shuffle(y_test)
    
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    model = DecisionTreeClassifier(criterion="entropy")
    model.fit(X_train, y_train)
    y_score = model.predict_proba(X_test)
    y_score =np.argmax(y_score, axis=1)
    print("test MSE: ", mean_squared_error(y_test, y_score))



test MSE:  0.04373097149183504
test MSE:  0.043454193191253807
test MSE:  0.03625795737614171
test MSE:  0.03819540548021035
test MSE:  0.03791862717962912


# Test custom  ID3

In [9]:
import pandas as pd
from ID3 import ID3


In [10]:
def split_dataframe(X, Y, p, k):
    N = X.shape[0]
    assert(p <= k)
    assert(X.shape[0] == Y.shape[0])

    b1, b2 = int((N/k) * p), int((N/k) * (p + 1) )
    
    if p not in [0,k]:
        X_train = pd.concat((X[0:b1], X[b2:N]), axis=0)
    elif p == 0:
        X_train = X[b2:N]
    elif p == k - 1:
        X_train = X[0:b1]
        
    y_train = np.concatenate((Y[0:b1], Y[b2:N]), axis=0)

    X_test = X[b1:b2]
    y_test = Y[b1:b2]

    return X_train, y_train, X_test, y_test

k = 5


In [11]:

for p in range(0, 5):
    X_train, y_train, X_test, y_test = split_dataframe(tranformed_data, Y, p, k)
    X_train = shuffle(X_train)
    y_train = shuffle(y_train)
    X_test = shuffle(X_test)
    y_test = shuffle(y_test)
    
#     print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    root =  ID3(X_train, y_train)

    y_score = root.predict_mul(X_test)
    print("test MSE: ", mean_squared_error(y_test, y_score))



test MSE:  0.05563243841682812
test MSE:  0.04898975920287849
test MSE:  0.04926653750345973
test MSE:  0.04677553279822862
test MSE:  0.04566841959590368
