# UniCredit Training Camp 2021 - Knowledge Graph Completion

## TEAM 'EST'
### Authors:  Elisa Valeriani, 1783955 - Stefania Sferragatta, 1958081 -  Tansel Simsek,  1942297


In [1]:
from scipy.stats import loguniform
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import os
import time

In [None]:
#paths to files
base_path = '/content/gdrive/MyDrive/datasets/unicredittrainingcamp/'
sep = ','
dataset_name = ''
trainset_path = base_path + dataset_name + 'kg_train.csv'
validset_path = base_path + dataset_name + 'kg_validation.csv'
testset_path = base_path + 'kg_test_nolabel.csv'
trainvalidset_path = base_path + dataset_name + 'kg_pykeen_trainvalidation.tsv'

In [None]:
# function for splitting the content of the triples in source, relation, target
def make_3_columns(base_path, next_path , name_csv, label_yes = True, take_label = False):
    df = pd.read_csv(base_path + next_path)
    if label_yes:
        df = df[df.Label == 1]
    df = pd.concat([df, df.Triple.str.split(expand = True)], axis = 1)
    if label_yes:
        df = df.rename(columns={0: 'source', 1: 'relation', 2 : 'target', 'Label': 'label'})
    else:
        df = df.rename(columns={0: 'source', 1: 'relation', 2 : 'target'})
    df['source'] = pd.to_numeric(df['source'])
    df['target'] = pd.to_numeric(df['target'])
    if take_label:
        df = df[['source', 'relation', 'target',  'Label']]
    else: 
        df = df[['source', 'relation', 'target']]
    # save the result into tsv file
    with open(base_path + name_csv, 'w') as f:
        df.to_csv(f, header=False, index=False, sep='\t' )

In [None]:
#create the validation and test set with the label and the test set without the label
make_3_columns(base_path, 'kg_validation.csv', 'validation_final_label.tsv', False, True)
make_3_columns(base_path, 'kg_train.csv', 'train_final_label.tsv', False, True)
make_3_columns(base_path, 'kg_test_nolabel.csv', 'test_final.tsv', False)

In [None]:
#create the validation and test set without the label
make_3_columns(base_path, 'kg_validation.csv', 'validation_final.tsv', False, False)
make_3_columns(base_path, 'kg_train.csv', 'train_final.tsv', False, False)

In [None]:
trainset_path = base_path + dataset_name + 'train_final.tsv'
validset_path = base_path + dataset_name + 'validation_final.tsv'

In [None]:
#load triples from training+validation-set file
all_triples = TriplesFactory.from_path(trainvalidset_path)
#load triples from training-set file and validation-set file
train_triples = TriplesFactory.from_path(trainset_path)
valid_triples = TriplesFactory.from_path(validset_path)

In [None]:
#train embeddings
result = pipeline( 
    training=train_triples,
    testing=valid_triples,
    model='TransR',
    loss='MSELoss', 
    #model configuration
    model_kwargs = dict(
        embedding_dim=200 #dimensionality of the netity embeddings; default: 50
    ),
    loss_kwargs = dict(), #loss configuration
    training_loop='SLCWATrainingLoop', 
    regularizer='LpRegularizer', 
    #regularizer configuration
    regularizer_kwargs=dict(
        p=2.0, #default
        weight=1.0 #default
    ),
    optimizer='SGD', 
    #optimizer configuration
    optimizer_kwargs = dict(
        lr=0.1
    ),
    negative_sampler='Basic', 
    dimensions=200, #default: 50
    #training configuration
    training_kwargs=dict(
        num_epochs=200,
        use_tqdm_batch=False,
    ),  
    #runtime configuration
    random_seed=1234,
    device='gpu'
)

In [None]:
#save result to file (model and evaluation)
result_path = base_path + 'results'
result.save_to_directory(result_path)

In [None]:
#extract entity embeddings
entity_ids = torch.LongTensor(range(all_triples.num_entities))
e_emb = result.model.entity_embeddings
e_emb.cpu()
e_emb_numpy = e_emb(entity_ids).detach().numpy()
entity2embedding = {}
for eid in range(all_triples.num_entities):
    e = all_triples.entity_id_to_label[eid]
    entity2embedding[e] = list(e_emb_numpy[eid])

In [None]:
#extract relation embeddings
relation_ids = torch.LongTensor(range(all_triples.num_relations))
r_emb = result.model.relation_embeddings
r_emb.cpu()
r_emb_numpy = r_emb(relation_ids).detach().numpy()
relation2embedding = {}
for rid in range(all_triples.num_relations):
    r = all_triples.relation_id_to_label[rid]
    relation2embedding[r] = list(r_emb_numpy[rid])

In [None]:
# function for computing the embedding
def create_embeddings(df,relation2embedding,entity2embedding):
    full_embedding_list = []

    for row in range(df.shape[0]):
        s=df[0][row]
        r=df[1][row]
        t=df[2][row]
        full_embedding = entity2embedding[str(s)] + relation2embedding[str(r)] + entity2embedding[str(t)]
        full_embedding_list.append(full_embedding)  

    df['embedding']= full_embedding_list
    return df

Creation of the embedding for the training dataset

In [None]:
df_train = pd.read_csv('train_final_label.tsv', sep='\t', header=None)

In [None]:
emb_train = create_embeddings(df_train,relation2embedding,entity2embedding)

In [None]:
#save the result into a new csv file
with open('/content/gdrive/MyDrive/unicredittrainingcamp/' + 'df_train_emb.csv', 'w') as f:
    emb_train.to_csv(f, header=False, index=False)

Creation of the embedding for the validation dataset

In [None]:
df_val = pd.read_csv('/content/gdrive/MyDrive/unicredittrainingcamp/validation_final_label.tsv', sep='\t', header=None)

In [None]:
emb_val = create_embeddings(df_val,relation2embedding,entity2embedding)

In [None]:
with open('/content/gdrive/MyDrive/unicredittrainingcamp/' + 'df_val_emb.csv', 'w') as f:
    emb_val.to_csv(f, header=False, index=False )

Creation of the embedding for the test dataset

In [None]:
df_test = pd.read_csv('/content/gdrive/MyDrive/unicredittrainingcamp/test_final.tsv', sep='\t', header=None)

In [None]:
emb_test = create_embeddings(df_test,relation2embedding,entity2embedding)

In [None]:
with open('/content/gdrive/MyDrive/unicredittrainingcamp/' + 'df_test_emb.csv', 'w') as f:
    emb_test.to_csv(f, header=False, index=False )

In [3]:
#Load the csv creted above
train = pd.read_csv('df_train_emb.csv', header = None)
test = pd.read_csv('df_test_emb.csv', header = None)
dev = pd.read_csv('df_val_emb.csv', header = None)

In [4]:
#Rename the columns of the dataframe
train = train.rename(columns={0: 'source', 1: 'relation', 2 : 'target', 3: 'label', 4: 'embedding'})
dev = dev.rename(columns={0: 'source', 1: 'relation', 2 : 'target', 3: 'label', 4: 'embedding'})
test = test.rename(columns={0: 'source', 1: 'relation', 2 : 'target', 3: 'embedding'})

In [5]:
# #Concatenate the new columns with the embedding to the dataframe
# def add_new_column(dataframe):
#     new_list = []
#     for i in range(len(dataframe)):
#         res = dataframe['embedding'][i].strip('][').split(', ')
#         res = [float(j) for j in res]
#         new_list.append(res)
#     dataframe['embedding'] = new_list
#     df_list = dataframe['embedding'].tolist()
#     df = pd.DataFrame(df_list)
#     dataframe = pd.concat([dataframe, df], axis=1)
#     dataframe = dataframe.drop(['embedding'], axis=1)
#     return dataframe

In [6]:
# train = add_new_column(train)
# test = add_new_column(test)
# dev = add_new_column(dev)

In [7]:
# train.head()

Unnamed: 0,source,relation,target,label,0,1,2,3,4,5,...,420,421,422,423,424,425,426,427,428,429
0,34881,intercommunality,14230,0,-0.077233,0.119895,-0.003629,0.017712,0.060872,0.100077,...,0.101224,-0.117935,-0.056366,0.046605,0.077758,0.098981,0.028646,-0.087241,-0.063717,-0.102795
1,9387,ownerOper,39573,0,0.051407,-0.097245,0.058831,-0.049638,-0.066242,-0.001903,...,0.002603,-0.082168,-0.057717,-0.085935,-0.093695,-0.00302,0.041622,0.061183,-0.107571,-0.040124
2,12480,coach,24064,0,-0.007626,0.077514,0.076412,0.022497,0.047771,-0.106071,...,-0.015453,0.083406,-0.036119,-0.060431,-0.011616,-0.025314,0.005912,0.083081,0.082056,0.068306
3,6871,branches,22010,0,-0.060748,0.065322,-0.056058,0.01138,0.103271,-0.109446,...,-0.05634,0.115712,-0.062918,-0.069767,0.070984,-0.067909,0.10528,-0.007993,-0.094237,0.042079
4,13789,damsire,33095,0,0.11177,0.004248,0.064284,-0.081097,0.00835,0.062,...,0.079775,-0.01249,0.071471,-0.021408,-0.006365,-0.060582,-0.107537,0.081445,0.039402,0.005463


In [8]:
relation_dict = {}
unique_elements = train.relation.unique()
for i in range(len(unique_elements)):
    relation_dict[unique_elements[i]] = i 
    

In [9]:
# Compute a label encoding for the classifier
def encode_labels(df):
    my_list = []
    for i in df.relation:
        my_list.append(relation_dict[i])
    df['relation'] = my_list
    return df

In [10]:
train = encode_labels(train)
dev = encode_labels(dev)
test = encode_labels(test)

In [11]:
X_train = train[['source', 'relation', 'target']]
Y_train = pd.DataFrame(train, columns = ['label'])
X_dev = dev[['source', 'relation', 'target']]
Y_dev = pd.DataFrame(dev, columns = ['label'])
X_test = test[['source', 'relation', 'target']]

In [None]:
#Perform the RandomForestClassifier
clf=RandomForestClassifier(n_estimators=100, bootstrap = True)
clf.fit(X_train,Y_train)

In [64]:
# Do the prediction for evaluate the model applied
y_pred=clf.predict(X_dev)

In [65]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(Y_dev, y_pred))
print("f1_score:",metrics.f1_score(Y_dev, y_pred))

Accuracy: 0.7196758452422447
f1_score: 0.7197735191637631


#### Save the results obtained into a file for the submission

In [66]:
y_pred=clf.predict(X_test)

In [67]:
#create the id field
id_column = []
for i in range(len(y_pred)):
    id_column.append(i+114754)

In [68]:
#rename the columns as requested
df = test[['source']]
df['Id'] = id_column
df['Predicted'] = y_pred
df = df[['Id', 'Predicted']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Id'] = id_column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Predicted'] = y_pred


In [69]:
# store the results in a csv
with open('./unicredittrainingcamp/' + 'EST_2.csv', 'w') as f:
    df.to_csv(f, header=True, index=False )