<a href="https://colab.research.google.com/github/SuYenTing/Shopee-Code-League-2021/blob/main/shopee_multi_channel_contacts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Shopee Code League - Multi-Channel Contacts
2021/03/10 蘇彥庭

In [None]:
# 匯入套件
import pandas as pd

In [None]:
# Load the raw contacts file into a DataFrame.
contactsPath = 'contacts.json'
df = pd.read_json(contactsPath, encoding='utf8')

In [None]:
def _ticketIdSetsBy(contacts, column):
    """Group `contacts` by `column` and collect the ticket ids of each group.

    Returns a one-column frame ('Id') whose cells are sets of ticket ids that
    share the same value of `column`. The group whose key is the empty string
    is discarded after grouping.
    """
    grouped = contacts.groupby(column, as_index=False).agg({'Id': lambda ids: set(ids)})
    hasValue = grouped[column] != ''
    return grouped.loc[hasValue, ['Id']]


# Ticket id sets of tickets sharing the same Email
emailDf = _ticketIdSetsBy(df, 'Email')

# Ticket id sets of tickets sharing the same Phone
phoneDf = _ticketIdSetsBy(df, 'Phone')

# Ticket id sets of tickets sharing the same OrderId
orderIdDf = _ticketIdSetsBy(df, 'OrderId')

In [None]:
# Flatten the Email / Phone / OrderId groupings (in that order) into a single
# list of ticket id sets.
ticketSetsList = [
    ticketIdSet
    for sourceDf in (emailDf, phoneDf, orderIdDf)
    for ticketIdSet in sourceDf['Id']
]

In [None]:
# Build an adjacency dictionary for the DFS step that follows:
#   key   -> a ticket id
#   value -> set of ticket ids sharing some piece of contact info with the key
#            (the key itself is included, since it belongs to its own groups)
graph = {}
for sharedIds in ticketSetsList:
    for ticketId in sharedIds:
        # Create the entry on first sight, then merge this group's ids into it.
        graph.setdefault(ticketId, set()).update(sharedIds)

In [None]:
# DFS reference: https://www.educative.io/edpresso/how-to-implement-depth-first-search-in-python
def dfs(visited, graph, id, component=None):
    """Collect every not-yet-visited ticket id reachable from `id`.

    The traversal is iterative (explicit stack of neighbour iterators), so the
    size of a connected component is not bounded by the interpreter's
    recursion limit. The visiting order is the same pre-order a recursive DFS
    would produce.

    Parameters
    ----------
    visited : set
        Ids already assigned to a component; updated in place.
    graph : dict
        Maps each id to an iterable of neighbouring ids.
    id : hashable
        Starting ticket id.
    component : list, optional
        List that receives the ids discovered by this call. When omitted, the
        module-level list `iUserTicketSet` is used, which keeps existing
        callers that rely on that global working unchanged.

    Returns
    -------
    list
        The list the discovered ids were appended to.

    Raises
    ------
    KeyError
        If a visited id has no entry in `graph`.
    """
    if component is None:
        # Backward-compatible default: write into the global accumulator.
        component = iUserTicketSet
    if id in visited:
        return component

    component.append(id)
    visited.add(id)
    # Each stack entry is the partially consumed neighbour iterator of a node
    # on the current DFS path; resuming it continues where the descent left off.
    pending = [iter(graph[id])]
    while pending:
        for neighbour in pending[-1]:
            if neighbour not in visited:
                component.append(neighbour)
                visited.add(neighbour)
                pending.append(iter(graph[neighbour]))
                break
        else:
            # Neighbours exhausted: backtrack to the parent node.
            pending.pop()
    return component

In [None]:
# Ticket ids that have already been assigned to a user
visited = set()
# One sorted list of ticket ids per user (connected component)
userTicketSets = list()
# Walk the actual graph keys in ascending order rather than range(len(graph)):
# the latter is only correct when ids are exactly 0..len(graph)-1, and otherwise
# raises KeyError inside dfs or silently skips tickets.
for ticketId in sorted(graph):
    if ticketId in visited:
        # Already collected as part of an earlier component.
        continue
    # dfs appends the ids it discovers to this module-level list.
    iUserTicketSet = list()
    dfs(visited, graph, ticketId)
    userTicketSets.append(sorted(iUserTicketSet))

In [None]:
# One row per ticket id, carrying the full id list of its user ('idSets') and a
# user number taken from the row position of that user's list.
output = (
    pd.DataFrame({'id': userTicketSets, 'idSets': userTicketSets})
    .assign(user=lambda frame: frame.index)
    .explode('id')
)

In [None]:
# Total Contacts per user: attach each ticket's Contacts value, then sum by user.
ticketContacts = df[['Id', 'Contacts']]
contactsPerTicket = output.merge(ticketContacts, left_on='id', right_on='Id')
contactsDf = contactsPerTicket.groupby('user', as_index=False)['Contacts'].sum()

In [None]:
# Join the per-user contact totals back onto every ticket and write the
# Kaggle submission file.
merged = output.merge(contactsDf, on='user')
# "id-id-...-id" trace of all tickets belonging to the same user
traceText = merged['idSets'].map(lambda ids: '-'.join(str(ticket) for ticket in ids))
output = pd.DataFrame({
    'ticket_id': merged['id'],
    'ticket_trace/contact': traceText + ', ' + merged['Contacts'].astype(str),
}).sort_values(['ticket_id'])
output.to_csv('answer.csv', index=False)