In [1]:
import pandas as pd

In [2]:
card_dict = {}
with open('data/card') as f:
    for line in f:
        id, name, _, _ = line.split(",")
        card_dict[id] = name

acc_dict = {}
with open('data/account') as f:
    for line in f:
        id, name, _, _ = line.split(",")
        acc_dict[id] = name

In [3]:
print(f"We have {len(card_dict)} cards and {len(acc_dict)} accounts")

We have 600000 cards and 800000 accounts


In [4]:
df_acc = pd.read_csv('data/account_to_account', sep=",", names=["src", "tgt","_1", "amt", "strategy","_2", "bus","_3","_4","_5","_6","_7","_8"], header=None)
df_acc = df_acc[["src", "tgt", "amt", "strategy", "bus"]]
df_acc

Unnamed: 0,src,tgt,amt,strategy,bus
0,284452,734522,0.0,strategy_name-2,buscode2
1,734522,284452,50.0,strategy_name-1,buscode2
2,785045,318621,9.0,strategy_name-2,buscode2
3,318621,785045,11.0,strategy_name-1,buscode3
4,785045,318621,3.0,strategy_name-3,buscode3
...,...,...,...,...,...
6010507,292418,353258,5.0,strategy_name-2,buscode1
6010508,309564,163379,0.0,strategy_name-3,buscode3
6010509,163379,309564,50.0,strategy_name-1,buscode1
6010510,540919,727046,0.0,strategy_name-2,buscode3


In [5]:
df_card = pd.read_csv('data/account_to_card', sep=",", names=["src", "tgt","_1", "amt", "strategy","_2", "bus","_3","_4","_5","_6","_7","_8"], header=None)
df_card = df_card[["src", "tgt", "amt", "strategy", "bus"]]
df_card

Unnamed: 0,src,tgt,amt,strategy,bus
0,684821,434860,5.0,strategy_name-4,buscode3
1,684821,434860,0.0,strategy_name-4,buscode3
2,684821,434860,33.0,strategy_name-4,buscode3
3,349837,98007,2.0,strategy_name-4,buscode1
4,181713,317857,40.0,strategy_name-4,buscode2
...,...,...,...,...,...
3410186,455846,6902,0.0,strategy_name-4,buscode1
3410187,58995,391212,8.0,strategy_name-5,buscode3
3410188,58995,391212,0.0,strategy_name-4,buscode2
3410189,58995,391212,0.0,strategy_name-5,buscode2


In [6]:
# Look up names in the corresponding files
df_card["src_name"] = df_card["src"].apply(lambda x: acc_dict[str(x)])
df_card["tgt_name"] = df_card["tgt"].apply(lambda x: card_dict[str(x)])
df_acc["src_name"] = df_acc["src"].apply(lambda x: acc_dict[str(x)])
df_acc["tgt_name"] = df_acc["tgt"].apply(lambda x: acc_dict[str(x)])

In [7]:
# Account to card is 0 and account to account is 1
df_card["type"] = 0
df_acc["type"] = 1

In [8]:
df = pd.concat([df_card, df_acc])
df

Unnamed: 0,src,tgt,amt,strategy,bus,src_name,tgt_name,type
0,684821,434860,5.0,strategy_name-4,buscode3,John,Jobs,0
1,684821,434860,0.0,strategy_name-4,buscode3,John,Jobs,0
2,684821,434860,33.0,strategy_name-4,buscode3,John,Jobs,0
3,349837,98007,2.0,strategy_name-4,buscode1,John,Mike,0
4,181713,317857,40.0,strategy_name-4,buscode2,Mike,Mike,0
...,...,...,...,...,...,...,...,...
6010507,292418,353258,5.0,strategy_name-2,buscode1,Jobs,John,1
6010508,309564,163379,0.0,strategy_name-3,buscode3,Mike,Jobs,1
6010509,163379,309564,50.0,strategy_name-1,buscode1,Jobs,Mike,1
6010510,540919,727046,0.0,strategy_name-2,buscode3,Jobs,Mike,1


In [9]:
df["amt"] = df["amt"].apply(lambda x: int(float(x)))
df["strategy"] = df["strategy"].apply(lambda x: int(x.replace("strategy_name-", "")))
df["bus"] = df["bus"].apply(lambda x: int(x.replace("buscode", "")))

In [10]:
# Drop rows that are completely similar
df = df.drop_duplicates()

In [11]:
# Make edge embeddings
df["edge"] = df["src_name"].astype(str) + "-" + df["tgt_name"].astype(str) + "-" + df["type"].astype(str) + "-" + df["strategy"].astype(str) + "-" + df["bus"].astype(str) + "-" + df["amt"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["edge"] = df["src_name"].astype(str) + "-" + df["tgt_name"].astype(str) + "-" + df["type"].astype(str) + "-" + df["strategy"].astype(str) + "-" + df["bus"].astype(str) + "-" + df["amt"].astype(str)


In [12]:
# Count the freuquency of edge embeddings
df["edge_count"] = df.groupby(['edge'])["src"].transform('count')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["edge_count"] = df.groupby(['edge'])["src"].transform('count')


In [13]:
# Remove edges that are not frequent enough
df = df[df["edge_count"] >= 10000]

In [14]:
# Save dataframe as input file
df[["src", "tgt", "amt", "strategy", "bus", "src_name", "tgt_name", "type"]].to_csv("data/full.csv", index=False)