节点、边（链接）、节点特征、边特征、标签。

边权重等等

同构图、异构图等等。

In [1]:
import os
import requests

# 文件URL
file_urls = [
    "https://raw.githubusercontent.com/batuhan-demirci/fifa21_dataset/master/data/tbl_player.csv",
    "https://raw.githubusercontent.com/batuhan-demirci/fifa21_dataset/master/data/tbl_player_skill.csv",
    "https://raw.githubusercontent.com/batuhan-demirci/fifa21_dataset/master/data/tbl_team.csv"
]

# 指定保存文件的路径
save_path = "data/fifa21_dataset"

# 确保保存路径存在，如果不存在则创建
os.makedirs(save_path, exist_ok=True)

# 下载并保存文件
for url in file_urls:
    filename = os.path.join(save_path, os.path.basename(url))
    response = requests.get(url)
    with open(filename, "wb") as f:
        f.write(response.content)
    print(f"文件 {os.path.basename(url)} 下载完成，保存在 {save_path}")

文件 tbl_player.csv 下载完成，保存在 data/fifa21_dataset
文件 tbl_player_skill.csv 下载完成，保存在 data/fifa21_dataset
文件 tbl_team.csv 下载完成，保存在 data/fifa21_dataset


In [2]:
import pandas as pd

# 加载数据
player_df = pd.read_csv("data/fifa21_dataset/tbl_player.csv")
skill_df = pd.read_csv("data/fifa21_dataset/tbl_player_skill.csv")
team_df = pd.read_csv("data/fifa21_dataset/tbl_team.csv")

# 提取子集
player_df = player_df[["int_player_id", "str_player_name", "str_positions", "int_overall_rating", "int_team_id"]]
skill_df = skill_df[["int_player_id", "int_long_passing", "int_ball_control", "int_dribbling"]]
team_df = team_df[["int_team_id", "str_team_name", "int_overall"]]

# 合并数据
player_df = player_df.merge(skill_df, on='int_player_id') # 以int_player_id列进行合并
fifa_df = player_df.merge(team_df, on='int_team_id') # 以int_team_id列进行合并

# 排序
fifa_df = fifa_df.sort_values(by="int_overall_rating", ascending=False) # 以int_overall_rating列降序
print("Players: ", fifa_df.shape[0])

fifa_df = fifa_df[0:10] # 取10个数据

fifa_df.head(10)

Players:  18767


Unnamed: 0,int_player_id,str_player_name,str_positions,int_overall_rating,int_team_id,int_long_passing,int_ball_control,int_dribbling,str_team_name,int_overall
0,1,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,5.0,91,96,96,FC Barcelona,84
33,2,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",92,6.0,77,92,88,Juventus,83
57,3,Jan Oblak,GK,91,8.0,40,30,12,Atlético Madrid,83
121,5,Neymar da Silva Santos Júnior,"LW, CAM",91,7.0,81,95,95,Paris Saint-Germain,83
89,4,Kevin De Bruyne,"CAM, CM",91,2.0,93,92,88,Manchester City,85
152,6,Robert Lewandowski,ST,91,4.0,70,88,85,FC Bayern München,84
178,10,Sadio Mané,LW,90,1.0,71,89,91,Liverpool,85
1,12,Marc-André ter Stegen,GK,90,5.0,63,30,21,FC Barcelona,84
179,11,Virgil van Dijk,CB,90,1.0,86,77,70,Liverpool,85
177,9,Mohamed Salah Ghaly,RW,90,1.0,75,89,90,Liverpool,85


In [3]:
# 确保没有重复的节点
max(fifa_df["int_player_id"].value_counts())

1

In [4]:
# 以int_player_id列升序
sorted_df = fifa_df.sort_values(by="int_player_id")

# 挑选节点特征
node_features = sorted_df[["str_positions", "int_long_passing", "int_ball_control", "int_dribbling"]]

# 转换非数字列
pd.set_option('mode.chained_assignment', None)
positions = node_features["str_positions"].str.split(",", expand=True)
node_features["first_position"] = positions[0]

# one-hot编码
node_features = pd.concat([node_features, pd.get_dummies(node_features["first_position"])], axis=1, join='inner')
node_features.drop(["str_positions", "first_position"], axis=1, inplace=True)

# 转换True和False为1和0
node_features = node_features.astype(int)
node_features.head()

Unnamed: 0,int_long_passing,int_ball_control,int_dribbling,CAM,CB,GK,LW,RW,ST
0,91,96,96,0,0,0,0,1,0
33,77,92,88,0,0,0,0,0,1
57,40,30,12,0,0,1,0,0,0
89,93,92,88,1,0,0,0,0,0
121,81,95,95,0,0,0,1,0,0


In [5]:
# 转换成numpy数组
x = node_features.to_numpy()
print(x)
x.shape # [num_nodes, num_features]

[[91 96 96  0  0  0  0  1  0]
 [77 92 88  0  0  0  0  0  1]
 [40 30 12  0  0  1  0  0  0]
 [93 92 88  1  0  0  0  0  0]
 [81 95 95  0  0  0  1  0  0]
 [70 88 85  0  0  0  0  0  1]
 [75 89 90  0  0  0  0  1  0]
 [71 89 91  0  0  0  1  0  0]
 [86 77 70  0  1  0  0  0  0]
 [63 30 21  0  0  1  0  0  0]]


(10, 9)

In [6]:
# 以int_player_id列定义顺序
sorted_df = fifa_df.sort_values(by="int_player_id")

# 挑选标签列
labels = sorted_df[["int_overall"]]
labels.head(10)

Unnamed: 0,int_overall
0,84
33,83
57,83
89,85
121,83
152,84
177,85
178,85
179,85
1,84


In [7]:
# 转换成numpy数组
y = labels.to_numpy()
print(y)
y.shape # [num_nodes, 1]

[[84]
 [83]
 [83]
 [85]
 [83]
 [84]
 [85]
 [85]
 [85]
 [84]]


(10, 1)

In [8]:
fifa_df.head(10)

Unnamed: 0,int_player_id,str_player_name,str_positions,int_overall_rating,int_team_id,int_long_passing,int_ball_control,int_dribbling,str_team_name,int_overall
0,1,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,5.0,91,96,96,FC Barcelona,84
33,2,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",92,6.0,77,92,88,Juventus,83
57,3,Jan Oblak,GK,91,8.0,40,30,12,Atlético Madrid,83
121,5,Neymar da Silva Santos Júnior,"LW, CAM",91,7.0,81,95,95,Paris Saint-Germain,83
89,4,Kevin De Bruyne,"CAM, CM",91,2.0,93,92,88,Manchester City,85
152,6,Robert Lewandowski,ST,91,4.0,70,88,85,FC Bayern München,84
178,10,Sadio Mané,LW,90,1.0,71,89,91,Liverpool,85
1,12,Marc-André ter Stegen,GK,90,5.0,63,30,21,FC Barcelona,84
179,11,Virgil van Dijk,CB,90,1.0,86,77,70,Liverpool,85
177,9,Mohamed Salah Ghaly,RW,90,1.0,75,89,90,Liverpool,85


In [13]:
# 重映射球员ID
fifa_df["int_player_id"] = fifa_df.reset_index().index
fifa_df.head(10)

Unnamed: 0,int_player_id,str_player_name,str_positions,int_overall_rating,int_team_id,int_long_passing,int_ball_control,int_dribbling,str_team_name,int_overall
0,0,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,5.0,91,96,96,FC Barcelona,84
33,1,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",92,6.0,77,92,88,Juventus,83
57,2,Jan Oblak,GK,91,8.0,40,30,12,Atlético Madrid,83
121,3,Neymar da Silva Santos Júnior,"LW, CAM",91,7.0,81,95,95,Paris Saint-Germain,83
89,4,Kevin De Bruyne,"CAM, CM",91,2.0,93,92,88,Manchester City,85
152,5,Robert Lewandowski,ST,91,4.0,70,88,85,FC Bayern München,84
178,6,Sadio Mané,LW,90,1.0,71,89,91,Liverpool,85
1,7,Marc-André ter Stegen,GK,90,5.0,63,30,21,FC Barcelona,84
179,8,Virgil van Dijk,CB,90,1.0,86,77,70,Liverpool,85
177,9,Mohamed Salah Ghaly,RW,90,1.0,75,89,90,Liverpool,85


In [9]:
# 每支球队需要有多少球员
fifa_df["str_team_name"].value_counts()

str_team_name
Liverpool              3
FC Barcelona           2
Juventus               1
Atlético Madrid        1
Paris Saint-Germain    1
Manchester City        1
FC Bayern München      1
Name: count, dtype: int64

In [10]:
import itertools
import numpy as np

teams = fifa_df["str_team_name"].unique()
all_edges = np.array([], dtype=np.int32).reshape((0, 2)) # 链接存储

for team in teams:
    team_df = fifa_df[fifa_df["str_team_name"] == team]
    players = team_df["int_player_id"].values # 获得需要连接的队员id

    permutations = list(itertools.combinations(players, 2)) # 迭代的两两连接
    print(permutations)

    edges_source = [e[0] for e in permutations]
    edges_target = [e[1] for e in permutations]
    
    team_edges = np.column_stack([edges_source, edges_target])
    all_edges = np.vstack([all_edges, team_edges])
    
# 转换成Pytorch Geometric格式
edge_index = all_edges.transpose()

edge_index # [2, num_edges]

[(0, 7)]
[]
[]
[]
[]
[]
[(6, 8), (6, 9), (8, 9)]


array([[0., 6., 6., 8.],
       [7., 8., 9., 9.]])

In [11]:
# 复制数组中的列并颠倒它们的顺序
edge_index  = np.concatenate([edge_index, edge_index[::-1]], axis=1)
# 去除重复的边，确保每条边只出现一次
edge_index  = np.unique(edge_index, axis=1)

edge_index 

array([[0., 6., 6., 7., 8., 8., 9., 9.],
       [7., 8., 9., 0., 6., 9., 6., 8.]])

In [12]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=edge_index, y=y)

# 有向图
data

Data(x=[10, 9], edge_index=[2, 8], y=[10, 1])