In [1]:
import pandas as pd
from utils import randomSplit, Accumulator
#from torch import nn
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler  # 归一化
from sklearn.model_selection import KFold         # k折交叉验
from sklearn.decomposition import PCA             # pca

def rank2class(df):
    if df['LgRk'] <= 6:
        return 0
    elif 6 < df['LgRk'] <= 13:
        return 1
    else:
        return 2

In [2]:
class myDataset():
    def __init__(self, renorm=False, pca=False, pca_dim=15):
        self.player = pd.read_csv('DLdata\outfielder_combined_from_raw_fillna.csv', header=0, encoding='utf-8')
        self.player = self.player.drop(['Player', 'Pos', 'Comp', 'Age', 'Born'], axis=1)
        scaler = StandardScaler()  # z score 归一化
        if renorm:
            for col in self.player.columns:
                if col in ('Squad', "Season"): continue
                self.player[[col]] = scaler.fit_transform(self.player[[col]])

        if pca:
            squad = self.player['Squad']
            season = self.player['Season']
            values = self.player.drop(['Squad', 'Season'], axis=1).values
            comp_dim = pca_dim
            pca = PCA(n_components=comp_dim)   # 15
            values = pca.fit_transform(values)
            # print(sum(pca.explained_variance_ratio_))
            self.player = pd.DataFrame(values)
            self.player['Squad'] = squad
            self.player["Season"] = season

        self.league = pd.read_excel('DLdata/SquadPerformance2021.xlsx', sheet_name='Sheet1')
        self.league['tier'] = self.league.apply(rank2class, axis=1)
        self.league = self.league[['Squad', 'Season', 'tier']]  


        self.X = []  # in GNN: list of Data object; in CNN: list of 2-dim tensors; in MLs list of 1-dim tensors
        self.y = []
        for id in self.league.index:
            squad = self.league.at[id, 'Squad']
            season = self.league.at[id, 'Season']
            if season == 2021: continue  # 发现github的数据在2021年仅仅只有前几场比赛的，所以不得不舍弃了orzzz
            self.X.append(self.Squad2Player(squad, season)) 
            self.y.append(self.league.at[id, 'tier'])  # 作为监督的label

        self.X = torch.cat(self.X, axis=0)  # [98, 135, 40]
        self.X = self.X.numpy()
        self.y = np.array(self.y)
    
    def Squad2Player(self, squad, season):
        "输入squad的名字，输出球队球员的数据"
        season = str(season) + '-' + str(int(season) + 1)
        re = self.player[self.player['Squad'] == squad]
        #print(re.columns)
        re = re[re['Season'] == season].drop(['Squad', 'Season'], axis=1)
        re = torch.Tensor(re.values)  # shape [nplayers, features]
        # print(re.shape) [20, 151]
        #self.size_stat.append(re.shape[0])
        

        upBound = 30
        if re.shape[0] < upBound:
            a = (upBound - re.shape[0]) // 2
            b = upBound - re.shape[0] - a
            a = torch.zeros((a, re.shape[1]))
            b = torch.zeros((b, re.shape[1]))
            return torch.cat([a, re, b], dim=0).unsqueeze(0)
        elif re.shape[0] == upBound:
            return re.unsqueeze(0)
        else:
            return re[:upBound, :].unsqueeze(0)
        

    # 使用K折交叉验证
    def DataIter(self, K=10, shuffle=True):
        KF = KFold(n_splits=K, shuffle=shuffle)  
        for train_index, test_index in KF.split(self.X):
            #print("TRAIN", train_index, "TEST", test_index)
            X_train, X_test = self.X[train_index], self.X[test_index]
            Y_train, Y_test = self.y[train_index], self.y[test_index]
            yield X_train, Y_train, X_test, Y_test
    

mydataset = myDataset(pca=1, pca_dim=15)
print(mydataset.X.shape)

(392, 30, 15)


In [3]:
from gtda.homology import EuclideanCechPersistence
EP = EuclideanCechPersistence(homology_dimensions=[0, 1, 2, 3, 4])  # Parameter explained in the text
diagrams = EP.fit_transform(mydataset.X)
diagrams.shape

from gtda.diagrams import PersistenceEntropy
PE = PersistenceEntropy()
features1 = PE.fit_transform(diagrams)
print(features1.shape)
print(features1)

(392, 5)
[[ 4.07629656  6.50669919 -1.         -1.         -1.        ]
 [ 4.11461758 -1.         -1.         -1.         -1.        ]
 [ 4.11388066  6.6680747  -1.         -1.         -1.        ]
 ...
 [ 4.61221254  6.48477753 -1.         -1.          2.38307977]
 [ 4.28656544 -1.         -1.         -1.         -1.        ]
 [ 4.31622902 -1.         -1.         -1.          0.7600188 ]]


In [4]:
from gtda.plotting import plot_diagram
i = 7  # range 0- 97，是队伍
plot_diagram(diagrams[i])

In [5]:
# VR
from gtda.point_clouds import ConsistentRescaling
CR = ConsistentRescaling()
re = CR.fit_transform(mydataset.X.transpose(0, 2, 1))
from gtda.homology import VietorisRipsPersistence
VR = VietorisRipsPersistence(homology_dimensions=[0, 1, 2, 3, 4])
diagrams2 = VR.fit_transform(mydataset.X)
features2 = PE.fit_transform(diagrams2)
print(features2)
print(features2.shape)

[[ 4.07629657  0.         -1.         -1.         -1.        ]
 [ 4.11461758  0.         -1.         -1.         -1.        ]
 [ 4.1138807   0.         -1.         -1.         -1.        ]
 ...
 [ 4.61221254  1.37384641 -1.         -1.         -1.        ]
 [ 4.28656544  0.18055373 -1.         -1.         -1.        ]
 [ 4.31622901  0.99271572 -1.         -1.         -1.        ]]
(392, 5)


In [6]:
x = np.concatenate([features1, features2], axis=1)
print(x.shape)

(392, 10)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from tqdm import tqdm

mydataset.X = np.concatenate([features1, features2], axis=1)
exp_stats_train = []
exp_stats_valid = []
EXPERIMENT = 5
K = 10
pybar = tqdm(total=EXPERIMENT*K, desc='k折交叉验证', unit='次')
for exp in range(EXPERIMENT):
    for X_train, y_train, X_valid, y_valid in mydataset.DataIter():
        #model = RandomForestClassifier()  #没事了，随机森林
        model = svm.SVC(C=1.0, kernel='linear',  # linear效果最好
            gamma='scale', decision_function_shape='ovo')
        model.fit(X_train, y_train)
        acc = model.score(X_valid, y_valid)
        print(f'训练精度 {model.score(X_train, y_train):.3f}', end='  ')
        print('测试集精度', acc)
        exp_stats_train.append(model.score(X_train, y_train))  
        exp_stats_valid.append(acc)

exp_stats_train = np.array(exp_stats_train)
exp_stats_valid = np.array(exp_stats_valid)

print(f'训练集acc: mean = {exp_stats_train.mean():.3f}, std = {exp_stats_train.std():.3f}')
print(f'估计泛化误差：验证集acc: mean = {exp_stats_valid.mean():.3f}, std = {exp_stats_valid.std():.3f}')

k折交叉验证:   0%|          | 0/50 [00:00<?, ?次/s]

训练精度 0.466  测试集精度 0.425
训练精度 0.455  测试集精度 0.4
训练精度 0.462  测试集精度 0.46153846153846156
训练精度 0.479  测试集精度 0.41025641025641024
训练精度 0.456  测试集精度 0.4358974358974359
训练精度 0.467  测试集精度 0.4358974358974359
训练精度 0.462  测试集精度 0.48717948717948717
训练精度 0.462  测试集精度 0.46153846153846156
训练精度 0.470  测试集精度 0.358974358974359
训练精度 0.482  测试集精度 0.3076923076923077
训练精度 0.463  测试集精度 0.6
训练精度 0.460  测试集精度 0.375
训练精度 0.484  测试集精度 0.28205128205128205
训练精度 0.490  测试集精度 0.23076923076923078
训练精度 0.467  测试集精度 0.4358974358974359
训练精度 0.484  测试集精度 0.4358974358974359
训练精度 0.476  测试集精度 0.41025641025641024
训练精度 0.482  测试集精度 0.41025641025641024
训练精度 0.462  测试集精度 0.5128205128205128
训练精度 0.487  测试集精度 0.3333333333333333
训练精度 0.472  测试集精度 0.475
训练精度 0.455  测试集精度 0.5
训练精度 0.459  测试集精度 0.5128205128205128
训练精度 0.462  测试集精度 0.46153846153846156
训练精度 0.470  测试集精度 0.358974358974359
训练精度 0.453  测试集精度 0.41025641025641024
训练精度 0.490  测试集精度 0.1794871794871795
训练精度 0.473  测试集精度 0.38461538461538464
训练精度 0.470  测试集精度 0.358974358974359
训练精