In [399]:
import warnings

import numpy as np
import pandas as pd
import torch
#from torch import nn
from sklearn.decomposition import PCA             # PCA
from sklearn.model_selection import KFold         # k-fold cross-validation
from sklearn.preprocessing import StandardScaler  # z-score normalisation

from utils import randomSplit, Accumulator

def rank2class(df):
    """Map a league-table row to a coarse tier based on its ``LgRk`` value.

    Returns 0 when the rank is at most 6, 1 when it is greater than 6 and at
    most 13, and 2 in every other case.
    """
    rank = df['LgRk']
    if rank <= 6:
        return 0
    if rank <= 13:
        return 1
    return 2

In [404]:
class myDataset():
    """Squad-level dataset: one fixed-size block of player statistics per squad plus a tier label.

    Player statistics are read from ``datacleaning/modified_data.csv`` and the
    league table from ``datacleaning/Big 5 European Leagues.xlsx``. Every squad
    listed in the league table becomes one sample: the feature rows of its
    players are padded or truncated to ``n_players`` rows and transposed.

    Attributes:
        player: player-level feature table, including the ``Squad`` column.
        league: league table reduced to the ``Squad`` and ``tier`` columns.
        X: numpy array of shape ``[n_squads, n_features, n_players]``.
        y: numpy array with the tier of each squad (see ``rank2class``).

    Args:
        renorm: if truthy, z-score every feature column (everything but ``Squad``).
        pca: if truthy, replace the feature columns by their leading principal
            components.
        n_components: number of principal components kept when ``pca`` is set.
        n_players: number of player rows every squad is padded/truncated to.
        seed: optional integer seed. When given, the padding noise and the PCA
            solver become reproducible; ``None`` keeps the unseeded behaviour.
    """

    # Class-level defaults so that an instance created without ``__init__``
    # still has a usable configuration.
    n_players = 40         # player rows kept per squad
    _pad_generator = None  # torch.Generator for padding noise; None -> global torch RNG

    def __init__(self, renorm=False, pca=False, n_components=25, n_players=40, seed=None):
        self.n_players = n_players
        if seed is not None:
            self._pad_generator = torch.Generator().manual_seed(seed)

        # NOTE(review): 'unicode_escape' is an unusual encoding for a CSV file; if squad
        # names contain non-ASCII characters, confirm they still match the Excel sheet.
        self.player = pd.read_csv('datacleaning/modified_data.csv', header=0, encoding='unicode_escape', delimiter=';')
        self.player = self.player.drop(['Rk', 'Player', 'Nation', 'Pos', 'Comp', 'Age', 'Born'], axis=1)

        # NOTE(review): scaler and PCA are fitted on all players before any
        # train/test split, so the folds produced by DataIter share preprocessing
        # statistics with their held-out part.
        if renorm:
            # z-score normalisation; StandardScaler standardises each column independently.
            feature_cols = [col for col in self.player.columns if col != 'Squad']
            self.player[feature_cols] = StandardScaler().fit_transform(self.player[feature_cols])
        if pca:
            squad = self.player['Squad']
            features = self.player.drop('Squad', axis=1).values
            # A separate name keeps the ``pca`` flag intact.
            reducer = PCA(n_components=n_components, random_state=seed)
            reduced = reducer.fit_transform(features)
            print(sum(reducer.explained_variance_ratio_))  # total explained variance
            # Reuse the original index so the Squad column lines up row by row.
            self.player = pd.DataFrame(reduced, index=squad.index)
            self.player['Squad'] = squad

        self.league = pd.read_excel('datacleaning/Big 5 European Leagues.xlsx', sheet_name='Big 5 European Leagues Stats')
        self.league['tier'] = self.league.apply(rank2class, axis=1)
        self.league = self.league[['Squad', 'tier']]

        # One [1, n_features, n_players] block per squad, stacked along axis 0.
        blocks = [self.Squad2Player(name) for name in self.league['Squad']]
        self.X = torch.cat(blocks, dim=0).numpy()  # [n_squads, n_features, n_players]
        self.y = np.array(self.league['tier'].tolist())

        # Historical note: an earlier experiment applied PCA to the stacked array
        # (transpose, flatten the first two axes, fit, reshape back). The author
        # suspected the reshape back was wrong and blamed it for a large drop in
        # accuracy, so PCA is applied to the player table above instead.

    def Squad2Player(self, squad):
        """Return the feature block of one squad, shaped ``[1, n_features, n_players]``.

        Squads with fewer than ``n_players`` rows are padded symmetrically
        (before and after the real rows) with small Gaussian noise
        (``randn / 100``); squads with more rows keep only the first
        ``n_players``. A warning is issued when no player row matches
        ``squad``, because the result then consists of noise only.
        """
        rows = self.player[self.player['Squad'] == squad].drop('Squad', axis=1)
        stats = torch.as_tensor(rows.to_numpy(), dtype=torch.float32)
        n_rows, n_features = stats.shape

        if n_rows == 0:
            warnings.warn(f'no player rows found for squad {squad!r}; its sample is pure padding noise')

        missing = self.n_players - n_rows
        if missing > 0:
            n_before = missing // 2
            n_after = missing - n_before
            before = torch.randn(n_before, n_features, generator=self._pad_generator) / 100
            after = torch.randn(n_after, n_features, generator=self._pad_generator) / 100
            stats = torch.cat([before, stats, after], dim=0)
        else:
            stats = stats[:self.n_players, :]

        # [n_players, n_features] -> [1, n_features, n_players]
        return stats.unsqueeze(0).transpose(1, 2)

    def DataIter(self, K=10, shuffle=True, random_state=None):
        """Yield ``(X_train, y_train, X_test, y_test)`` for each of ``K`` cross-validation folds.

        Args:
            K: number of folds.
            shuffle: whether to shuffle the samples before splitting.
            random_state: optional seed for the shuffle; ignored when
                ``shuffle`` is false, because KFold rejects a seed without shuffling.
        """
        splitter = KFold(n_splits=K, shuffle=shuffle,
                         random_state=random_state if shuffle else None)
        for train_index, test_index in splitter.split(self.X):
            yield (self.X[train_index], self.y[train_index],
                   self.X[test_index], self.y[test_index])
    

# Build the dataset with PCA-reduced player features and show the layout of X.
mydataset = myDataset(pca=True)
print(mydataset.X.shape)

0.9998697641197498
(98, 25, 40)


In [410]:
# Cech persistence diagrams -> persistence-entropy features.
from gtda.diagrams import PersistenceEntropy
from gtda.homology import EuclideanCechPersistence

# Homology dimensions 0 to 4 (parameter explained in the text).
# NOTE(review): mydataset.X was printed as (98, 25, 40); giotto-tda presumably reads
# the last axis as point coordinates, i.e. 25 points in 40 dimensions per sample.
# Confirm this orientation is the intended one.
EP = EuclideanCechPersistence(homology_dimensions=[0, 1, 2, 3, 4])
diagrams = EP.fit_transform(mydataset.X)

# One entropy value per requested homology dimension for every sample.
PE = PersistenceEntropy()
features1 = PE.fit_transform(diagrams)
print(features1.shape)
print(features1)

(98, 5)
[[ 2.41060734e+00  5.22965660e+00 -1.00000000e+00 -1.00000000e+00
   7.14851309e-01]
 [ 2.23922712e+00  3.36775733e+00  2.19105375e+00  3.10240712e+00
   8.92328843e-02]
 [ 2.48386721e+00  1.56628736e+00  9.04901692e-01  4.02837461e+00
   1.60990275e+00]
 [ 2.60383576e+00  4.21513152e+00 -1.00000000e+00 -1.00000000e+00
   1.59362108e+00]
 [ 2.38870907e+00  6.07680553e+00 -1.00000000e+00 -1.00000000e+00
   7.22024106e-01]
 [ 2.31053739e+00  5.89003368e+00  2.43102000e-01 -1.00000000e+00
  -1.00000000e+00]
 [ 2.26364237e+00  6.21248283e+00  5.03917733e+00  1.65232565e+00
  -1.00000000e+00]
 [ 2.30035285e+00 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
   7.77964658e-01]
 [ 2.42849369e+00  2.31137437e+00  1.23950275e+00  3.99697917e+00
   1.39759638e+00]
 [ 2.31570323e+00  1.25575240e+00  2.96693533e-01 -1.00000000e+00
  -1.00000000e+00]
 [ 2.41462050e+00  4.96134860e+00  5.41332534e+00  2.73205994e+00
   4.32598780e-01]
 [ 2.38958636e+00  4.93509644e+00  3.13794174e+00  2.8519

In [393]:
# Plot the Cech persistence diagram (computed by EP) of a single team.
from gtda.plotting import plot_diagram
i = 7  # team index; valid range is 0-97
plot_diagram(diagrams[i])

In [408]:
# VR
from gtda.point_clouds import ConsistentRescaling
CR = ConsistentRescaling()
re = CR.fit_transform(mydataset.X.transpose(0, 2, 1))
from gtda.homology import VietorisRipsPersistence
VR = VietorisRipsPersistence(homology_dimensions=[0, 1, 2])
diagrams2 = VR.fit_transform(mydataset.X)
features2 = PE.fit_transform(diagrams2)
print(features2)
print(features2.shape)

[[ 2.41060734  1.76600794 -1.         -1.        ]
 [ 2.23922718 -1.         -1.         -1.        ]
 [ 2.48386721 -1.         -1.         -1.        ]
 [ 2.60383576 -1.         -1.         -1.        ]
 [ 2.38870908  0.89199347 -1.         -1.        ]
 [ 2.31053739  0.         -1.         -1.        ]
 [ 2.26364229  0.         -1.         -1.        ]
 [ 2.30035284  0.         -1.         -1.        ]
 [ 2.42849368  0.         -1.         -1.        ]
 [ 2.31570323 -1.         -1.         -1.        ]
 [ 2.4146205   0.         -1.         -1.        ]
 [ 2.38958637  0.74619461 -1.         -1.        ]
 [ 2.05867417 -1.         -1.         -1.        ]
 [ 2.38585623 -1.         -1.         -1.        ]
 [ 2.31075319 -1.         -1.         -1.        ]
 [ 2.48399844 -1.         -1.         -1.        ]
 [ 2.4515316  -1.         -1.         -1.        ]
 [ 2.3143296  -1.         -1.         -1.        ]
 [ 2.1220446   0.         -1.         -1.        ]
 [ 2.350184   -1.         -1.  

In [411]:
# Put the Cech and Vietoris-Rips entropy features side by side (column-wise).
x = np.hstack((features1, features2))
print(x.shape)

(98, 9)


In [425]:
# K-fold evaluation of a linear SVM on the topological features.
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

# NOTE(review): this overwrites mydataset.X (the per-squad point clouds) with the
# 2-D feature matrix so that DataIter() yields feature rows. Re-running the
# persistence cells after this cell will therefore see the wrong input; rebuild
# mydataset first. Kept as is because later cells may rely on it.
mydataset.X = np.concatenate([features1, features2], axis=1)

record = []  # held-out accuracy of every fold
for X_train, y_train, X_valid, y_valid in mydataset.DataIter():
    #model = RandomForestClassifier()  # random forest alternative (abandoned)
    model = svm.SVC(C=1.0, kernel='linear',  # the linear kernel worked best
        gamma='scale', decision_function_shape='ovo')
    model.fit(X_train, y_train)
    acc = model.score(X_valid, y_valid)
    print(f'训练精度 {model.score(X_train, y_train):.3f}', end='  ')
    print('测试集精度', acc)
    record.append(acc)

# The averaged quantity is the fold accuracy, so it is labelled as accuracy
# (the previous label called it an error).
print(f'估计泛化精度：{sum(record)/len(record):.3f}')

训练精度 0.432  测试集精度 0.7
训练精度 0.523  测试集精度 0.2
训练精度 0.455  测试集精度 0.2
训练精度 0.500  测试集精度 0.0
训练精度 0.511  测试集精度 0.4
训练精度 0.466  测试集精度 0.6
训练精度 0.523  测试集精度 0.2
训练精度 0.409  测试集精度 0.2
训练精度 0.461  测试集精度 0.1111111111111111
训练精度 0.494  测试集精度 0.3333333333333333
估计泛化误差：0.294
