In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('fifa_22.csv', sep = '\;')   # 구분자 ; 
data = data.drop('player_id', axis = 1)

In [3]:
print(data.shape)
data.head(30)    # nationality에 "", '' 등 불필요한 문자 있음

(18000, 7)


Unnamed: 0,name,nationality,position,overall,age,potential,team
0,Lionel Messi,Argentina,"ST,CF,RW",93,34,93,Paris Saint-Germain
1,Robert Lewandowski,Poland,ST,92,33,92,Bayern München
2,Kylian Mbappé,France,"ST,LW",91,22,95,Paris Saint-Germain
3,Jan Oblak,Slovenia,GK,91,28,93,Atlético de Madrid
4,Kevin De Bruyne,elgium,"CM,CAM",91,30,91,Manchester City
5,Neymar Jr,Brazil,"CAM,LW",91,29,91,Paris Saint-Germain
6,Cristiano Ronaldo,Portugal,"ST,LW",91,36,91,Manchester United
7,Harry Kane,England,ST,90,28,90,Tottenham Hotspur
8,Gianluigi Donnarumma,Italy,GK,89,22,93,Paris Saint-Germain
9,Alisson,Brazil,GK,89,28,90,Liverpool


In [4]:
data.nationality

0                  Argentina
1                     Poland
2                     France
3                   Slovenia
4                     elgium
                ...         
17995                 Norway
17996          """Australia"
17997       """Saudi Arabia"
17998    Republic of Ireland
17999                 Sweden
Name: nationality, Length: 18000, dtype: object

In [5]:
# 정규표현식으로 날림
data['nationality'] = data['nationality'].str.replace(pat=r'[^A-Za-z0-9]', repl= r' ', regex=True)
data['nationality'] = data['nationality'].str.replace(pat=r'[\s\s+]', repl= r' ', regex=True)

In [6]:
data

Unnamed: 0,name,nationality,position,overall,age,potential,team
0,Lionel Messi,Argentina,"ST,CF,RW",93,34,93,Paris Saint-Germain
1,Robert Lewandowski,Poland,ST,92,33,92,Bayern München
2,Kylian Mbappé,France,"ST,LW",91,22,95,Paris Saint-Germain
3,Jan Oblak,Slovenia,GK,91,28,93,Atlético de Madrid
4,Kevin De Bruyne,elgium,"CM,CAM",91,30,91,Manchester City
...,...,...,...,...,...,...,...
17995,Ulrik Mathisen,Norway,CM,52,22,62,Lillestrøm SK
17996,Damian Tsekenis,Australia,ST,52,20,67,Central Coast Mariners
17997,Abdullah Al Radeef,Saudi Arabia,ST,52,18,70,Al Hilal
17998,Aaron McNally,Republic of Ireland,"RB,RM,LM",52,21,66,Longford Town


# 포지션 별 유사도 측정

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
data['position']

0        ST,CF,RW
1              ST
2           ST,LW
3              GK
4          CM,CAM
           ...   
17995          CM
17996          ST
17997          ST
17998    RB,RM,LM
17999       RB,LB
Name: position, Length: 18000, dtype: object

In [9]:
# CountVectorizer를 적용하기 위해 공백문자로 word 단위가 구분되는 문자열로 변환. 
data['position'] = data['position'].apply(lambda x : x.replace(',',' '))
data

Unnamed: 0,name,nationality,position,overall,age,potential,team
0,Lionel Messi,Argentina,ST CF RW,93,34,93,Paris Saint-Germain
1,Robert Lewandowski,Poland,ST,92,33,92,Bayern München
2,Kylian Mbappé,France,ST LW,91,22,95,Paris Saint-Germain
3,Jan Oblak,Slovenia,GK,91,28,93,Atlético de Madrid
4,Kevin De Bruyne,elgium,CM CAM,91,30,91,Manchester City
...,...,...,...,...,...,...,...
17995,Ulrik Mathisen,Norway,CM,52,22,62,Lillestrøm SK
17996,Damian Tsekenis,Australia,ST,52,20,67,Central Coast Mariners
17997,Abdullah Al Radeef,Saudi Arabia,ST,52,18,70,Al Hilal
17998,Aaron McNally,Republic of Ireland,RB RM LM,52,21,66,Longford Town


In [46]:
# CountVectorizer로 학습시켰더니 18000명 선수에 대한 279개 포지션의 '포지션 매트릭스'가 생성되었다.

count_vect = CountVectorizer(min_df=0, ngram_range=(1, 3))
position_mat = count_vect.fit_transform(data['position'])
print(position_mat.shape)
print(position_mat)

(18000, 279)
  (0, 231)	1
  (0, 43)	1
  (0, 191)	1
  (0, 240)	1
  (0, 45)	1
  (0, 242)	1
  (1, 231)	1
  (2, 231)	1
  (2, 106)	1
  (2, 259)	1
  (3, 56)	1
  (4, 47)	1
  (4, 0)	1
  (4, 48)	1
  (5, 106)	1
  (5, 0)	1
  (5, 4)	1
  (6, 231)	1
  (6, 106)	1
  (6, 259)	1
  (7, 231)	1
  (8, 56)	1
  (9, 56)	1
  (10, 130)	1
  (10, 26)	1
  :	:
  (17982, 106)	1
  (17983, 7)	1
  (17984, 56)	1
  (17985, 56)	1
  (17986, 0)	1
  (17987, 0)	1
  (17988, 57)	1
  (17989, 56)	1
  (17990, 7)	1
  (17991, 47)	1
  (17992, 231)	1
  (17993, 47)	1
  (17994, 7)	1
  (17995, 47)	1
  (17996, 231)	1
  (17997, 231)	1
  (17998, 130)	1
  (17998, 170)	1
  (17998, 91)	1
  (17998, 182)	1
  (17998, 162)	1
  (17998, 166)	1
  (17999, 130)	1
  (17999, 57)	1
  (17999, 147)	1


In [47]:
count_vect.vocabulary_

{'st': 231,
 'cf': 43,
 'rw': 191,
 'st cf': 240,
 'cf rw': 45,
 'st cf rw': 242,
 'lw': 106,
 'st lw': 259,
 'gk': 56,
 'cm': 47,
 'cam': 0,
 'cm cam': 48,
 'cam lw': 4,
 'rb': 130,
 'cdm': 26,
 'rb cdm': 139,
 'cb': 7,
 'rw lw': 192,
 'rm': 170,
 'lm': 91,
 'rm lm': 182,
 'lm cf': 96,
 'rm lm cf': 184,
 'cb cdm': 9,
 'cdm cm': 28,
 'cb cdm cm': 10,
 'lb': 57,
 'cam cf': 1,
 'rm cam': 171,
 'cam rw': 5,
 'rm cam rw': 174,
 'st rm': 266,
 'rm cm': 177,
 'st rm cm': 269,
 'lm lw': 103,
 'rm lm lw': 186,
 'rb lb': 147,
 'rm rw': 189,
 'st rm rw': 272,
 'cf lw': 44,
 'lb lm': 73,
 'st lm': 253,
 'cdm cm cam': 29,
 'cam cf lw': 2,
 'st rw': 273,
 'st rw lw': 274,
 'st lm lw': 257,
 'cm cam cf': 49,
 'cm cam lw': 50,
 'st rm cam': 267,
 'rb rm': 162,
 'lwb': 107,
 'lwb lm': 117,
 'lwb lm lw': 121,
 'lm cam': 92,
 'lm cam lw': 94,
 'rwb': 193,
 'rwb rb': 215,
 'rwb rb rm': 222,
 'rb rm rw': 167,
 'st cam': 232,
 'st cam cf': 233,
 'rm rw lw': 190,
 'cm cam rw': 51,
 'lb lwb': 79,
 'lb lwb lm

# 코사인 유사도(cosine_similarity)이용해서 포지션별 유사도 계산

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

position_cos_sim = cosine_similarity(position_mat, position_mat)

print(position_cos_sim.shape)
print(position_cos_sim[:10])

(18000, 18000)
[[1.         0.40824829 0.23570226 ... 0.40824829 0.         0.        ]
 [0.40824829 1.         0.57735027 ... 1.         0.         0.        ]
 [0.23570226 0.57735027 1.         ... 0.57735027 0.         0.        ]
 ...
 [0.40824829 1.         0.57735027 ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [49]:
# 자료를 정렬하는 것이 아니라 순서만 알고 싶다면 argsort
position_cos_sim_sorted_ind = position_cos_sim.argsort()[:, ::-1]
print(position_cos_sim_sorted_ind[:5])

[[    0  7080 12329 ... 11168 11167  8999]
 [10901 10590  1150 ... 11425 11424  8999]
 [ 6495  7446   706 ... 11243 11241  8999]
 [10431  2245 15312 ... 11480 11479     0]
 [ 1175  5849  5744 ... 11118 11117     0]]


# 추천 ver1. 포지션 코사인 유사도에 의해 포지션을 추천하는 함수

In [50]:
player_name = data[data['name'] == 'Kevin De Bruyne']
player_name

Unnamed: 0,name,nationality,position,overall,age,potential,team,new_rating
4,Kevin De Bruyne,elgium,CM CAM,91,30,91,Manchester City,82.564721


In [51]:
player_index = player_name.index.values
player_index

array([4], dtype=int64)

In [52]:
similar_indexes = position_cos_sim_sorted_ind[player_index, :10]
similar_indexes

array([[ 1175,  5849,  5744, 10155, 12758,  5783, 10117,  3899,  3894,
          288]], dtype=int64)

In [53]:
similar_indexes = similar_indexes.reshape(-1)
print(similar_indexes)

[ 1175  5849  5744 10155 12758  5783 10117  3899  3894   288]


In [54]:
data.iloc[similar_indexes]

Unnamed: 0,name,nationality,position,overall,age,potential,team,new_rating
1175,Nicolás Filhei,Brazil,CM CAM,76,25,76,São Paulo,73.76938
5849,Josh Onomah,England,CM CAM,69,24,76,Fulham,73.655406
5744,Bruno Fagundeiro,Brazil,CM CAM,69,21,69,Cuiabá,70.129859
10155,Lucas Lingman,Finland,CM CAM,65,23,74,HJK Helsinki,72.607449
12758,Franco Ragusa,Chile,CM CAM,63,28,63,Cobresal,67.296112
5783,Stijn Spierings,Netherlands,CM CAM,69,25,73,Toulouse FC,72.144457
10117,Christián Herc,Slovakia,CM CAM,65,23,74,Grasshopper Club Zürich,72.607449
3899,Joel Soñora,United States,CM CAM,71,25,76,Banfield,73.689141
3894,Morgan Gibbs-White,England,CM CAM,71,21,81,Sheffield United,76.243098
288,Giovani Lo Celso,Argentina,CM CAM,81,25,85,Tottenham Hotspur,78.73685


### 함수화

In [55]:
def find_sim_player_ver1(df, sorted_ind, player_name, top_n=10):
    
    player_name = df[df['name'] == player_name]
    
    player_index = player_name.index.values
    similar_indexes = sorted_ind[player_index, :(top_n)]
    
    print(similar_indexes)    
    similar_indexes = similar_indexes.reshape(-1)
    
    return df.iloc[similar_indexes]

In [56]:
similar_player = find_sim_player_ver1(data, position_cos_sim_sorted_ind, 'Jan Oblak', 10)
similar_player

[[10431  2245 15312 15311 15309 15308 15303 15299 15296 12590]]


Unnamed: 0,name,nationality,position,overall,age,potential,team,new_rating
10431,Thomas Mikkelsen,Denmark,GK,65,38,65,Brøndby IF,68.208952
2245,Frederik Rønnow,Denmark,GK,74,29,74,1. FC Union Berlin,72.695709
15312,Adam Wilk,Poland,GK,60,23,67,Cracovia,69.271802
15311,Tim Wiesner,Germany,GK,60,24,67,VfL Osnabrück,69.271802
15309,Lewis Thomas,Wales,GK,60,24,66,Forest Green Rovers,68.803052
15308,Miguel Vargas,Chile,GK,60,25,66,Unión La Calera,68.803052
15303,Ameen Bukhari,Saudi Arabia,GK,60,24,68,Al Nassr,69.740552
15299,Antoine Lejoly,elgium,GK,60,23,67,Beerschot,69.271802
15296,Dean Lyness,England,GK,60,30,60,St. Mirren,65.990552
12590,Koki Otani,Japan,GK,63,32,63,Hokkaido Consadole Sapporo,67.296112


In [57]:
data[data['position'] == 'GK'].sort_values(by = 'potential' , ascending=False).head(20)

Unnamed: 0,name,nationality,position,overall,age,potential,team,new_rating
3,Jan Oblak,Slovenia,GK,91,28,93,Atlético de Madrid,83.709375
8,Gianluigi Donnarumma,Italy,GK,89,22,93,Paris Saint-Germain,83.591023
11,Ederson,Brazil,GK,89,28,91,Manchester City,82.457265
15,Thibaut Courtois,elgium,GK,89,29,91,Real Madrid,82.457265
9,Alisson,Brazil,GK,89,28,90,Liverpool,81.890386
97,Mike Maignan,France,GK,84,26,89,Milan,81.070991
19,Keylor Navas,Costa Rica,GK,88,34,88,Paris Saint-Germain,80.710197
379,Dean Henderson,England,GK,80,24,87,Manchester United,79.775613
33,Wojciech Szczesny,Poland,GK,87,31,87,Juventus,80.101875
36,Hugo Lloris,France,GK,87,34,87,Tottenham Hotspur,80.101875


# 가중평점(평점&평가횟수) 반영한 선수 추천

In [58]:
#고정
C = data['potential'].mean()
m = data['overall'].quantile(0.6)

def new_rating(record):
    v = record['overall']
    R = record['potential']
    
    return ( (v/(v+m)) * R ) + ( (m/(m+v)) * C )

In [59]:
data['new_rating'] = data.apply(new_rating, axis=1)

In [60]:
data

Unnamed: 0,name,nationality,position,overall,age,potential,team,new_rating
0,Lionel Messi,Argentina,ST CF RW,93,34,93,Paris Saint-Germain,83.824787
1,Robert Lewandowski,Poland,ST,92,33,92,Bayern München,83.192442
2,Kylian Mbappé,France,ST LW,91,22,95,Paris Saint-Germain,84.854029
3,Jan Oblak,Slovenia,GK,91,28,93,Atlético de Madrid,83.709375
4,Kevin De Bruyne,elgium,CM CAM,91,30,91,Manchester City,82.564721
...,...,...,...,...,...,...,...,...
17995,Ulrik Mathisen,Norway,CM,52,22,62,Lillestrøm SK,67.256589
17996,Damian Tsekenis,Australia,ST,52,20,67,Central Coast Mariners,69.423256
17997,Abdullah Al Radeef,Saudi Arabia,ST,52,18,70,Al Hilal,70.723256
17998,Aaron McNally,Republic of Ireland,RB RM LM,52,21,66,Longford Town,68.989922


## 추천 ver2. 먼저 포지션 유사성 높은 선수 20개 선정 후(F1), 가중평점순 10개 선정(F2)

In [61]:
def find_sim_player_ver2(df, sorted_ind, player_name, top_n=10):
    player_name = df[df['name'] == player_name]
    player_index = player_name.index.values
    
    similar_indexes = sorted_ind[player_index, :(top_n*2)]
    similar_indexes = similar_indexes.reshape(-1)

    # 기준 선수 index는 제외
    similar_indexes = similar_indexes[similar_indexes != player_index]
    
    return df.iloc[similar_indexes].sort_values('new_rating', ascending=False)[:top_n]

In [62]:
position_cos_sim_sorted_ind

array([[    0,  7080, 12329, ..., 11168, 11167,  8999],
       [10901, 10590,  1150, ..., 11425, 11424,  8999],
       [ 6495,  7446,   706, ..., 11243, 11241,  8999],
       ...,
       [10901, 10590,  1150, ..., 11425, 11424,  8999],
       [13531,  8715, 10296, ..., 11130, 11129,     0],
       [17999, 13563,  8501, ..., 11370, 11369,     0]], dtype=int64)

In [63]:
similar_player = find_sim_player_ver2(data, position_cos_sim_sorted_ind, 'Kevin De Bruyne', 15)
similar_player

Unnamed: 0,name,nationality,position,overall,age,potential,team,new_rating
633,Florian Wirtz,Germany,CM CAM,78,18,89,Bayer 04 Leverkusen,80.745142
288,Giovani Lo Celso,Argentina,CM CAM,81,25,85,Tottenham Hotspur,78.73685
276,Lucas Paquetá,Brazil,CM CAM,81,24,85,Olympique Lyonnais,78.73685
3837,Hamed Junior Traorè,Ivory Coast,CM CAM,71,21,84,Sassuolo,77.775472
1485,Joe Willock,England,CM CAM,75,22,83,Newcastle United,77.42511
3894,Morgan Gibbs-White,England,CM CAM,71,21,81,Sheffield United,76.243098
632,Iniesta,Spain,CM CAM,79,37,79,Vissel Kobe,75.427147
621,Gylfi Sigur?sson,Iceland,CM CAM,79,32,79,Everton,75.427147
3859,Dani de Wit,Netherlands,CM CAM,71,23,77,AZ,74.199933
1175,Nicolás Filhei,Brazil,CM CAM,76,25,76,São Paulo,73.76938
