In [2]:
import pandas as pd
import math
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('fifa.csv')
df.head()

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,https://cdn.sofifa.org/teams/2/light/241.png,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,https://cdn.sofifa.org/teams/2/light/45.png,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,https://cdn.sofifa.org/teams/2/light/73.png,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,https://cdn.sofifa.org/teams/2/light/11.png,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,https://cdn.sofifa.org/teams/2/light/10.png,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [4]:
df = df[df['Overall'] > 74]
df.shape

(1244, 60)

# Normalizing Data

In [5]:
print(df.shape)
df.dropna(subset=['Position'], axis=0, inplace=True)
df.shape

(1244, 60)


(1243, 60)

In [6]:
selected_columns = ['Crossing', 'Finishing', 'HeadingAccuracy',
       'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
       'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed',
       'Agility', 'Reactions', 'Balance', 'ShotPower', 'Jumping',
       'Stamina', 'Strength', 'LongShots', 'Aggression', 'Interceptions',
       'Positioning', 'Vision', 'Penalties', 'Composure', 'Marking',
       'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes']
label = df['Position']
rdf = df[selected_columns]
rdf.head()

Unnamed: 0,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,...,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0
1,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,...,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,...,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0
3,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,...,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0
4,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,...,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0


In [7]:
pos_map = {
    "F": ['CF', 'LF', 'RF', 'LS', 'ST', 'LW', 'RS', 'RW'],
    "M": ['CAM', 'CDM', 'CM', 'LAM', 'LCM', 'LDM', 'LM', 'RAM', 'RCM', 'RDM', 'RM'],
    "D": ['CB', 'LB', 'LCB', 'LWB', 'RB', 'RCB', 'RWB'],
    "G": ['GK']
    }
label = label.tolist()
for i in range(len(label)):
    mapped = False
    for j in pos_map.keys():
        if label[i] in pos_map[j]:
            label[i] = j
            mapped = True
            break
    if not mapped:
        print(i)
label = pd.DataFrame(label,columns=['Position'])
label.head()

Unnamed: 0,Position
0,F
1,F
2,F
3,G
4,M


In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(rdf, label, test_size=0.3, random_state=30)

In [9]:
X_train = pd.DataFrame(X_train, columns=selected_columns)
X_test = pd.DataFrame(X_test, columns=selected_columns)
Y_train = pd.DataFrame(Y_train, columns=['Position'])
Y_test = pd.DataFrame(Y_test, columns=['Position'])
print("X_train: " + str(X_train.shape))
print("X_test: " + str(X_test.shape))
print("Y_train: " + str(Y_train.shape))
print("Y_test: " + str(Y_test.shape))

X_train: (870, 34)
X_test: (373, 34)
Y_train: (870, 1)
Y_test: (373, 1)


# Clustring Algorithm begin

In [20]:
DESIRED_CLUSTERS = 4
CONSTANT_NUMBER_G = 5
CONSTATN_NUMBER_K = 5

In [11]:
from scipy.spatial import distance_matrix

In [12]:
D_original = pd.DataFrame(distance_matrix(X_train.values,X_train.values), index=list(range(X_train.shape[0])), columns=list(range(X_train.shape[0])))
D_original.shape

(870, 870)

In [13]:
K = CONSTATN_NUMBER_K
R_k = []
for idx ,x in D_original.iterrows():
    R_k.append(list(x.nsmallest(K + 1).keys()))

R_k = pd.DataFrame(R_k)
R_k.shape

(870, 6)

In [14]:
N = X_train.shape[0]
L = list(range(0,N))
G = CONSTANT_NUMBER_G

In [15]:
C_previous = N
C_current = math.floor(N/G)

In [16]:
D_current = np.zeros((N,N))
# R_k_list = R_k.values.tolist()
D_org_list = D_original.values.tolist()
R_k_list = R_k.values.tolist()
k2 = (K + 1)**2
for i in range(N):
    for j in range(i, N):
        sum_val = 0
        R_k_i = R_k_list[i]
        R_k_j = R_k_list[j]
        for h in range(K):
            for p in range(K):                
                # sum_val += D_original.iloc[R_k_i[h],R_k_j[p]]
                sum_val += D_org_list[R_k_i[h]][R_k_j[p]]
        # R_k_i = R_k_list[i]
        # R_k_j = R_k_list[j]
        # temp = D_original.iloc[R_k_i, R_k_j]
        dist = sum_val/(k2)
        D_current[i][j] = dist
        D_current[j][i] = dist
D_current = pd.DataFrame(D_current)
print(D_current.shape)
D_current.head()

(870, 870)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,860,861,862,863,864,865,866,867,868,869
0,22.553955,44.926595,45.851284,44.363636,196.231375,90.20123,57.31459,77.73979,66.398755,74.016108,...,44.252398,79.973745,61.820968,84.224243,56.436411,62.403293,73.374119,184.807299,93.762007,81.43444
1,44.926595,20.494419,36.363306,32.521685,203.269588,71.613182,51.601723,66.103628,39.272678,61.340395,...,34.237055,75.308335,35.240364,71.016189,29.798189,40.074295,56.427931,192.872695,75.614605,66.28997
2,45.851284,36.363306,21.887353,37.185239,216.976581,89.890743,50.630996,77.178184,52.068302,76.80354,...,34.768885,86.918617,48.134549,85.41313,41.175684,49.338861,71.858297,206.259853,89.921689,83.185054
3,44.363636,32.521685,37.185239,23.073137,203.458567,70.497423,55.146238,69.005874,43.098179,60.876288,...,41.643798,76.956924,37.075292,73.706264,32.061537,45.950131,53.809818,193.152149,75.531926,64.932332
4,196.231375,203.269588,216.976581,203.458567,20.444963,196.425013,202.814022,195.661643,210.131946,196.366536,...,202.962955,186.190675,207.610055,194.441998,206.397518,202.563782,198.820016,36.847204,203.693699,191.793349


In [19]:
len(D_current)

870

In [25]:
def identify_key_elements(D_current: pd.DataFrame, C_current: int)->pd.DataFrame:
    
    # Finding first key element which has the least average distance from others
    first_key = math.inf
    first_key_index = int()
    D_current_list = D_current.values.tolist()
    m = len(D_current_list)
    s = set()
    k = list(np.arange(0,m,1))
    for i in range(m):
        avg = 0
        avg = sum(D_current_list[i])/m
        if(avg<first_key):
            first_key = avg
            first_key_index = i
    k.remove(first_key_index)
    s.add(first_key_index)
    n = 1
    
    # Doing iterations until we have enough clusters
    while(n < C_current):
    
        next_key_index = int()
        min_dist = math.inf
        max_dist = -math.inf
    
        # Finding next key element which has the most minimums distance to the current key elements
        for index in k:
            min_dist = math.inf
            for j in range(0,len(s)):
                dist = D_current_list[index][j]
                if(dist < min_dist):
                    min_dist =dist
            if(min_dist > max_dist):
                max_dist = min_dist
                next_key_index = index
        k.remove(next_key_index)
        s.add(next_key_index)
        n = n + 1
    
    # Returning all key elements(aka new clusters)
    return pd.DataFrame(s)




In [26]:
S = identify_key_elements(D_current=D_current, C_current=C_current)