In [16]:
import pandas as pd  
from sklearn.cluster import KMeans  
from sklearn.metrics.pairwise import euclidean_distances  
from sklearn.preprocessing import StandardScaler
import numpy as np 
from difflib import get_close_matches
from fuzzywuzzy import process, fuzz
from nbformat import read, write
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
from mpl_toolkits.mplot3d import Axes3D

In [17]:
# Load data
top_players = pd.read_csv('../data/top_players.csv')
clean_combined_df = pd.read_csv('../data/clean_combined_df.csv')

# Step 1: Filter clean_combined_df where ps = 0 for matching
ps_0_df = clean_combined_df[clean_combined_df['ps'] == 0]
ps_1_df = clean_combined_df[clean_combined_df['ps'] == 1]  # Retain all ps = 1 rows

# Step 2: Extract player names from both datasets
ps_0_players_list = ps_0_df['Player'].tolist()
top_player_names_list = top_players['Player'].tolist()

# Step 3: Fuzzy match to identify matches for ps = 0 players
matched_players = []
for player in ps_0_players_list:
    match = process.extractOne(player, top_player_names_list, scorer=fuzz.ratio)
    if match and match[1] > 80:  # Keep matches with high similarity score
        matched_players.append(match[0])

# Step 4: Filter ps = 0 DataFrame for these matches
filtered_ps_0_df = ps_0_df[ps_0_df['Player'].isin(matched_players)]

# Combine matched ps = 0 rows and all ps = 1 rows
final_filtered_df = pd.concat([filtered_ps_0_df, ps_1_df], ignore_index=True)

# Display the final DataFrame
print(final_filtered_df)

     Unnamed: 0               Player                     Team  \
0            27        Achraf Hakimi                Paris S-G   
1           118  Alexis Mac Allister                 Brighton   
2           129              Alisson                Liverpool   
3           135      Alphonso Davies            Bayern Munich   
4           183     Andrew Robertson                Liverpool   
..          ...                  ...                      ...   
328        3035            Y. Senden              UCLA Bruins   
329        3036            Z. Babiak  Michigan State Spartans   
330        3037             Z. Kelly  Michigan State Spartans   
331        3038           Z. Martens      Michigan Wolverines   
332        3039            Z. Ramsey       Washington Huskies   

             Position  Minutes Played  Aerial Duels Attempted  \
0                  DF     2479.000000                1.090000   
1                MFFW     2123.000000                2.420000   
2                  GK   

In [18]:
non_numerical_columns = ['Player', 'Team', 'Position']
df_numeric = final_filtered_df.dropna().drop(columns=non_numerical_columns)

# Separate student players and famous players
students = df_numeric[df_numeric['ps'] == 1].drop(columns=['ps']).reset_index(drop=True)
famous = df_numeric[df_numeric['ps'] == 0].drop(columns=['ps']).reset_index(drop=True)

# Standardize the numerical features for clustering
scaler = StandardScaler()
students_scaled = scaler.fit_transform(students)
famous_scaled = scaler.fit_transform(famous)

# Convert scaled arrays back to DataFrames for easier handling
students_scaled_df = pd.DataFrame(students_scaled, columns=students.columns)
famous_scaled_df = pd.DataFrame(famous_scaled, columns=famous.columns)

# Perform K-Means clustering for grouping
num_clusters = len(famous_scaled_df)  # Use the number of famous players as clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
famous_scaled_df['cluster'] = kmeans.fit_predict(famous_scaled_df)

# Function to find the two closest famous players for each student player
def find_two_closest_famous(student, famous_scaled):
    # Calculate distances between the student and all famous players
    distances = euclidean_distances(
        student.values.reshape(1, -1), famous_scaled.drop(columns=['cluster']).values
    ).flatten()
    # Get indices of the two closest players
    closest_indices = distances.argsort()[:2]
    return famous_scaled.iloc[closest_indices].index.tolist()

# Find the two closest famous players for each student player
matches = []
for _, student in students_scaled_df.iterrows():
    closest_famous = find_two_closest_famous(student, famous_scaled_df)
    matches.append(closest_famous)

# Output the results
results = pd.DataFrame(matches, columns=['Closest_Player_1', 'Closest_Player_2'])
results.index.name = 'Student_Player_Index'
results.reset_index(inplace=True)



In [19]:
students.head()

Unnamed: 0.1,Unnamed: 0,Minutes Played,Aerial Duels Attempted,Defensive Duels Attempted,Touches in Penalty Area,Offsides,Progressive Runs,Fouls Drawn,Goals,Assists,...,Fouls Committed,Total Passes Attempted,Long Passes Attempted,Forward Passes,Passes into Final Third,Passes into Penalty Area,Passes Received,Crosses Attempted,Accurate Crosses,Shot Assists
0,2786,91.142857,3.785714,3.428571,1.214286,0.0,0.285714,0.0,0.0,0.0,...,0.714286,38.5,4.785714,15.071429,3.928571,0.428571,25.357143,0.142857,0.071429,0.285714
1,2787,91.447368,3.421053,8.131579,0.578947,0.0,1.605263,1.631579,0.026316,0.026316,...,0.315789,40.342105,7.131579,18.105263,8.789474,2.657895,28.447368,2.131579,0.894737,0.315789
2,2788,44.276596,1.574468,2.978723,1.446809,0.085106,0.425532,0.382979,0.106383,0.06383,...,0.510638,15.723404,0.87234,4.702128,2.06383,0.978723,10.765957,0.765957,0.276596,0.425532
3,2789,74.4,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,15.0,4.0,8.0,0.4,0.0,7.2,0.0,0.0,0.0
4,2790,69.320513,3.192308,1.820513,2.384615,0.371795,0.435897,0.935897,0.423077,0.051282,...,0.679487,12.820513,0.307692,1.782051,1.038462,0.461538,8.269231,0.576923,0.064103,0.346154


In [20]:
results.head()

Unnamed: 0,Student_Player_Index,Closest_Player_1,Closest_Player_2
0,0,35,26
1,1,33,10
2,2,27,12
3,3,2,21
4,4,15,23


In [21]:
# Function to get player details using indices
def get_player_details(player_indices, df, cols=['Player', 'Team', 'Position']):
    """Fetch details for the given player indices."""
    return final_filtered_df[final_filtered_df['Unnamed: 0'].isin(player_indices)][cols].values.tolist()

# Prepare results with detailed player information
detailed_results = []
for index, row in results.iterrows():
    # Map the student index from the `students` table to the original `Unnamed: 0` in `df`
    student_index_in_students = row['Student_Player_Index']
    student_index_in_original = students.iloc[student_index_in_students]['Unnamed: 0']
    student_row = final_filtered_df[final_filtered_df['Unnamed: 0'] == student_index_in_original]
    student_name = student_row['Player'].values[0]
    student_position = student_row['Position'].values[0]  # Get the position of the student player

    # Map closest famous player indices to the original `Unnamed: 0` in `df`
    closest_indices_in_famous = [
        famous.iloc[row['Closest_Player_1']]['Unnamed: 0'],
        famous.iloc[row['Closest_Player_2']]['Unnamed: 0']
    ]

    # Get details for closest famous players
    closest_players = get_player_details(closest_indices_in_famous, final_filtered_df)

    # Append results
    detailed_results.append({
        'Student_Name': student_name,
        'Student_Position': student_position,  # Add the student position
        'Closest_Player_Names': ', '.join([p[0] for p in closest_players]),
        'Closest_Player_Teams': ', '.join([p[1] for p in closest_players]),
        'Closest_Player_Positions': ', '.join([p[2] for p in closest_players])
    })

# Convert detailed results to a DataFrame
detailed_results_df = pd.DataFrame(detailed_results)

# Save the detailed results
detailed_results_df.to_csv('../data/student_famous_detailed_matches.csv', index=False, encoding='latin-1', errors='ignore')

print("Detailed results have been saved to 'student_famous_detailed_matches.csv'.")
detailed_results_df.head()

Detailed results have been saved to 'student_famous_detailed_matches.csv'.


Unnamed: 0,Student_Name,Student_Position,Closest_Player_Names,Closest_Player_Teams,Closest_Player_Positions
0,A. Adalsteinsson,"CB, LCB","Frenkie de Jong, Kalidou Koulibaly","Barcelona, Napoli","MF, DF"
1,A. Barger,LB,"Bruno Fernandes, João Cancelo","Manchester Utd, Manchester City","MF, DF"
2,A. Bilow,RW,"Bukayo Saka, Gabriel Jesus","Arsenal, Manchester City","FWMF, FW"
3,A. Braman,GK,"Alisson, Ederson","Liverpool, Manchester City","GK, GK"
4,A. Camara,"AMF, CF","Ciro Immobile, Erling Haaland","Lazio, Dortmund","FW, FW"
