In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio


In [14]:
snapshot_lookup = pd.read_parquet('../data/processed/player_snapshot.parquet')

In [15]:
snapshot_lookup.isnull().sum()

player_id                      0
snapshot_date                  0
season_year                    0
name                           0
age                            0
position                       0
sub_position                   0
foot                           0
height_in_cm                   0
country_of_citizenship         0
market_value_in_eur            0
highest_market_value_in_eur    0
mv_ratio_to_peak               0
y_growth                       0
future_market_value            0
years_to_contract_end          0
minutes_total                  0
games_played                   0
minutes_per_game               0
goals_per_90                   0
assists_per_90                 0
delta_minutes_total            0
delta_goals_per_90             0
delta_assists_per_90           0
current_club_id                0
current_club_name              0
club_total_market_value        0
club_win_rate                  0
club_goal_diff_per_game        0
league_name                    0
league_cou

In [16]:
snapshot_lookup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169226 entries, 0 to 169225
Data columns (total 36 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   player_id                    169226 non-null  int64         
 1   snapshot_date                169226 non-null  datetime64[ns]
 2   season_year                  169226 non-null  int32         
 3   name                         169226 non-null  object        
 4   age                          169226 non-null  float64       
 5   position                     169226 non-null  object        
 6   sub_position                 169226 non-null  object        
 7   foot                         169226 non-null  object        
 8   height_in_cm                 169226 non-null  float64       
 9   country_of_citizenship       169226 non-null  object        
 10  market_value_in_eur          169226 non-null  int64         
 11  highest_market_value_in_eu

In [17]:
snapshot_lookup.age.value_counts()

age
24.741958    105
24.637919     95
25.138946     71
23.381246     67
24.265572     64
            ... 
38.590007      1
37.845311      1
38.173854      1
36.898015      1
16.399726      1
Name: count, Length: 8310, dtype: int64

In [6]:
players = pd.read_csv('../data/players.csv')
players.info()
players.isnull().sum()
players.head()
players.tail()
players.describe()
players.describe(include=['object'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32601 entries, 0 to 32600
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   player_id                             32601 non-null  int64  
 1   first_name                            30539 non-null  object 
 2   last_name                             32601 non-null  object 
 3   name                                  32601 non-null  object 
 4   last_season                           32601 non-null  int64  
 5   current_club_id                       32601 non-null  int64  
 6   player_code                           32601 non-null  object 
 7   country_of_birth                      29802 non-null  object 
 8   city_of_birth                         30146 non-null  object 
 9   country_of_citizenship                32218 non-null  object 
 10  date_of_birth                         32554 non-null  object 
 11  sub_position   

Unnamed: 0,first_name,last_name,name,player_code,country_of_birth,city_of_birth,country_of_citizenship,date_of_birth,sub_position,position,foot,contract_expiration_date,agent_name,image_url,url,current_club_domestic_competition_id,current_club_name
count,30539,32601,32601,32601,29802,30146,32218,32554,32421,32601,30065,20510,16582,32601,32601,32601,32601
unique,7030,23795,31892,31852,185,8578,183,9306,13,5,3,119,2897,26854,32601,14,437
top,David,García,Paulinho,paulinho,France,London,Spain,1996-01-19 00:00:00,Centre-Back,Defender,right,2023-06-30 00:00:00,Wasserman,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/miroslav-klose...,TR1,Kilmarnock Football Club
freq,215,68,13,13,2337,479,1965,20,5744,10389,21149,4502,504,5748,1,3220,185


In [21]:
reg_output = pd.read_parquet('../data/processed/regression_outputs.parquet')

In [22]:
reg_output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3870 entries, 0 to 3869
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   player_id              3870 non-null   int64         
 1   snapshot_date          3870 non-null   datetime64[ns]
 2   y_growth_pred          3870 non-null   float32       
 3   mv_pred_1y             3870 non-null   float64       
 4   reg_shap_top_features  3870 non-null   object        
dtypes: datetime64[ns](1), float32(1), float64(1), int64(1), object(1)
memory usage: 136.2+ KB


In [26]:
reg_output.head()

Unnamed: 0,player_id,snapshot_date,y_growth_pred,mv_pred_1y,reg_shap_top_features
0,3333,2024-05-27,-0.122202,884969.7,"[{""feature"": ""age"", ""shap_value"": -0.350353598..."
1,7825,2024-06-07,-0.209104,486786.4,"[{""feature"": ""age"", ""shap_value"": -0.351179957..."
2,12029,2024-06-03,0.004567,100457.7,"[{""feature"": ""age"", ""shap_value"": -0.385254442..."
3,12282,2024-06-07,-0.179686,2506597.0,"[{""feature"": ""age"", ""shap_value"": -0.335636049..."
4,12589,2024-05-30,-0.279877,113381.5,"[{""feature"": ""age"", ""shap_value"": -0.500216662..."


In [30]:
reg_output.head(10).to_csv('../app/mock_data/player_recommendations.csv')

In [27]:
pd.read_parquet('../data/processed/development_outputs.parquet')

Unnamed: 0,player_id,age,sub_position,value_million,expected_value_million,valuation_above_curve,ga_per_90,expected_ga_per_90,performance_above_curve,minutes_per_90,expected_minutes_per_90,minutes_above_curve,aging_score,development_tier
0,16733,29.50,Goalkeeper,0.100,0.616667,-0.516667,,0.003040,,,89.462767,,-0.015138,declining
1,73096,20.50,Left-Back,0.050,0.266667,-0.216667,0.000000,0.136622,-0.136622,43.000000,69.107829,-26.107829,-0.336652,declining
2,4582,37.75,Goalkeeper,0.800,0.200000,0.600000,,0.004030,,,88.611910,,0.072315,normal
3,36758,39.75,Goalkeeper,0.025,,,,,,,,,0.025325,normal
4,62553,19.50,Centre-Forward,0.200,0.250000,-0.050000,1.058824,0.509300,0.549524,85.000000,41.008290,43.991710,0.732093,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31039,420465,25.00,Centre-Back,10.000,0.800000,9.200000,0.066039,0.076939,-0.010901,75.018349,80.613884,-5.595536,0.679340,normal
31040,419576,27.50,Right-Back,0.450,0.866667,-0.416667,0.000000,0.137973,-0.137973,53.500000,76.487157,-22.987157,-0.318238,declining
31041,412669,25.25,Right Winger,30.000,0.966667,29.033333,0.548859,0.442605,0.106255,56.658537,61.500164,-4.841627,2.288439,aging well
31042,479638,23.75,Right Winger,4.000,0.800000,3.200000,0.515595,0.450569,0.065026,59.283019,58.671836,0.611183,0.309059,normal


In [31]:
players.head()



Unnamed: 0,player_id,first_name,last_name,name,last_season,current_club_id,player_code,country_of_birth,city_of_birth,country_of_citizenship,...,foot,height_in_cm,contract_expiration_date,agent_name,image_url,url,current_club_domestic_competition_id,current_club_name,market_value_in_eur,highest_market_value_in_eur
0,10,Miroslav,Klose,Miroslav Klose,2015,398,miroslav-klose,Poland,Opole,Germany,...,right,184.0,,ASBW Sport Marketing,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/miroslav-klose...,IT1,Società Sportiva Lazio S.p.A.,1000000.0,30000000.0
1,26,Roman,Weidenfeller,Roman Weidenfeller,2017,16,roman-weidenfeller,Germany,Diez,Germany,...,left,190.0,,Neubauer 13 GmbH,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/roman-weidenfe...,L1,Borussia Dortmund,750000.0,8000000.0
2,65,Dimitar,Berbatov,Dimitar Berbatov,2015,1091,dimitar-berbatov,Bulgaria,Blagoevgrad,Bulgaria,...,,,,CSKA-AS-23 Ltd.,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/dimitar-berbat...,GR1,Panthessalonikios Athlitikos Omilos Konstantin...,1000000.0,34500000.0
3,77,,Lúcio,Lúcio,2012,506,lucio,Brazil,Brasília,Brazil,...,,,,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/lucio/profil/s...,IT1,Juventus Football Club,200000.0,24500000.0
4,80,Tom,Starke,Tom Starke,2017,27,tom-starke,East Germany (GDR),Freital,Germany,...,right,194.0,,IFM,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/tom-starke/pro...,L1,FC Bayern München,100000.0,3000000.0


In [32]:
snapshot_lookup.columns

Index(['player_id', 'snapshot_date', 'season_year', 'name', 'age', 'position',
       'sub_position', 'foot', 'height_in_cm', 'country_of_citizenship',
       'market_value_in_eur', 'highest_market_value_in_eur',
       'mv_ratio_to_peak', 'y_growth', 'future_market_value',
       'years_to_contract_end', 'minutes_total', 'games_played',
       'minutes_per_game', 'goals_per_90', 'assists_per_90',
       'delta_minutes_total', 'delta_goals_per_90', 'delta_assists_per_90',
       'current_club_id', 'current_club_name', 'club_total_market_value',
       'club_win_rate', 'club_goal_diff_per_game', 'league_name',
       'league_country', 'league_strength', 'league_is_major',
       'is_top5_league', 'has_recent_transfer', 'moved_to_bigger_club_flag'],
      dtype='object')

In [34]:
snapshot_lookup.delta_minutes_total.describe()

count    169226.000000
mean         83.078688
std         634.468392
min       -4469.000000
25%           0.000000
50%           0.000000
75%           0.000000
max        4821.000000
Name: delta_minutes_total, dtype: float64