In [1]:
import pandas as pd
import scipy.stats as stats1
from scipy.stats import shapiro
import numpy as np

In [2]:
stats = pd.read_csv('stats.csv')
stats

Unnamed: 0,id,season_id,player_id,Games_Played,Points,goals,Age,Position,team_id,experience,innate_ability
0,1,1,105,68,2335,672,30,SG,1,9,0.30
1,2,1,109,66,1978,624,29,PG,23,6,0.21
2,3,1,11,70,1863,627,23,SG,12,3,0.13
3,4,1,115,63,1857,685,25,PF,26,5,0.20
4,5,1,97,60,1778,546,21,PG,9,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...
380,381,5,27,70,2370,804,24,PG,25,2,0.08
381,382,5,69,73,2222,837,21,PF,10,1,0.05
382,383,5,165,70,2370,804,24,PG,25,1,0.04
383,384,5,83,70,2370,804,26,PG,8,2,0.08


In [3]:
agility = pd.read_csv('height_to_weight_ratio.csv')
agility

Unnamed: 0,player_id,height_to_weight_ratio,height,weight,season_id,player_name
0,105,1.98,196,99,1,James Harden
1,109,2.14,188,88,1,Damian Lillard
2,11,2.13,198,93,1,Devin Booker
3,115,1.94,211,109,1,Giannis Antetokounmpo
4,97,2.50,185,74,1,Trae Young
...,...,...,...,...,...,...
380,27,2.24,206,92,5,Drew Peterson
381,69,2.10,193,92,5,Jaden Springer
382,165,2.24,206,92,5,Dalano Banton
383,83,1.90,198,104,5,Lamar Stevens


In [4]:
agility_2020 = agility[agility['season_id'] == 2][['player_id', 'height_to_weight_ratio']]
best_2020 = stats[stats['season_id'] == 2].sort_values(by='Points', ascending=False).head(20)[['player_id', 'Points']]
best_2020_agility = pd.merge(best_2020, agility_2020, on = 'player_id')

In [5]:
agility_2021 = agility[agility['season_id'] == 3][['player_id', 'height_to_weight_ratio']]
best_2021 = stats[stats['season_id'] == 3].sort_values(by='Points', ascending=False).head(20)[['player_id', 'Points']]
best_2021_agility = pd.merge(best_2021, agility_2021, on = 'player_id')

In [6]:
agility_2022 = agility[agility['season_id'] == 4][['player_id', 'height_to_weight_ratio']]
best_2022 = stats[stats['season_id'] == 4].sort_values(by='Points', ascending=False).head(20)[['player_id', 'Points']]
best_2022_agility = pd.merge(best_2022, agility_2022, on = 'player_id')

In [7]:
agility_2023 = agility[agility['season_id'] == 5][['player_id', 'height_to_weight_ratio']]
best_2023 = stats[stats['season_id'] == 5].sort_values(by='Points', ascending=False).head(20)[['player_id', 'Points']]
best_2023_agility = pd.merge(best_2023, agility_2023, on = 'player_id')

In [8]:
group1_data = pd.concat([best_2020_agility['height_to_weight_ratio'], best_2021_agility['height_to_weight_ratio']])
group2_data = pd.concat([best_2022_agility['height_to_weight_ratio'], best_2023_agility['height_to_weight_ratio']])

In [9]:
#checking if height data is normal.
stat1, p1 = shapiro(group1_data)
print(f"Statistic: {stat1}, p-value: {p1}")

if p1 > 0.05:
    print("Group1 data is normal.")
else:
    print("Group1 data is not normal.")

#-----------------------------------------------#
stat2, p2 = shapiro(group2_data)
print(f"Statistic: {stat2}, p-value: {p2}")

if p2 > 0.05:
    print("Group2 data is normal.")
else:
    print("Group2 data is not normal.")

Statistic: 0.95395643879926, p-value: 0.10379665993353299
Group1 data is normal.
Statistic: 0.9667774773471002, p-value: 0.26948302457481166
Group2 data is normal.


In [10]:
#So we have determined that the data for both groups is normal.
#And now we can perform the t-test.

In [11]:
t_stat, p_value = stats1.ttest_ind(group1_data, group2_data, alternative = 'greater')
alpha = 0.05
if p_value < alpha:
    print("The average agility of the top 20 players, has increased significantly.")
else:
    print("The average agility of the top 20 players, has not increased significantly.")

The average agility of the top 20 players, has not increased significantly.


In [12]:
#But what if data wasn't normal?
#Then, we needed to see if we could normalize it or not! For this, we use Z-score normalization.
#(If the data follows a normal distribution, Z-score is more appropriate.)
#(If the data has specific upper and lower bounds, Min-Max is more suitable.)

In [13]:
####؟؟؟؟؟؟
mean1 = np.mean(group1_data)
std1 = np.std(group1_data)

z_scores1 = (group1_data - mean1) / std1
print(z_scores1)

0     1.241089
1     1.187011
2    -0.056782
3    -0.164938
4     0.646232
5     0.700310
6     0.267686
7    -1.841354
8     0.538076
9     0.321764
10   -0.219016
11   -0.219016
12   -0.002704
13   -1.462809
14    0.321764
15   -1.354653
16    0.105452
17   -2.111744
18   -0.219016
19   -1.084263
0     2.484882
1     1.349245
2     0.375842
3    -0.435328
4     0.808465
5    -0.002704
6     0.105452
7     1.295167
8    -0.759795
9    -0.110860
10   -0.219016
11    1.295167
12    0.483998
13    0.538076
14    0.105452
15    0.159530
16   -1.949510
17    0.538076
18   -2.111744
19   -0.543483
Name: height_to_weight_ratio, dtype: float64


In [14]:
mean2 = np.mean(group2_data)
std2 = np.std(group2_data)

z_scores2 = (group2_data - mean1) / std2
print(z_scores2)

0     0.559925
1     2.079320
2     0.503651
3    -0.678100
4     0.447377
5     0.447377
6    -0.678100
7    -0.115361
8    -0.734374
9     0.278556
10    1.235212
11   -2.197495
12   -2.197495
13   -1.578483
14   -2.028674
15   -0.790648
16   -0.227909
17   -1.015744
18    1.572855
19   -0.396731
20   -0.790648
0    -0.790648
1    -0.790648
2     1.122664
3     1.122664
4     0.785021
5     0.447377
6    -0.678100
7    -1.184565
8     0.278556
9     0.334830
10    0.897569
11    0.334830
12    0.785021
13   -0.734374
14    1.122664
15    0.334830
16    0.334830
17   -0.565553
18    0.841295
19   -0.621826
Name: height_to_weight_ratio, dtype: float64


In [15]:
#checking are they normal rn?
stat1, p1 = shapiro(z_scores1)
print(f"Statistic: {stat1}, p-value: {p1}")

if p1 > 0.05:
    print("Group1 data is normal.")
else:
    print("Group1 data is not normal.")
#-----------------------------------------------#
stat2, p2 = shapiro(z_scores2)
print(f"Statistic: {stat2}, p-value: {p2}")

if p2 > 0.05:
    print("Group2 data is normal.")
else:
    print("Group2 data is not normal.")

Statistic: 0.9539564387992596, p-value: 0.10379665993353049
Group1 data is normal.
Statistic: 0.9667774773471002, p-value: 0.26948302457481166
Group2 data is normal.


In [16]:
#So they have been normalized.
#now we can use the t-test.
#If they had not been normalized, we would use non-parametric methods like Mann-Whitney U!

In [17]:
t_stat, p_value = stats1.ttest_ind(z_scores1, z_scores2, alternative = 'greater')
alpha = 0.05
if p_value < alpha:
    print("The average agility of the top 20 players, has increased significantly.")
else:
    print("The average agility of the top 20 players, has not increased significantly.")

The average agility of the top 20 players, has not increased significantly.
