In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stats
import numpy as np


In [None]:
# Load the three source tables from CSV files in the working directory.
mvp = pd.read_csv('mvp_awards.csv')

# NOTE: binding `stats` to this DataFrame replaces the `stats` alias created by
# `import statistics as stats` in the import cell; from here on `stats` is the table.
stats = pd.read_csv('stats.csv')

# Rename `id` to `player_id` at load time so the players table shares its key
# column name with the frames it is merged against later.
players = pd.read_csv('players.csv').rename(columns={'id': 'player_id'})

In [None]:
def top_scorers(stats_df, season_id, n=50):
    """Return the `n` highest-scoring rows of a single season.

    Parameters
    ----------
    stats_df : pandas.DataFrame
        Table with at least the columns `season_id`, `player_id` and `Points`.
    season_id
        Value compared for equality against the `season_id` column.
    n : int, default 50
        Number of rows to keep after sorting.

    Returns
    -------
    pandas.DataFrame
        Columns `player_id` and `Points`, sorted by `Points` descending.
    """
    season_rows = stats_df[stats_df['season_id'] == season_id]
    ranked = season_rows.sort_values(by='Points', ascending=False)
    return ranked.head(n)[['player_id', 'Points']]


# The variable names suggest season ids 1..5 correspond to 2019..2023 — TODO confirm
# against the seasons table.
best_50_2019, best_50_2020, best_50_2021, best_50_2022, best_50_2023 = (
    top_scorers(stats, season_id) for season_id in (1, 2, 3, 4, 5)
)

# Stack the per-season tables and keep one row per player. drop_duplicates keeps the
# first occurrence, so a player's `Points` comes from the earliest season listed.
merged_df = pd.concat(
    [best_50_2019, best_50_2020, best_50_2021, best_50_2022, best_50_2023]
).drop_duplicates(subset='player_id')
merged_df

In [None]:
# Attach a height to every top scorer by joining on the shared `player_id` key.
height_bestp = pd.merge(
    left=merged_df,
    right=players[['player_id', 'height']],
    on='player_id',
)
height_bestp

In [None]:
# Join the MVP awards to player heights and keep only the id and height columns.
# No de-duplication is applied here, so each matching award row yields its own row.
height_mvp = (
    pd.merge(mvp, players[['player_id', 'height']], on='player_id')
    .loc[:, ['player_id', 'height']]
)
height_mvp

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Draw both filled density curves on the same axes so the groups can be compared directly.
for heights, colour, group in (
    (height_bestp['height'], 'blue', 'Best Players'),
    (height_mvp['height'], 'red', 'MVP Winners'),
):
    sns.kdeplot(heights, fill=True, color=colour, label=group, ax=ax)

ax.set_title('Height Distribution Comparison')
ax.set_xlabel('Height')
ax.set_ylabel('Density')
ax.legend()

plt.show()


In [None]:
# Descriptive statistics for the best players' heights.
heights_b = height_bestp['height']

# Mean (average value)
mean_bestp = heights_b.mean()

# Range (max - min)
range_bestp = heights_b.max() - heights_b.min()

# Quartiles (Q1, Q2/median, Q3). Series.quantile skips missing values, exactly like
# mean/max/min above, whereas np.percentile would return NaN if any height is missing.
# Its default linear interpolation gives the same numbers as np.percentile's default.
q1_b, q2_b, q3_b = heights_b.quantile([0.25, 0.5, 0.75])

print(f"Mean: {mean_bestp}")
print(f"Range: {range_bestp}")
print(f"Q1: {q1_b}, Median (Q2): {q2_b}, Q3: {q3_b}")

In [None]:
# Descriptive statistics for the MVP winners' heights.
heights_m = height_mvp['height']

# Mean (average value)
mean_mvp = heights_m.mean()

# Range (max - min)
range_mvp = heights_m.max() - heights_m.min()

# Quartiles: 25th percentile (Q1), 50th (median) and 75th (Q3). Series.quantile skips
# missing values, exactly like mean/max/min above, whereas np.percentile would return
# NaN if any height is missing. Its default linear interpolation gives the same numbers
# as np.percentile's default.
q1_m, q2_m, q3_m = heights_m.quantile([0.25, 0.5, 0.75])

print(f"Mean: {mean_mvp}")
print(f"Range: {range_mvp}")
print(f"Q1: {q1_m}, Median (Q2): {q2_m}, Q3: {q3_m}")

In [None]:
# Bin-count heuristic int(log2(n) + 1) evaluated on the MVP sample size. It is only
# printed for reference; the histogram below uses a fixed 10 bins.
mvp_sample_size = len(height_mvp['height'])
num_bins = int(np.log2(mvp_sample_size) + 1)
print(num_bins)

# Frequency histogram of the best players' heights.
fig, ax = plt.subplots()
ax.hist(height_bestp['height'], bins=10, color="blue", edgecolor="black")
ax.set_xlabel("Height")
ax.set_ylabel("Frequency")
ax.set_title("Height Distribution for Best Players")
plt.show()

In [None]:
# Frequency histogram of the MVP winners' heights (fixed 10 bins).
fig, ax = plt.subplots()
ax.hist(height_mvp['height'], bins=10, color="red", edgecolor="black")
ax.set_xlabel("Height")
ax.set_ylabel("Frequency")
ax.set_title("Height Distribution For MVP Winners")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# (series, colour, histogram label, KDE label) for each group, in drawing order.
groups = (
    (height_bestp['height'], "blue", 'Best Players', 'KDE Best Players'),
    (height_mvp['height'], "red", 'MVP Winners', 'KDE MVP Winners'),
)

# Histograms first. density=True normalises each one so it shares a y-scale with the
# KDE curves. Each histogram gets its own 10 bins over its own data range.
for heights, colour, hist_label, _ in groups:
    ax.hist(heights, bins=10, color=colour, edgecolor="black", alpha=0.5,
            label=hist_label, density=True)

# KDE curves drawn on top of the histograms.
for heights, colour, _, kde_label in groups:
    sns.kdeplot(heights, color=colour, linewidth=2, label=kde_label, ax=ax)

ax.set_title('Height Distribution Comparison')
ax.set_xlabel('Height')
ax.set_ylabel('Density')
ax.legend()

plt.show()

In [None]:
# Just being curious about other ways to calculate the number of bins (kept for reference):
# data = height_mvp['height'] 
# iqr = np.percentile(data, 75) - np.percentile(data, 25)  # Q3 - Q1 (Interquartile Range)
# bin_width = 2 * iqr * (len(data) ** (-1/3))  # Freedman-Diaconis Formula
# num_bins = int((data.max() - data.min()) / bin_width)  # Total range divided by bin width
# num_bins

In [None]:
# KDE of the MVP winners' heights with the mean and the three quartiles marked.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(height_mvp['height'], fill=True, color="skyblue", linewidth=2, ax=ax)

# Overlay mean (dashed, to set it apart from the dotted quartile lines)
ax.axvline(mean_mvp, color='red', linestyle='dashed', linewidth=2, label=f"Mean: {mean_mvp:.2f}")

# Overlay quartiles; Q1 and Q3 share a colour, the median gets its own
for position, colour, name in (
    (q1_m, 'purple', 'Q1'),
    (q2_m, 'green', 'Median (Q2)'),
    (q3_m, 'purple', 'Q3'),
):
    ax.axvline(position, color=colour, linestyle='dotted', linewidth=2, label=f"{name}: {position:.2f}")

ax.set_xlabel("Height")
ax.set_ylabel("Density")
# The title lists only what is drawn: no mode line is plotted in this cell.
ax.set_title("Height Distribution (KDE) with Mean and Quartiles for MVP winners")
ax.legend()
plt.show()


In [None]:
# KDE of the best players' heights with the mean and the three quartiles marked.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(height_bestp['height'], fill=True, color='grey', alpha=0.5, ax=ax)

# Overlay statistical lines; values are rounded to two decimals in the legend so the
# labels stay readable and match the formatting used in the MVP plot.
for position, colour, name in (
    (mean_bestp, 'red', 'Mean'),
    (q1_b, 'green', 'Q1'),
    (q2_b, 'orange', 'Median (Q2)'),
    (q3_b, 'purple', 'Q3'),
):
    ax.axvline(position, color=colour, linestyle='--', label=f'{name}: {position:.2f}')

# The title lists only what is drawn: no mode line is plotted in this cell.
ax.set_title('Height Distribution (KDE) with Mean and Quartiles for Best Players')
ax.set_xlabel('Height')
ax.set_ylabel('Density')
ax.legend()
plt.show()

# Explanation of the code:
# KDE plot: sns.kdeplot() draws the kernel density estimate (KDE) of the height data.
# Overlay lines: axvline() adds vertical lines for the mean and the quartiles (Q1, Q2, Q3).
#   Each line has its own colour and a legend label showing its value.
# Legend: legend() displays the label of every line on the plot.
# Labels: the plot has a title and axis labels for clarity.
# The result is a KDE plot with the mean and quartiles overlaid as vertical lines.

In [None]:
# Plot both KDEs on the same figure
fig, ax = plt.subplots(figsize=(12, 6))

# KDE for Best Players
sns.kdeplot(height_bestp['height'], fill=True, color='grey', alpha=0.5, label="Best Players", ax=ax)

# KDE for MVP Players
sns.kdeplot(height_mvp['height'], fill=True, color='skyblue', alpha=0.5, linewidth=2, label="MVP Players", ax=ax)

# Line style encodes the group: dashed for Best Players, thicker dotted for MVP.
# The MVP mean uses the dotted MVP style too; with a dashed style it would be
# the same colour and dash pattern as the Best Players mean.
best_style = {'linestyle': '--'}
mvp_style = {'linestyle': 'dotted', 'linewidth': 2}

# (x position, colour, legend label, line style), in legend order.
reference_lines = [
    # Best Players
    (mean_bestp, 'red', f'Mean (Best Players): {mean_bestp:.2f}', best_style),
    (q1_b, 'green', f'Q1 (Best Players): {q1_b:.2f}', best_style),
    (q2_b, 'orange', f'Median (Best Players): {q2_b:.2f}', best_style),
    (q3_b, 'green', f'Q3 (Best Players): {q3_b:.2f}', best_style),
    # MVP Players
    (mean_mvp, 'red', f"Mean (MVP): {mean_mvp:.2f}", mvp_style),
    (q1_m, 'blue', f"Q1 (MVP): {q1_m:.2f}", mvp_style),
    (q2_m, 'orange', f"Median (MVP): {q2_m:.2f}", mvp_style),
    (q3_m, 'blue', f"Q3 (MVP): {q3_m:.2f}", mvp_style),
]
for position, colour, text, style in reference_lines:
    ax.axvline(position, color=colour, label=text, **style)

ax.set_xlabel("Height")
ax.set_ylabel("Density")
ax.set_title("Height Distribution (KDE) with Statistical Measures for Best Players and MVP Winners")
ax.legend()

plt.show()
