# LET'S PLAY BASKETBALL 🏀 🏀 🏀

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import re
import statistics

# Get basic data overview:

In [None]:
# Lift pandas' display limits so frames are shown without truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the CSV into a DataFrame and print its column / dtype summary.
data = pd.read_csv('/kaggle/input/nba2k20-player-dataset/nba2k20-full.csv')
data.info()

In [None]:
# Preview the first five rows (five is also head()'s default).
data.head(n=5)

In [None]:
# Default size (in inches) and resolution applied to figures created below.
plt.rcParams.update({
    'figure.figsize': [9, 7],
    'figure.dpi': 100,
})

# Have a look at the number of players represented on each team:

In [None]:
# Count players per team once; dropna=False keeps rows without a team
# as their own entry instead of discarding them.
team_counts = data.team.value_counts(dropna=False)
labels = list(team_counts.index)
values = list(team_counts.values)

# One horizontal bar per team, positioned at 0..n-1.
x = np.arange(len(labels))
fig, ax = plt.subplots()
rects = ax.barh(x, values)

ax.set_title('Number of represented players by team')
ax.set_xlabel('Number of represented players')
ax.set_ylabel('Team')
ax.set_yticks(ticks=x)
ax.set_yticklabels(labels)
plt.show()

# Check rating histogram and average rating value:

In [None]:
# Use one histogram bin per distinct rating value.
rating_values_count = data.rating.nunique()

mean_rating = round(data.rating.mean(), 2)
print('Average rating value: {}'.format(mean_rating))

plt.hist(data.rating, bins=rating_values_count)
plt.title('Rating histogram')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()

**Drop non-numeric characters from the 'salary' column and convert it to integers:**

In [None]:
def drop_non_numeric(value):
    """Return ``value`` with every character other than 0-9 removed."""
    # Collect the digit characters in order and glue them back together.
    return ''.join(re.findall('[0-9]', value))
# Keep only the digits of each salary string, then store the column as int.
salary_digits = data.salary.apply(drop_non_numeric)
data.salary = salary_digits.astype(int)

# Get average salary by player's rating:

In [None]:
def autolabel(rects, ax=None):
    """Write each vertical bar's height as a text label just above the bar.

    Parameters
    ----------
    rects : iterable
        Bar patches to label, e.g. the container returned by ``Axes.bar``.
        Each item must provide ``get_height()``, ``get_x()`` and
        ``get_width()``.
    ax : optional
        Axes on which to draw the labels. When omitted, every bar is
        labelled on the axes it belongs to (``rect.axes``), so the function
        does not depend on a module-level ``ax`` being bound to the right
        figure at call time.
    """
    for rect in rects:
        target_ax = rect.axes if ax is None else ax
        height = rect.get_height()
        target_ax.annotate('{}'.format(height),
                           # anchor at the top-centre of the bar ...
                           xy=(rect.get_x() + rect.get_width() / 2, height),
                           # ... and shift the text 3 points upwards
                           xytext=(0, 3),
                           textcoords="offset points",
                           ha='center', va='bottom')
        

# Mean salary for every rating, computed in one grouped pass instead of
# re-filtering the whole frame once per rating.  sort=False keeps the
# ratings in order of first appearance; rows with a missing rating are
# left out by groupby, so no NaN mean reaches int() below.
mean_salary_by_rating = data.groupby('rating', sort=False)['salary'].mean()
rating_values = mean_salary_by_rating.index.to_numpy()

# Round to whole dollars first, then express in millions with one decimal.
salary = [
    round(int(round(mean_salary)) / 1000000, 1)
    for mean_salary in mean_salary_by_rating
]

# Y-axis ticks every $5M, starting at 0 and stopping below the tallest bar.
salary_range = np.arange(0, max(salary), 5)
ticks_labels = ['$' + str(value) + 'M' for value in salary_range]

fig, ax = plt.subplots()
rects = ax.bar(rating_values, salary)

ax.set_xlabel('Rating')
ax.set_ylabel('Average Salary ($M)')
ax.set_yticks(salary_range)
ax.set_yticklabels(ticks_labels)
ax.set_xticks(ticks=rating_values)
ax.set_title("Average salary by player's rating")

# Print each bar's value above it.
autolabel(rects)

fig.tight_layout()
plt.show()

# Take a look at average and maximum salary by team:

In [None]:
# Per-team [average, maximum] salary, both expressed in millions.
team_salary = {}
for team in data.team.value_counts().index:
    salaries = data.loc[data.team == team, 'salary']
    team_salary[team] = [
        round(salaries.mean() / 1000000, 2),
        round(salaries.max() / 1000000, 2),
    ]

# Order the teams by average salary, lowest first.
sorted_team_salary = dict(
    sorted(team_salary.items(), key=lambda item: item[1][0])
)

ranked = list(sorted_team_salary.items())
bottom_five = ranked[0:5]
top_five = ranked[-5:]

lowest_avg_salary_team = [name for name, _ in bottom_five]
lowest_avg_salary = [stats[0] for _, stats in bottom_five]
lowest_max_salary = [stats[1] for _, stats in bottom_five]

highest_avg_salary_team = [name for name, _ in top_five]
highest_avg_salary = [stats[0] for _, stats in top_five]
highest_max_salary = [stats[1] for _, stats in top_five]

# Bar positions (one per listed team) and bar thickness.
x = np.arange(len(lowest_avg_salary_team))
width = 0.35


def plot_dependencies(value1, value2, title, teams, width=0.35):
    """Draw a grouped horizontal bar chart of average vs. maximum salary.

    Parameters
    ----------
    value1 : sequence of numbers
        Values for the first bar of each pair (legend: average salary).
    value2 : sequence of numbers
        Values for the second bar of each pair (legend: maximum salary).
    title : str
        Chart title.
    teams : sequence
        Y-axis tick labels, one per pair of bars, in the same order as
        ``value1`` / ``value2``.
    width : float, optional
        Thickness of each bar; defaults to 0.35.

    Bar positions are derived from ``len(teams)``, so the chart does not
    rely on module-level variables being kept in sync with the data
    passed in.
    """
    positions = np.arange(len(teams))

    fig, ax = plt.subplots()
    # Offset the two series by half a bar so each pair sits side by side.
    rects1 = ax.barh(positions - width / 2, value1, width,
                     label='Average salary ($M)')
    rects2 = ax.barh(positions + width / 2, value2, width,
                     label='Maximum salary ($M)')

    ax.set_xlabel('Salary')
    ax.set_ylabel('Team')
    ax.set_title('{}'.format(title))
    ax.set_yticks(ticks=positions)
    ax.set_yticklabels(teams)
    ax.legend()

    def annotate_bars(rects):
        """Print each bar's length as '$<value>M' near its right end."""
        for rect in rects:
            length = rect.get_width()
            ax.annotate('${}M'.format(length),
                        # anchor at 95% of the bar length, vertically centred
                        xy=(0.95 * length,
                            rect.get_y() + 0.5 * rect.get_height()),
                        xytext=(1, 0),
                        textcoords="offset points",
                        ha='right', va='center')

    annotate_bars(rects1)
    annotate_bars(rects2)
    plt.show()
    

# Five teams with the lowest average salary.
title1 = 'Lowest average/maximum salary by team'
plot_dependencies(
    value1=lowest_avg_salary,
    value2=lowest_max_salary,
    title=title1,
    teams=lowest_avg_salary_team,
)

# Five teams with the highest average salary.
title2 = 'Highest average/maximum salary by team'
plot_dependencies(
    value1=highest_avg_salary,
    value2=highest_max_salary,
    title=title2,
    teams=highest_avg_salary_team,
)

# Let's see players' salaries by position:

In [None]:
# Distinct positions, leaving out the '-' entry
# (presumably a placeholder for an unknown position - verify against the data).
all_positions = data.position.unique()
positions = all_positions[all_positions != '-']

# Per-position [average, maximum] salary, both expressed in millions.
salary_position = {}
for position in positions:
    position_salaries = data.loc[data.position == position, 'salary']
    salary_position[position] = [
        round(position_salaries.mean() / 1000000, 2),
        round(position_salaries.max() / 1000000, 2),
    ]

# Order the positions by average salary, lowest first.
sorted_salary_position = dict(
    sorted(salary_position.items(), key=lambda item: item[1][0])
)

positions = list(sorted_salary_position)
avg_salary = [stats[0] for stats in sorted_salary_position.values()]
max_salary = [stats[1] for stats in sorted_salary_position.values()]

# Bar positions (one per position) and bar thickness.
x = np.arange(len(positions))
width = 0.35

title = "Player's salary by position"
plot_dependencies(avg_salary, max_salary, title, positions)

*The choice of position is yours 😏*

# Check how balanced each team is in terms of player ratings.
**In this section I calculate the variance of player ratings for each team. Note that a large variance indicates that the ratings are spread out, while a small variance indicates that they are clustered closely around the team's average rating.**

In [None]:
teams = data.team.value_counts().index

# Sample variance of the ratings within each team, rounded to 2 decimals.
team_rating = {
    team: round(statistics.variance(data.loc[data.team == team, 'rating']), 2)
    for team in teams
}

# Order the teams from the lowest variance to the highest.
sorted_team_rating = dict(
    sorted(team_rating.items(), key=lambda item: item[1])
)

labels = list(sorted_team_rating)
values = list(sorted_team_rating.values())

# One horizontal bar per team, positioned at 0..n-1.
y = np.arange(len(labels))

fig, ax = plt.subplots()
rects = ax.barh(y, values)

def autolabel(rects, ax=None):
    """Write each horizontal bar's length as a label near its right end.

    Parameters
    ----------
    rects : iterable
        Bar patches to label, e.g. the container returned by ``Axes.barh``.
        Each item must provide ``get_width()``, ``get_y()`` and
        ``get_height()``.
    ax : optional
        Axes on which to draw the labels. When omitted, every bar is
        labelled on the axes it belongs to (``rect.axes``), so the function
        does not depend on a module-level ``ax`` being bound to the right
        figure at call time.
    """
    for rect in rects:
        target_ax = rect.axes if ax is None else ax
        width = rect.get_width()
        target_ax.annotate('{}'.format(width),
                           # anchor at 95% of the bar length, vertically centred
                           xy=(0.95 * width,
                               rect.get_y() + 0.5 * rect.get_height()),
                           xytext=(1, 0),
                           textcoords="offset points",
                           ha='right', va='center')


# Axis captions and chart title.
ax.set(
    xlabel='Variance value',
    ylabel='Team',
    title="Variance of players rating by team",
)

# One tick per team, labelled with the team name.
ax.set_yticks(ticks=y)
ax.set_yticklabels(labels)

# Print each bar's variance value on the bar, then render the figure.
autolabel(rects)
plt.show()