In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load each raw dataset in turn and preview its first rows so the keys that link
# the tables together are visible.

# Boxscores: one row per player per fixture they have played.
# The first CSV column is used as the row index.
df_boxscores = pd.read_csv(filepath_or_buffer='Boxscores.csv', index_col=0)
print(df_boxscores.shape)
df_boxscores.head(n=5)

In [None]:
# Summary statistics for every numeric boxscore column.
# They reveal negative three-pointer counts as well as implausibly high values,
# e.g. a player is very unlikely to have made 500 three pointers in one fixture.
df_boxscores.describe()

In [None]:
# Sanity check: a player cannot make more three pointers than they attempted.
# The mask is built once so that the rows we inspect are exactly the rows we remove.
# Rows with a missing value compare False here and are therefore kept.
made_exceeds_attempted = df_boxscores['ThreePointers'] > df_boxscores['ThreePointersAttempted']
print(df_boxscores.loc[made_exceeds_attempted, ['ThreePointers', 'ThreePointersAttempted']])

# Get rid of those rows. The labels are derived from the mask instead of being
# hard-coded, so the cell stays correct if Boxscores.csv changes and can be re-run
# safely: a second run matches nothing and drops nothing, whereas dropping fixed
# labels that are already gone raises KeyError.
df_boxscores = df_boxscores.drop(index=df_boxscores.index[made_exceeds_attempted])
print(df_boxscores.shape)

In [None]:
# Next check: unrealistic three-pointer counts (e.g. 1000 or -10). Look at negative values first.
negative_made = df_boxscores['ThreePointers'] < 0
negative_attempted = df_boxscores['ThreePointersAttempted'] < 0
print(df_boxscores.loc[negative_made, ['ThreePointers']])
print(df_boxscores.loc[negative_attempted, ['ThreePointersAttempted']])

# Get rid of every row that is negative in either column. Deriving the labels from the
# masks (rather than hard-coding them) removes exactly the rows printed above, keeps
# working if the CSV changes, and makes the cell safe to re-run: once the rows are gone
# the masks are empty and nothing is dropped, instead of drop() raising KeyError.
df_boxscores = df_boxscores.drop(index=df_boxscores.index[negative_made | negative_attempted])
print(df_boxscores.shape)

In [None]:
# Re-inspect the summary statistics to look for any other outliers.
print(df_boxscores.describe())

# The remaining extreme values all look reasonable, so no further rows are removed.
# One box per column: three pointers made and three pointers attempted.
ax = df_boxscores.loc[:, ['ThreePointers', 'ThreePointersAttempted']].plot(
    kind='box',
    title='Three Pointers Made/Attempted Per Fixture',
)
plt.show()

In [None]:
# Fixtures: each fixture's id together with the season that fixture belongs to.
# The first CSV column is used as the row index.
df_fixtures = pd.read_csv(filepath_or_buffer='Fixtures.csv', index_col=0)
print(df_fixtures.shape)
df_fixtures.head(n=5)

In [None]:
# Some fixture rows contain NaNs. Discard every row that has at least one missing value.
df_fixtures = df_fixtures.dropna(axis=0, how='any')
print(df_fixtures.shape)

In [None]:
# Players: each player's height, weight, date of birth and full name.
# The first CSV column is used as the row index.
df_players = pd.read_csv(filepath_or_buffer='Players.csv', index_col=0)
print(df_players.shape)
df_players.head(n=5)

In [None]:
# The preview shows a duplicated player, so list every duplicated row.
# duplicated(keep='last') flags each repeated row except its final occurrence.
print(df_players[df_players.duplicated(keep='last')])

# Remove the flagged rows, keeping the last occurrence of each duplicate. Using
# drop_duplicates instead of a hard-coded row label removes exactly the rows listed
# above, still works if Players.csv changes, and can be re-run without drop()
# raising KeyError for a label that is already gone.
df_players = df_players.drop_duplicates(keep='last')

# Confirm no duplicates remain (this should print an empty frame).
print(df_players[df_players.duplicated(keep='last')])
df_players

In [None]:
# With every dataset loaded and cleaned, combine them into a single table we can analyse.

# The only columns needed downstream
columns = ['Player', 'Season', 'FirstName', 'Surname', 'ThreePointers', 'ThreePointersAttempted']

# Step 1: attach player details (names) to every boxscore row.
# Left joins keep every boxscore row, even when no matching player or fixture exists.
df_merged_players = df_boxscores.merge(
    df_players,
    how='left',
    left_on='Player',
    right_on='PlayerID',
)

# Step 2: attach the fixture details (FixtureKey, Season) for each boxscore row.
df_merged = df_merged_players.merge(
    df_fixtures,
    how='left',
    left_on='FixtureURL',
    right_on='FixtureKey',
)
df_merged[columns]

In [None]:
# Work on an independent copy of just the columns of interest so df_merged is left untouched.
df_all = df_merged[columns].copy()
df_all['Name'] = df_all['FirstName'] + ' ' + df_all['Surname']

# Total makes and attempts for every (player name, season) pair.
# NOTE(review): the grouping key is the display name, not the 'Player' id, so two different
# players sharing a full name in the same season would be summed together -- confirm that
# names are unique before relying on these totals.
df_all = df_all.groupby(['Name', 'Season'])[['ThreePointers', 'ThreePointersAttempted']].sum()

season_made = df_all['ThreePointers']
season_attempted = df_all['ThreePointersAttempted']
df_all['ThreePointersMissed'] = season_attempted - season_made

# Three point make percentage per player and season, stored as a fraction (made / attempted).
df_all['ThreePointersMadePct'] = season_made / season_attempted

# This is each player's make ability on a global level (NOTE this means it is not
# necessarily their ability in the actual three point contest).
# Rows containing NaN are removed; e.g. zero attempts in a season gives 0 / 0 = NaN.
# The shape is printed before and after to show how many rows were removed.
print(df_all.shape)
df_all = df_all.dropna()
print(df_all.shape)
df_all.head()

In [None]:
# Persist the cleaned per-player, per-season table. The index (Name, Season) is written
# as the leading columns of the CSV.
df_all.to_csv('CleanedData.csv', index=True)