#### Setup

In [1]:
%pip install -r  Requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Cougar Basketball Visualization Project
## By John Salmon

### The data for this project was acquired from Sports Reference. The data set and more information about it can be found [here](https://www.sports-reference.com/cbb/schools/washington-state/men/2024.html).

The goal of this project is to create interesting and insightful visualizations about the WSU Men's basketball team's 2023-2024 season.

#### Import Data

In [2]:
import pandas as pd
# Read the WSU game log, using the game-number column 'G' as the row index
gamelog_path = 'CougarBBallStats/gamelog.csv'
gamelog = pd.read_csv(gamelog_path, index_col = 'G')
# Preview the first rows to confirm the file parsed as expected
gamelog.head()

Unnamed: 0_level_0,Date,Unnamed: 2,Opp,W/L,Tm,Opp.1,FG,FGA,FG%,3P,...,OPP_FT,OPP_FTA,OPP_FT%,OPP_ORB,OPP_TRB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF
G,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2023-11-06,,Idaho,W,84,59,31,57,0.544,7,...,18,25,0.72,5,23,8,6,4,11,23
2,2023-11-10,,Prairie View,W,83,65,30,56,0.536,4,...,15,21,0.714,8,23,8,6,1,10,21
3,2023-11-18,N,Mississippi State,L,64,76,24,59,0.407,7,...,21,25,0.84,5,33,7,10,2,8,13
4,2023-11-19,N,Rhode Island,W,78,57,33,64,0.516,3,...,10,12,0.833,4,23,9,1,2,11,18
5,2023-11-24,,Utah Tech,W,93,53,32,57,0.561,15,...,12,17,0.706,9,27,7,5,4,16,20


#### Cleaning

In [3]:
# List every column label so the auto-generated 'Unnamed: ...' names stand out
column_labels = gamelog.columns
print(column_labels)

Index(['Date', 'Unnamed: 2', 'Opp', 'W/L', 'Tm', 'Opp.1', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'Unnamed: 23', 'OPP_FG', 'OPP_FGA', 'OPP_FG%',
       'OPP_3P', 'OPP_3PA', 'OPP_3P%', 'OPP_FT', 'OPP_FTA', 'OPP_FT%',
       'OPP_ORB', 'OPP_TRB', 'OPP_AST', 'OPP_STL', 'OPP_BLK', 'OPP_TOV',
       'OPP_PF'],
      dtype='object')


In [4]:
#Fix Missing Column Names
# 'Unnamed: 2' becomes 'Location'; 'Unnamed: 23' is empty and only used for spacing.
# rename() skips labels that are absent and errors='ignore' makes drop() do the same,
# so re-running this cell no longer raises KeyError once the columns are already fixed.
gamelog = (
    gamelog
    .rename(columns = {'Unnamed: 2': 'Location'})
    .drop(columns = ['Unnamed: 23'], errors = 'ignore')
)

#fix nan values in Location column
# A missing location is filled with 'H' (presumably "home" - confirm against the source table)
gamelog['Location'] = gamelog['Location'].fillna('H')

In [5]:
#Column-wise null and NA counts
# Note: DataFrame.isnull is an alias of DataFrame.isna, so both reports show the same numbers
missing_reports = (
    ('Null Count: ', gamelog.isnull().sum()),
    ('NA Count: ', gamelog.isna().sum()),
)
for label, counts in missing_reports:
    print(label, counts)

Null Count:  Date        0
Location    0
Opp         0
W/L         0
Tm          0
Opp.1       0
FG          0
FGA         0
FG%         0
3P          0
3PA         0
3P%         0
FT          0
FTA         0
FT%         0
ORB         0
TRB         0
AST         0
STL         0
BLK         0
TOV         0
PF          0
OPP_FG      0
OPP_FGA     0
OPP_FG%     0
OPP_3P      0
OPP_3PA     0
OPP_3P%     0
OPP_FT      0
OPP_FTA     0
OPP_FT%     0
OPP_ORB     0
OPP_TRB     0
OPP_AST     0
OPP_STL     0
OPP_BLK     0
OPP_TOV     0
OPP_PF      0
dtype: int64
NA Count:  Date        0
Location    0
Opp         0
W/L         0
Tm          0
Opp.1       0
FG          0
FGA         0
FG%         0
3P          0
3PA         0
3P%         0
FT          0
FTA         0
FT%         0
ORB         0
TRB         0
AST         0
STL         0
BLK         0
TOV         0
PF          0
OPP_FG      0
OPP_FGA     0
OPP_FG%     0
OPP_3P      0
OPP_3PA     0
OPP_3P%     0
OPP_FT      0
OPP_FTA     0
OPP_FT%    

In [6]:
#Column-wise data type check
column_dtypes = gamelog.dtypes
print('Data Types: ', column_dtypes)

# Placeholder for columns that still need a dtype conversion (nothing is converted yet)
cols_to_convert = ['']

Data Types:  Date         object
Location     object
Opp          object
W/L          object
Tm            int64
Opp.1         int64
FG            int64
FGA           int64
FG%         float64
3P            int64
3PA           int64
3P%         float64
FT            int64
FTA           int64
FT%         float64
ORB           int64
TRB           int64
AST           int64
STL           int64
BLK           int64
TOV           int64
PF            int64
OPP_FG        int64
OPP_FGA       int64
OPP_FG%     float64
OPP_3P        int64
OPP_3PA       int64
OPP_3P%     float64
OPP_FT        int64
OPP_FTA       int64
OPP_FT%     float64
OPP_ORB       int64
OPP_TRB       int64
OPP_AST       int64
OPP_STL       int64
OPP_BLK       int64
OPP_TOV       int64
OPP_PF        int64
dtype: object


#### Feature Engineering

In [7]:
#Tag every row with the team name so frames from several teams can be combined later
gamelog['Team'] = 'WSU'

#Victory margin: team points ('Tm') minus opponent points ('Opp.1'); negative means a loss
gamelog['Victory Margin'] = gamelog['Tm'].sub(gamelog['Opp.1'])

#### Data Exploration

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [9]:
#Correlation Matrix
# Pairwise correlations between the int64/float64 columns only
corr = gamelog.select_dtypes(include = ['int64', 'float64']).corr()

plt.figure(figsize = (30, 30))
sns.heatmap(corr,
            annot = True,         # print the coefficient inside every cell
            cmap = 'Spectral',
            vmin = -1, vmax = 1,  # pin the colour scale to the full correlation range
            square = True,
            linewidths = 0.5)
plt.title('Correlation Matrix', fontsize = 20)
# plt.show must be *called*; a bare `plt.show` only echoes the function object
plt.show()

<function matplotlib.pyplot.show(close=None, block=None)>

This correlation matrix looks really cool and tells us some interesting things, but it's important to remember that correlation does not imply causation: a strong correlation does not mean that one factor is affecting another, only that there is a possible link between the two.

In [None]:
#Distribution of victory margin
# sns.distplot is deprecated; histplot with a density-normalised histogram plus a KDE
# overlay (cut=3) is the replacement seaborn documents for the old distplot default.
sns.histplot(gamelog['Victory Margin'],
             color = 'crimson',
             bins = 35,
             alpha = 0.5,
             kde = True,
             stat = 'density',
             kde_kws = {'cut': 3})
plt.title('Distribution of Victory Margin')
plt.show()

In [None]:
# One histogram per numeric (float64/int64) column of the game log
numeric_dtypes = ['float64', 'int64']
gamelog_nums = gamelog.select_dtypes(include = numeric_dtypes)
gamelog_nums.hist(bins = 35, color = 'red', figsize = (16, 20), xlabelsize = 8, ylabelsize = 8)

#### Import Data From Other Teams

Next, let's import the data from other college basketball teams for reference.


In [None]:
#Function for cleaning and adding features
def clean_and_add_columns(gamelog):
    '''Apply this notebook's cleaning and feature-engineering steps to a game log.

    The frame is modified in place (existing callers rely on that) and is also
    returned so the call can be chained or assigned.

    Steps:
      * rename 'Unnamed: 2' to 'Location'
      * drop the empty spacer column 'Unnamed: 23' if it is present
      * fill missing 'Location' values with 'H'
      * add 'Victory Margin' = 'Tm' - 'Opp.1'

    Calling the function again on an already-cleaned frame is safe.

    Parameters
    ----------
    gamelog : pandas.DataFrame
        Game log containing 'Tm', 'Opp.1' and either 'Unnamed: 2' or 'Location'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If 'Tm', 'Opp.1' or the location column is missing.
    '''

    #clean
    # rename() silently skips labels that are absent, so it is already re-run safe
    gamelog.rename(columns = {'Unnamed: 2': 'Location'}, inplace = True)
    # errors='ignore' keeps a second call from raising once the spacer column is gone
    gamelog.drop(columns = ['Unnamed: 23'], inplace = True, errors = 'ignore') #Column is empty and used for spacing

    #fix nan values in Location column
    gamelog['Location'] = gamelog['Location'].fillna('H')

    #add columns
    gamelog['Victory Margin'] = gamelog['Tm'] - gamelog['Opp.1']

    return gamelog
    

In [None]:
#Import and clean data from other teams
# Kansas - per the author's note, the program with the most games played (WSU is second)
# UConn  - winner of the 2024 Men's NCAA tournament
# UW     - the author's former school (class of 2023)
kansas, uconn, uw = (
    pd.read_csv(csv_path, index_col = 'G')
    for csv_path in ('Kansas_Gamelog.csv', 'UConn_Gamelog.csv', 'UW_gamelog.csv')
)

# Run the shared cleaning steps on each frame, then label its rows with the team name
new_data = [kansas, uconn, uw]
for df, team_name in zip(new_data, ('Kansas', 'UConn', 'UW')):
    clean_and_add_columns(df)
    df['Team'] = team_name

#### Compare Teams' Stats

In [None]:
# Stack the four game logs row-wise into a single frame
team_frames = [gamelog, kansas, uconn, uw]
all_data = pd.concat(team_frames, axis = 0)

# One histogram panel per team showing the distribution of 'Victory Margin'
g = sns.FacetGrid(data = all_data, col = 'Team')
g.map(sns.histplot, 'Victory Margin')
plt.show()

In [None]:
#Swarm plot: one point per game, grouped by team
plt.figure(figsize = (10, 6))
sns.swarmplot(data = all_data, x = 'Team', y = 'Victory Margin')
plt.title('Comparison of Margin of Victory Across Teams')
plt.show()

In [None]:
#Create DF of means: one row per team holding the mean of every numeric stat
all_numeric = all_data.select_dtypes(include = ['float64', 'int64']).copy()
# all_data was concatenated without resetting the index, so index labels can repeat
# across teams; attach the team labels by position rather than by index alignment
all_numeric['Team'] = all_data['Team'].to_numpy()
summary_df = all_numeric.groupby('Team').mean().reset_index()

# Numeric-only view keyed by team, so each row can be plotted directly
stat_means = summary_df.set_index('Team')

# Determine the grid size for subplots (wrap after 2 charts)
num_teams = len(summary_df)
num_cols = 2
num_rows = int(np.ceil(num_teams / num_cols))

# Set up the subplot grid
fig, axes = plt.subplots(num_rows, num_cols, figsize = (15, 5 * num_rows), sharey = True)
axes = axes.flatten()

# Colors for each team (groupby sorts teams by name, so the order follows that)
colors = ['blue', 'grey', 'purple', 'crimson']

# Common y-range, computed once. The lower bound only drops below zero when some
# mean is negative (e.g. a losing team's average victory margin), so that bar is
# no longer clipped.
lowest_mean = stat_means.min().min()
y_low = lowest_mean - 5 if lowest_mean < 0 else 0
y_high = stat_means.max().max() + 5

# Iterate through each team and create a bar plot
for i, (team, means) in enumerate(stat_means.iterrows()):
    ax = axes[i]
    # cycle through the palette so more teams than colours cannot raise IndexError
    ax.bar(stat_means.columns, means.to_numpy(), color = colors[i % len(colors)])
    ax.set_title(team)
    ax.set_ylim(y_low, y_high)  # same y-axis limit on every panel for consistency
    # rotate the existing tick labels instead of overwriting them with set_xticklabels
    ax.tick_params(axis = 'x', labelrotation = 90)

# Hide the spare panel left over when the number of teams is odd
for spare_ax in axes[num_teams:]:
    spare_ax.set_visible(False)

# Set a common y-axis label
fig.text(0.04, 0.5, 'Values', va='center', rotation='vertical')
plt.tight_layout()
plt.show()