# Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy
from scipy.stats import pearsonr

from wordcloud import WordCloud, STOPWORDS

# Analysis 1

## Number of Players in the MLB Since 1948 by Last School Attended

In [None]:
# Load the college playing records and the school lookup table
college_df = pd.read_csv("baseballdatabank-2022.2/contrib/CollegePlaying.csv")
schools_df = pd.read_csv("baseballdatabank-2022.2/contrib/Schools.csv")

# Only keep records from 1948 onward
college_df = college_df[college_df['yearID'] > 1947]

# Keep a single row per player: the school most recently attended before
# entering the MLB (if an athlete transferred, earlier institutions are ignored).
# Sorting by year descending puts each player's latest record first, so
# keeping the first row per playerID selects it directly.
college_df = college_df.sort_values(by=['playerID', 'yearID'], ascending=[False, False])
college_df = college_df.drop_duplicates(subset='playerID', keep='first')

# Count players per school. Counting rows (instead of summing every column)
# avoids aggregating the non-numeric yearID column along the way.
college_df = college_df.groupby('schoolID').size().reset_index(name='player_ct')

# Sort colleges by number of players (stable, so ties keep schoolID order)
# and reset the index
college_df = college_df.sort_values(by='player_ct', ascending=False, kind='stable')
college_df = college_df.reset_index(drop=True)

# Drop city and country data from the schools dataframe and attach the
# school details to the per-school counts via the shared schoolID key
schools_df = schools_df.drop(columns=["city", "country"])
college_df = pd.merge(college_df, schools_df, on="schoolID")

# Drop the school ID and rename the "name_full" column to "school"
college_df = college_df.drop(columns=["schoolID"])
college_df = college_df.rename(columns={'name_full': 'school'})

# Keep only the top n schools by number of players in the MLB
n = 30
college_n_df = college_df.head(n)

# Bar chart of the top-n schools by player count
fig, ax = plt.subplots(figsize=(10, 6))
plot = sns.barplot(data=college_n_df, x='school', y='player_ct', ax=ax)

# School names are long, so turn the tick labels vertical
plt.setp(plot.get_xticklabels(), rotation=90)

ax.set_title(f'Top {n} Schools With the Most Number of Players Who Have Entered the MLB since 1948')
ax.set_xlabel('Schools')
ax.set_ylabel('# of Players')
plt.show()

# Create and plot wordcloud
# Map each top-n school name to its player count. Both columns come from the
# same top-n frame, and nothing is written back onto it (it is a head() slice
# of college_df, so adding a column would trigger SettingWithCopyWarning).
d = dict(zip(college_n_df['school'], college_n_df['player_ct']))

# prefer_horizontal=1 keeps every school name horizontal
wordcloud = WordCloud(prefer_horizontal=1).generate_from_frequencies(d)

# A single figure is enough; a separate plt.figure() call beforehand would
# only leave an extra empty figure behind.
fig, ax = plt.subplots(figsize=(15, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()

# Analysis 2

# Analysis 3

# Analysis 4