### Package Dependencies

In [1]:
import pymongo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime

# Raise the pandas display limits so wide/long frames are shown without truncation
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

### Connect to MongoDB

In [2]:
# URI of the MongoDB server to read from (local machine, port 27017)
conn = 'mongodb://localhost:27017'

# Client handle used by the following cells to reach the database
client = pymongo.MongoClient(host=conn)

### Pull Data

In [3]:
# Database and the two collections this notebook reads
db = client['eliteprospects']
metadata = db['meta_data']
player_stats = db['player_data']

# Materialise every document of each collection into a DataFrame
meta = pd.DataFrame([document for document in metadata.find()])
players = pd.DataFrame([document for document in player_stats.find()])

### Write/Read CSV

In [4]:
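# Optional local cache of the pulled data.
# Uncomment the first pair of lines to save the frames to CSV, or the second
# pair to reload them from CSV instead of querying MongoDB.
# meta.to_csv('meta.csv')
# players.to_csv('players.csv')
# meta = pd.read_csv('meta.csv')
# players = pd.read_csv('players.csv')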

## Wrangling

### Get the age of each player for each season

In [5]:
def _drop_duplicate_records(frame):
    """Return `frame` without rows that repeat an earlier row's content.

    The Mongo `_id` column is excluded from the comparison: it differs for
    every stored document, so comparing it would prevent any duplicate from
    ever being detected. The first occurrence of each record is kept.
    """
    content_columns = [column for column in frame.columns if column != '_id']
    if not content_columns:
        return frame
    return frame.drop_duplicates(subset=content_columns)


# Set ep_id as an integer in both tables so they can be joined on it
players['ep_id'] = players['ep_id'].astype(int)
meta['ep_id'] = meta['ep_id'].astype(int)

# In case of duplicates, drop them
meta = _drop_duplicate_records(meta)
players = _drop_duplicate_records(players)

In [6]:
# Set aside each player's birthday, one row per ep_id. Keeping a single row per
# player guarantees the left merge below cannot multiply player rows when meta
# holds more than one record for the same ep_id (the first one is kept).
birthdays = meta[['ep_id', 'date_of_birth']].drop_duplicates(subset='ep_id')

# Attach the birthday to every row of the players table
players = pd.merge(players, birthdays, on='ep_id', how='left')

In [7]:
def _season_end_date(season):
    """Return the 'YYYY-04-05' end-of-season date string for a season label.

    For a 'yyyy-yy' label the end year is the four-digit start year plus one,
    so seasons ending before 2000 (e.g. '1998-99') and the '1999-00' rollover
    are both dated correctly. If no four-digit start year is present, the
    two digits after the dash are prefixed with '20', as this cell did before.
    Non-string or unmatched values give NaN.
    """
    if not isinstance(season, str):
        return np.nan

    with_start_year = re.search(r'(\d{4})-\d{2}', season)
    if with_start_year:
        return str(int(with_start_year.group(1)) + 1) + '-04-05'

    end_digits_only = re.search(r'-(\d{2})', season)
    if end_digits_only:
        return '20' + end_digits_only.group(1) + '-04-05'

    return np.nan


# Extract the end of season date for each yyyy-yy season variable
players['end_of_season'] = players['season'].map(_season_end_date)

In [8]:
# Parse both date columns into pandas datetimes
for date_column in ('date_of_birth', 'end_of_season'):
    players[date_column] = pd.to_datetime(players[date_column])

# Age at the end of each season: days from birth to season end, divided by
# 365 and rounded to the nearest whole year
days_alive_at_season_end = (players['end_of_season'] - players['date_of_birth']).dt.days
players['age'] = (days_alive_at_season_end / 365).round().astype(int)

# Stamp every row with the date the notebook is run
players['current_date'] = pd.to_datetime('today')

# Current age: days from birth to the run date, converted the same way
days_alive_today = (players['current_date'] - players['date_of_birth']).dt.days
players['current_age'] = (days_alive_today / 365).round().astype(int)

In [12]:
# Youngest current age (inclusive) a player may have to stay in the data set
MIN_CURRENT_AGE = 25

# Keep only players who are 25 or older at the time of running. The result is
# copied so later cells that assign columns on `players` modify an independent
# frame rather than a slice of the unfiltered one.
players = players[players['current_age'] >= MIN_CURRENT_AGE].copy()

### Manage +/-

In [14]:
# Numeric stat columns that the reshape cells later cast to integers.
# NOTE(review): any of these may hold '-' as a "no value" marker; confirm against the data.
stat_columns = ['assists', 'games_played', 'goals', 'penalty_min', 'plus_minus']

# Deal with mis-typed values in the plus_minus column
players['plus_minus'] = players['plus_minus'].replace('--6', '-6')

# Impute 0 in place of '-' in the stat columns only, so a '-' in a text column
# such as team or league is not turned into the number 0.
# (The original author notes plus_minus is roughly gaussian with mean 0.)
players[stat_columns] = players[stat_columns].replace('-', 0)
players['plus_minus'] = players['plus_minus'].astype(int)

### Reshape quantitative data

In [15]:
# Dtypes applied to each player's rows before aggregating
types = {
    'assists': 'int64',
    'games_played': 'int64',
    'goals': 'int64',
    'penalty_min': 'int64',
    'plus_minus': 'int64',
    'age': 'object'
}

# How each stat is combined when a player has several rows at the same age
stat_aggregations = {
    'games_played': 'sum',
    'goals': 'sum',
    'assists': 'sum',
    'penalty_min': 'sum',
    'plus_minus': 'mean'
}

# One single-row frame per player, collected here and concatenated once at the end
player_frames = []

# sort=False visits players in order of first appearance in the table
for _, player_rows in players.groupby('ep_id', sort=False):

    # Have to groupby each players age to account for in-season trades
    season_stats = (
        player_rows.drop_duplicates()
        .astype(types)
        .groupby(['age', 'ep_id'])
        .agg(stat_aggregations)
        .reset_index()
    )

    # Pivot the data frame to form a single row vector for each player
    wide = season_stats.pivot(index='ep_id', columns='age')

    # Flatten the (stat, age) column MultiIndex into '<stat>_<age>' labels
    wide.columns = [f'{stat}_{int(age)}' for stat, age in wide.columns]

    player_frames.append(wide)

# Stack the per-player rows; columns missing for a player are left as NaN
new_df = pd.concat(player_frames, axis=0, sort=False) if player_frames else pd.DataFrame()

### Reshape qualitative data

In [16]:
# One single-row frame per player, collected here and concatenated once at the end
player_frames = []

# sort=False visits players in order of first appearance in the table
for _, player_rows in players.groupby('ep_id', sort=False):

    # Unique positional labels so idxmax results can be looked up with .loc
    player_rows = player_rows.reset_index(drop=True)

    # For each age, pick the row with the most games played; when several rows
    # tie, the first of them is used
    games_played = player_rows['games_played'].astype(int)
    primary_row_labels = games_played.groupby(player_rows['age']).idxmax()

    # Team and league of that row represent the player's season at that age
    primary = player_rows.loc[primary_row_labels, ['age', 'team', 'league']].copy()
    primary['ep_id'] = player_rows['ep_id'].iloc[0]

    # Pivot to a single row per player with (field, age) columns
    wide = primary.pivot(index='ep_id', columns='age')

    # Flatten the (field, age) column MultiIndex into '<field>_<age>' labels
    wide.columns = [f'{field}_{int(age)}' for field, age in wide.columns]

    player_frames.append(wide)

# Stack the per-player rows; columns missing for a player are left as NaN
qual_df = pd.concat(player_frames, axis=0, sort=False) if player_frames else pd.DataFrame()

### Merge qualitative and quantitative dataframes

In [17]:
# Attach the team/league columns to the stat columns for each ep_id, then
# turn the resulting index into ordinary columns
combined = new_df.merge(qual_df, how='left', on='ep_id').reset_index()

### Combine Meta and Combined dataframes

In [18]:
# Attach the reshaped per-age columns to every meta row; meta rows without a
# match in `combined` keep NaN in those columns
main = pd.merge(meta, combined, on='ep_id', how='left')

# Drop the Mongo document id and round numeric columns to whole numbers.
# `columns=` is used because the positional axis argument of drop() is no
# longer accepted by pandas 2.x.
main = main.drop(columns=['_id']).round()

### Save and inspect data

In [19]:
#main.to_csv('main.csv')