![1_2p1GIUUcRSzyyJjSj4x7Iw](https://user-images.githubusercontent.com/52401767/74623348-daf98d80-5176-11ea-8c70-7494a60b0794.jpeg)
# Welcome back to my kernel!
- This is my 'naive' approach for the [Google Cloud & NCAA® ML Competition 2020-NCAAM](https://www.kaggle.com/c/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament)
- As usual, if you find this work useful, please <font color='red' size=3>upvote this kernel</font> in the top-right corner 💖💖

P.S.: I come from Vietnam, so please excuse any English grammar mistakes throughout this notebook 😊😊


# Preparation

In [None]:
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML
import random

# Feature engineering

This work builds on [Pentagram](https://www.kaggle.com/nxrprime)'s great work, with some changes of my own.

In [None]:
# Load the tournament results and discard WLoc, NumOT and DayNum, which the
# later cells of this notebook never read.
result = pd.read_csv(
    '../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/MNCAATourneyCompactResults.csv'
).drop(columns=['WLoc', 'NumOT', 'DayNum'])
result.head()

In [None]:
# Load the tournament seeds and turn each seed string into an integer by
# parsing its characters at positions 1-2 (the character at position 0 is skipped).
seeds = pd.read_csv(
    '../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/MNCAATourneySeeds.csv'
)
seeds['Seed'] = seeds['Seed'].str[1:3].astype(int)
seeds.head()

In [None]:
# Load the team spellings, keep only the last listed spelling for each TeamID,
# and upper-case it for display.
team_name = (
    pd.read_csv(
        '../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/MTeamSpellings.csv',
        encoding='cp1252',
    )
    .drop_duplicates(subset=['TeamID'], keep='last')
    .reset_index(drop=True)
)
team_name['TeamNameSpelling'] = team_name['TeamNameSpelling'].str.upper()
team_name.head()

In [None]:
# Average seed per team over all seasons, sorted ascending so the teams with
# the numerically lowest average seeds come first; keep the first 20.
teams_avg_seed = seeds.groupby('TeamID').Seed.mean().sort_values()
low_seed = pd.merge(left=teams_avg_seed, right=team_name, how='left', on=['TeamID']).head(20)

plt.figure(figsize=(20,10))
# x/y are passed by keyword: newer seaborn releases no longer accept them positionally.
ax = sns.barplot(x=low_seed.Seed, y=low_seed.TeamNameSpelling)
plt.title('Top 20 rich-traditional NCAA Teams')
# Write each bar's value (the average seed) just inside the end of the bar.
for p in ax.patches:
    plt.text(p.get_width()-0.07,
             p.get_y()+0.55*p.get_height(),
             f'{p.get_width():.2f}', ha='center', va='center')

#### The two most prestigious and tradition-rich teams are the Duke Blue Devils and the Kansas Jayhawks. I predicted this before entering the competition: the Blue Devils have won 5 NCAA Championships (fourth all-time) and hold an NCAA-best 75.5% NCAA tournament winning percentage, while the Jayhawks currently hold the all-time record for consecutive conference titles, with 14 in a row (2005-2018), and also claim 5 National Championships overall.

![maxresdefault](https://user-images.githubusercontent.com/52401767/74656114-1cfaf180-51c0-11ea-9fb1-b5a73955ad91.jpg)


#### List of NCAA basketball champions in the period 1985-2019

In [None]:
# Attach the winning team's name to every tournament game.
Wname = team_name.rename(columns={'TeamNameSpelling':'Wteam_name', 'TeamID':'WTeamID'})
team_result = pd.merge(left=result, right=Wname, how='left', on=['WTeamID'])

# Number of tournament wins per (Season, Team).
win_by_year = (
    team_result.groupby(['Season', 'Wteam_name'])['WScore'].count().reset_index()
    .rename(columns={'Wteam_name':'Team', 'WScore':'Win_matches'})
)

# The champion of a season is taken to be the team with the most wins in that
# season. Comparing against the per-season maximum (rather than one global
# maximum) keeps every season in the list even if the winning team's number
# of games differs from year to year.
season_best = win_by_year.groupby('Season')['Win_matches'].transform('max')
df = win_by_year[win_by_year.Win_matches == season_best].drop(columns=['Win_matches']).set_index('Season')
df

In [None]:
# Count how many times each team appears in the champions table `df`.
plt.figure(figsize=(22,8))
# The column is passed as x=: newer seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
ax = sns.countplot(x=df.Team)
# Print the count above each bar.
for p in ax.patches:
    ax.text(p.get_x()+p.get_width()/2., p.get_height()+0.05, f'{p.get_height()}',ha='center')
plt.title('Most successful colleges in the NCAA Tournament')
plt.ylabel('Number of NCAA titles')

### Create a bar chart race to gain greater insight into these basketball teams' performance

The pause button is in the middle

[Reference](https://towardsdatascience.com/bar-chart-race-in-python-with-matplotlib-8e687a5c8a41)

In [None]:
# Wide table (one row per team, one column per season) of cumulative
# tournament wins up to and including each season.
cumulative_wins = (
    win_by_year.groupby(['Team', 'Season']).Win_matches.sum()
    .unstack(fill_value=0)
    .cumsum(axis=1)
)
# Rank by the total reached in the latest season present in the data and keep
# the top 15 teams.
latest_season = cumulative_wins.columns.max()
win_by_year = cumulative_wins.sort_values(by=latest_season, ascending=False).head(15)

# Back to long format. Carrying 'Team' through the melt as an id column keeps
# team labels aligned with their rows for any number of seasons.
cum_win = win_by_year.reset_index().melt(id_vars='Team', var_name='Season', value_name='Win_matches')
cum_win = cum_win[['Season', 'Team', 'Win_matches']]

In [None]:
# Figure and axes shared by every animation frame.
fig, ax = plt.subplots(figsize=(15, 8))

# 15 random '#RRGGBB' colours, paired with the first 15 entries of cum_win.Team
# (zip stops when the colours run out).
HEX_DIGITS = '0123456789ABCDEF'
colors = []
for _ in range(15):
    colors.append("#" + ''.join(random.choice(HEX_DIGITS) for _ in range(6)))
team2color = dict(zip(cum_win.Team, colors))

def draw_barchart(current_season):
    """Redraw the horizontal bar chart on the shared `ax` for one season.

    Reads the module-level `cum_win` table and `team2color` mapping, clears
    `ax`, and draws one bar per team for the rows whose 'Season' equals
    `current_season`, sorted by ascending 'Win_matches'.
    """
    season_rows = cum_win[cum_win['Season'] == current_season]
    dff = season_rows.sort_values(by='Win_matches', ascending=True)
    teams = dff['Team']
    wins = dff['Win_matches']

    ax.clear()
    ax.barh(teams, wins, color=teams.map(team2color))

    # Small horizontal gap between the bar end and the two labels.
    dx = wins.max() / 200
    for row, (value, name) in enumerate(zip(wins, teams)):
        # Team name just inside the bar, win count just outside it.
        ax.text(value-dx, row, name, size=10, weight=600, ha='right', va='bottom')
        ax.text(value+dx, row, f'{value:}', size=10, ha='left', va='center')

    # Large season watermark, axis caption and chart title, in axes coordinates.
    ax.text(1, 0.4, current_season, transform=ax.transAxes,
            color='#777777', size=46, ha='right', weight=800)
    ax.text(0, 1.06, 'Matches', transform=ax.transAxes, size=12, color='#777777')

    # x axis: integer tick labels on top, grey; no y ticks.
    x_axis = ax.xaxis
    x_axis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    x_axis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])

    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.15, 'Number of win matches of top 15 NCAA teams from 1985 to 2019',
            transform=ax.transAxes, size=24, weight=600, ha='left', va='top')
    plt.box(False)
    
# Build the animation: draw_barchart is called once per season from 1985
# through 2019, with interval=500 between frames, repeating when it finishes.
animator = animation.FuncAnimation(fig, draw_barchart, frames=range(1985, 2020),repeat=True, interval=500)
# Render the animation as an interactive JavaScript/HTML player in the cell output.
HTML(animator.to_jshtml())

## Merge seeds

In [None]:
# Two renamed copies of the seed table, one keyed for the winner and one for the loser.
Wseeds = seeds.rename(columns={'TeamID': 'WTeamID', 'Seed': 'WSeed'})
Lseeds = seeds.rename(columns={'TeamID': 'LTeamID', 'Seed': 'LSeed'})

# Winner seeds are attached with a left join; loser seeds use merge's default
# (inner) join.
data = (
    result
    .merge(Wseeds, how='left', on=['Season', 'WTeamID'])
    .merge(Lseeds, on=['Season', 'LTeamID'])
)

data.head()

In [None]:
# Per season: games won by the team with the numerically lower seed, and games
# won by the team with the numerically higher seed (equal seeds count for neither).
low_seed_win = data[data.WSeed < data.LSeed].groupby('Season')['WSeed'].count()
high_seed_win = data[data.WSeed > data.LSeed].groupby('Season')['WSeed'].count()

plt.figure(figsize=(20,10))
N = len(low_seed_win)
# Error bars show each season's absolute deviation from the all-season mean.
# matplotlib requires non-negative yerr values, so the sign is dropped.
low_seed_win_std = (low_seed_win - low_seed_win.mean()).abs()
high_seed_win_std = (high_seed_win - high_seed_win.mean()).abs()

ind = np.arange(N)
width = 0.35

# Stacked bars: lower-seed wins at the bottom, higher-seed wins on top.
p1 = plt.bar(ind, low_seed_win, width, yerr=low_seed_win_std)
p2 = plt.bar(ind, high_seed_win, width,
             bottom=low_seed_win, yerr=high_seed_win_std)
# Dashed reference line at the mean number of lower-seed wins per season.
p3 = plt.plot(np.arange(-1,N+1), [low_seed_win.mean()]*(N+2), linestyle='--', color='yellow')
plt.ylabel('Number of matches')
plt.title('Winning ratio over years')
plt.xticks(ind, low_seed_win.index)
plt.yticks(np.arange(0, 81, 10))

plt.legend((p1[0], p2[0], p3[0]), ('Low seed wins', 'High seed wins', f'Average : {low_seed_win.mean():.2f} (matches)'))

plt.show()

#### On average, a team with the numerically higher (i.e. weaker) seed has up to a 44.6% chance of beating its more highly rated opponent. This is easy to understand and has happened many times in history. Take Loyola Marymount in 1990: they were the 11th seed that year, but the Lions were a blast to watch. They averaged 122.4 points per game, which is discussed below in the NCAA records section 🌟🌟

In [None]:
# Regular-season game results; the per-team scoring tables below are built from this.
scores = pd.read_csv(
    '../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/MRegularSeasonCompactResults.csv',
)
scores.head()

In [None]:
# One row per (game, team): the winner's score and the loser's score of every
# game are reshaped into a common Season / TeamID / Score layout.
# Each frame is named after the side of the game its columns come from.
Wscores = scores[['Season', 'WTeamID', 'WScore']].rename(columns={'WTeamID':'TeamID', 'WScore':'Score'})
Lscores = scores[['Season', 'LTeamID', 'LScore']].rename(columns={'LTeamID':'TeamID', 'LScore':'Score'})

# Loser rows first, then winner rows.
result_scores = pd.concat([Lscores, Wscores])
result_scores.head()

In [None]:
# Total points per (Season, TeamID) over all rows of result_scores.
season_score = result_scores.groupby(['Season', 'TeamID']).Score.sum()
# Five largest season totals.
season_score.sort_values(ascending=False).iloc[:5]

In [None]:
# Show the 15 rows of result_scores with the highest single-game team score.
result_scores.sort_values(by='Score', ascending=False).head(15)

### Somehow, teams 1258 and 1328 were involved in all four of the highest-scoring games, nine of the top 16, and four of the five highest season scoring totals. Who are they?

In [None]:
# Look up the names recorded for team IDs 1258 and 1328.
team_name[team_name.TeamID.isin([1258, 1328])]

### Loyola Marymount Lions and Oklahoma Sooners

In 1991, when two of the top four highest-scoring NCAAT games were recorded, the Lions scored 186 points in one crazy match — 15 more than the next-highest team, the Sooners. In the previous year, however, Oklahoma's best single-match score was 7 points higher than Loyola Marymount's. The two were real rivals at that time: the Lions were masters of getting other teams to play at their pace, and the Sooners even beat Kansas twice in regular-season play.


![sportfximg-119214-10809-1psbkb q8y0g](https://user-images.githubusercontent.com/52401767/74666274-ca2b3500-51d3-11ea-9939-d5efe3a25370.jpg)


## Merge score

In [None]:
# Attach each side's season scoring total (season_score) to every game, name
# the new columns WScoreT / LScoreT, then drop the in-game scores WScore / LScore.
data = (
    data
    .merge(season_score, how='left',
           left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns={'Score': 'WScoreT'})
    .merge(season_score, how='left',
           left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns={'Score': 'LScoreT'})
    .drop(columns=['WScore', 'LScore'])
)
data.head()

In [None]:
# Per season: games won by the team with the lower season scoring total, and
# games won by the team with the higher one (equal totals count for neither).
low_score_win = data[data.WScoreT < data.LScoreT].groupby('Season')['WSeed'].count()
high_score_win = data[data.WScoreT > data.LScoreT].groupby('Season')['WSeed'].count()

plt.figure(figsize=(20,10))
N = len(low_score_win)
# Error bars show each season's absolute deviation from the all-season mean.
# matplotlib requires non-negative yerr values, so the sign is dropped.
low_score_win_std = (low_score_win - low_score_win.mean()).abs()
high_score_win_std = (high_score_win - high_score_win.mean()).abs()

ind = np.arange(N)
width = 0.35

# Stacked bars: lower-total wins at the bottom, higher-total wins on top.
p1 = plt.bar(ind, low_score_win, width, yerr=low_score_win_std)
p2 = plt.bar(ind, high_score_win, width,
             bottom=low_score_win, yerr=high_score_win_std)
# Dashed reference line at the mean number of lower-total wins per season.
p3 = plt.plot(np.arange(-1,N+1), [low_score_win.mean()]*(N+2), linestyle='--', color='yellow')
plt.ylabel('Number of matches')
plt.title('Winning ratio over years')
plt.xticks(ind, low_score_win.index)
plt.yticks(np.arange(0, 81, 10))
plt.legend((p1[0], p2[0], p3[0]), ('Low score wins', 'High score wins', f'Average : {low_score_win.mean():.2f} (matches)'))

plt.show()

#### On average, the team with the lower regular-season scoring total has about a 24% chance of beating its opponent

## Keep the Team ID feature

In [None]:
# Winner-perspective rows: drop Season, keep both team IDs, and rename so that
# the winner becomes team 1 and the loser team 2.
winner_first_names = {
    'WSeed': 'Seed1', 'LSeed': 'Seed2',
    'WScoreT': 'ScoreT1', 'LScoreT': 'ScoreT2',
    'WTeamID': 'TeamID_1', 'LTeamID': 'TeamID_2',
}
Wdata = data.drop(columns=['Season']).rename(columns=winner_first_names)
Wdata.head()

In [None]:
# Loser-perspective rows: the same games with the loser as team 1 and the
# winner as team 2. The non-inplace rename returns a new frame, so the columns
# added to Ldata later are not written into a slice of `data`.
Ldata = data[['LTeamID', 'WTeamID', 'LSeed', 'WSeed', 'LScoreT', 'WScoreT']].rename(
    columns={'LTeamID':'TeamID_1', 'WTeamID':'TeamID_2',
             'LSeed':'Seed1', 'WSeed':'Seed2',
             'LScoreT':'ScoreT1', 'WScoreT':'ScoreT2'})
Ldata.head()

## Calculate the seed and score difference

In [None]:
# Add team-1-minus-team-2 differences of seed and season scoring total to both
# the winner-first and loser-first tables.
for frame in (Wdata, Ldata):
    frame['Seed_diff'] = frame['Seed1'] - frame['Seed2']
    frame['ScoreT_diff'] = frame['ScoreT1'] - frame['ScoreT2']

In [None]:
# At present, I drop the Season columns but this figure can play an important role 
Wdata['result'] = 1
Ldata['result'] = 0
train = pd.concat((Wdata, Ldata)).reset_index(drop=True)
#train = train[train.TeamID_1 < train.TeamID_2]
#train['Season'] = data.Season
train.head()

In [None]:
# Split the training rows by label.
result0 = train[train['result'] == 0]
result1 = train[train['result'] == 1]

# Every column except the team IDs and the label gets its own panel.
cols = train.columns.drop(['TeamID_1', 'TeamID_2', 'result'])
fig, ax = plt.subplots(2,3,figsize=(22,7))

for panel, col in enumerate(cols, start=1):
    plt.subplot(2, 3, panel)
    plt.xlabel(col, fontsize=9)
    # One density curve per outcome, losses first.
    for subset, label in ((result0, 'Result: Lose'), (result1, 'Result: Win')):
        sns.kdeplot(subset[col].values, bw=0.5, label=label)
plt.show()

It's hard for me to draw any conclusion here: the patterns fluctuate sharply, and there doesn't seem to be any clear correlation between these features and either result. As [Mr. Clutch](https://www.pinterest.cl/pin/326229566755829747/) put it:

![4fa546e81c8c06ae99f941bd89af2cb8](https://user-images.githubusercontent.com/52401767/74668815-91418f00-51d8-11ea-8c0e-647e34895856.jpg)


In [None]:
# A simple heatmap of the pairwise correlations between all columns except the team IDs.
plt.figure(figsize=(10,7))
cols = train.columns.drop(['TeamID_1', 'TeamID_2'])
corr = train[cols].corr()
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

# Prepare test data

In [None]:
# Split each submission ID on '_' and store its first three parts as the
# integer columns Season, TeamID_1 and TeamID_2; the ID column is then dropped.
test = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv')
id_parts = test['ID'].str.split('_', expand=True)
for position, column in enumerate(['Season', 'TeamID_1', 'TeamID_2']):
    test[column] = id_parts[position].astype(int)
test = test.drop(columns=['ID'])
test.head()

In [None]:
# Convert test data to the train set's format
# Seeds: look up the seed of TeamID_1, then of TeamID_2, for the row's Season.
# After each merge the right-hand 'TeamID' key column is dropped again.
test = pd.merge(test, seeds, left_on=['Season', 'TeamID_1'], right_on=['Season', 'TeamID'], how='left')
test.rename(columns={'Seed':'Seed1'}, inplace=True)
test = test.drop('TeamID', axis=1)
test = pd.merge(test, seeds, left_on=['Season', 'TeamID_2'], right_on=['Season', 'TeamID'], how='left')
test.rename(columns={'Seed':'Seed2'}, inplace=True)
test = test.drop('TeamID', axis=1)
# Season scoring totals for both teams, taken from season_score.
# NOTE(review): unlike the seed merges, no 'TeamID' column is dropped here —
# presumably because the keys come from season_score's index; confirm.
test = pd.merge(test, season_score, left_on=['Season', 'TeamID_1'], right_on=['Season', 'TeamID'], how='left')
test.rename(columns={'Score':'ScoreT1'}, inplace=True)
test = pd.merge(test, season_score, left_on=['Season', 'TeamID_2'], right_on=['Season', 'TeamID'], how='left')
test.rename(columns={'Score':'ScoreT2'}, inplace=True)
# Same team-1-minus-team-2 difference features as in the training table.
test['Seed_diff'] = test['Seed1'] - test['Seed2']
test['ScoreT_diff'] = test['ScoreT1'] - test['ScoreT2']
#test = test.drop(columns=['Pred', 'Season', 'TeamID_1', 'TeamID_2'])
# Drop the placeholder prediction and Season; the team IDs are kept.
test = test.drop(columns=['Pred', 'Season'])
test.head()

# One-hot Encoding

I will encode the team IDs (and maybe Season as well, in the near future) with `pd.get_dummies`, which saves memory/CPU resources compared to the corresponding sklearn `OneHotEncoder` and avoids crashing the session.

In [None]:
# Separate the label from the features, and take a copy of the test features.
y_train = train['result']
X_train = train.drop(columns='result')
X_test = test.copy()

# Stack train rows above test rows so both are encoded together.
data_full = pd.concat([X_train, X_test])
data_full.shape

In [None]:
# Columns to one-hot encode.
OH_cols = ['TeamID_1', 'TeamID_2']

# One-hot encode both team-ID columns over the stacked train+test table, so
# both parts get the same dummy columns, and convert the sparse result to a
# SciPy COO matrix. drop_first omits the first level of each column;
# dummy_na adds an indicator column for missing values.
OH_full = pd.get_dummies(
    data_full[OH_cols],
    columns=OH_cols,
    drop_first=True,
    dummy_na=True,
    sparse=True,
).sparse.to_coo()

In [None]:
# The remaining columns, each divided by its own column maximum.
retain_full = data_full.drop(columns=OH_cols)
column_max = retain_full.max()
retain_full = retain_full.div(column_max, axis='columns')
retain_full.head()

In [None]:
# Final feature matrix: one-hot team IDs, the scaled columns, and the squares
# of the scaled columns, stacked side by side in CSR format.
feature_blocks = [OH_full, retain_full, retain_full**2]
encoded_full = scipy.sparse.hstack(feature_blocks).tocsr()
print(encoded_full.shape)

# The first len(X_train) rows are the training rows; the rest are test rows.
n_train = len(X_train)
encoded_train, encoded_test = encoded_full[:n_train], encoded_full[n_train:]

In [None]:
# Reload the sample submission; its Pred column is overwritten with the model's predictions later.
submission = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv')
submission.head()

# Model

![0_Q1DUD6HIeksecpPq](https://user-images.githubusercontent.com/52401767/74579824-3a696900-4fd0-11ea-8c65-47dbc8e86f45.png)

## Light Gradient Boosting Machine
- Faster training speed and higher efficiency.
- Lower memory usage.
- Better accuracy.
- Support of parallel and GPU learning.
- Capable of handling large-scale data.

[Reference](https://lightgbm.readthedocs.io/en/latest/)

In [None]:
# Gradient-boosted binary classifier: many trees with a small learning rate and
# no depth limit (max_depth=-1).
# The evaluation metric is set through LightGBM's `metric` parameter;
# `eval_metric` is an argument of fit(), not a constructor parameter.
LGB = LGBMClassifier(
    n_estimators=10000,
    learning_rate=0.005,
    max_depth=-1,
    objective='binary',
    metric='cross_entropy',
    first_metric_only=True,
)

In [None]:
# Stratified 10-fold cross-validation: refit LGB on each training split,
# record the log loss on the held-out split, and collect the test-set
# probabilities of every fold for averaging.
n_folds = 10
cv = StratifiedKFold(n_splits=n_folds, shuffle=True)
losses = []
LGB_predicts = []
# The fold index arrays get their own names so the `train` DataFrame built
# earlier is not overwritten by the loop variable.
for train_idx, valid_idx in cv.split(encoded_train, y_train):
    # cv.split yields positions, so the labels are selected with .iloc.
    fold_X, fold_y = encoded_train[train_idx], y_train.iloc[train_idx]
    valid_X, valid_y = encoded_train[valid_idx], y_train.iloc[valid_idx]

    LGB.fit(fold_X, fold_y,
            eval_set=[(fold_X, fold_y), (valid_X, valid_y)],
            verbose=False)

    # Binary log loss on the held-out fold; probabilities are clipped away
    # from 0 and 1 so the logarithms stay finite.
    valid_pred = np.clip(LGB.predict_proba(valid_X)[:, 1], 1e-15, 1 - 1e-15)
    valid_true = valid_y.values
    losses.append(float(-np.mean(valid_true * np.log(valid_pred)
                                 + (1 - valid_true) * np.log(1 - valid_pred))))

    test_pred = LGB.predict_proba(encoded_test)[:, 1]
    LGB_predicts.append(test_pred)

# Take the average probability over the 10 folds
LGB_predicts = np.asarray(LGB_predicts)
LGB_predict = np.mean(LGB_predicts, axis=0)

In [None]:
# Overwrite the template's Pred column with the fold-averaged probabilities and save it.
submission['Pred'] = LGB_predict
submission.to_csv('LGB.csv', index=False)

# Optional

Instead of repeatedly submitting your predictions, you can build the ground-truth labels and practice on your local machine

In [None]:
# Tournament results for seasons after 2014, reduced to season, winner and
# loser, with a fresh 0..n-1 index.
opt_result = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/MNCAATourneyCompactResults.csv')
after_2014 = opt_result['Season'] > 2014
opt_result = opt_result.loc[after_2014, ['Season', 'WTeamID', 'LTeamID']].reset_index(drop=True)
opt_result.head()

In [None]:
# Convert the result to the sample submission's format. Whole columns are
# built at once, which also works when there are no rows.
team_ids = opt_result[['WTeamID', 'LTeamID']]
lower_id = team_ids.min(axis=1).astype(str)
higher_id = team_ids.max(axis=1).astype(str)
# ID is "<Season>_<smaller team ID>_<larger team ID>".
opt_result['ID'] = opt_result['Season'].astype(str) + '_' + lower_id + '_' + higher_id
# Pred is 1 when the winner is the team with the smaller ID, else 0 (stored as float).
opt_result['Pred'] = (opt_result['WTeamID'] < opt_result['LTeamID']).astype(float)

opt_result = opt_result[['ID', 'Pred']]
opt_result.head()

In [None]:
# Load the sample submission without its Pred column; what remains is the list of matchups to label.
opt_test = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv').drop(columns='Pred')
opt_test.head()

In [None]:
# This file can serve as your answer key: every matchup of the sample
# submission gets the label of the game actually played, and matchups with no
# game in opt_result get 0.
predict = opt_test.merge(opt_result, how='left', on='ID')
predict['Pred'] = predict['Pred'].fillna(0)
predict.to_csv('Phan_Viet_Hoang.csv', index=False)

<p><font size="3" color="green">Last update on 17/2/2020</font></p> 
<p><font size="5" color="yellow">What's next?</font></p>

- Try other feature engineering techniques (especially with the time-series data)
- Add some visualizations (like my previous work 😅😅)
- Fine-tune the vanilla CatBoost
- Update them in this kernel

<p><font size="3" color="red">Thank you</font> for spending time on my kernel!</p>