# Kaggle
## Competition NFL Big Data Bowl

In [None]:
# Carregando os pacotes
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Statistic lib
from scipy import stats
from scipy.stats import skew, norm
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

import warnings
warnings.filterwarnings('ignore')
import gc
gc.enable()

In [None]:
# Load the training data into memory
train = pd.read_csv(filepath_or_buffer='../data/train.csv')
print("Data is ready !!")

# Data exploration

In [None]:
# Visualizando os primeiros registros do dataset
train.head()

In [None]:
# Visualizando os tipos das features
train.dtypes

In [None]:
# Visualizando dados estatisticos das variaveis numericas
train.describe().T

In [None]:
def percent_missing(df):
    """Return the percentage of missing values for each column.

    Parameters
    ----------
    df : pandas.DataFrame or anything accepted by ``pd.DataFrame(...)``
        The data to inspect.

    Returns
    -------
    dict
        Maps each column label to the share of null entries in that column,
        expressed as a percentage rounded to two decimals. Keys follow the
        column order of the input.
    """
    frame = pd.DataFrame(df)
    # One vectorised null scan over the whole frame instead of one per column
    null_share = frame.isnull().mean()
    return {column: round(share * 100, 2) for column, share in null_share.items()}

In [None]:
# Percentage of missing data per column of the training set, highest first
missing = percent_missing(train)
df_miss = sorted(missing.items(), key=lambda item: item[1], reverse=True)
print('Percent of missing data')
df_miss[:50]

In [None]:
# Plot setup
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))

# Percentage of missing values per feature, keeping only the features that
# actually have missing values, sorted ascending for the bar chart
missing = train.isnull().mean().mul(100).round(2)
missing = missing.loc[missing > 0].sort_values()
missing.plot.bar(color="b", ax=ax)

# Visual presentation
ax.xaxis.grid(False)
ax.set(
    ylabel="Percent of missing values",
    xlabel="Features",
    title="Percent missing data by feature",
)
sns.despine(trim=True, left=True)

## Analisando Correlações

In [None]:
# Pearson correlation between the numeric features.
# `train` also holds string columns; since pandas 2.0 DataFrame.corr no longer
# drops them silently (numeric_only defaults to False) and raises instead, so
# restrict the frame to numeric/boolean columns explicitly. This form behaves
# the same on older pandas versions as well.
cor_mat = train.select_dtypes(include=['number', 'bool']).corr(method='pearson')

# Annotated heatmap of the correlation matrix
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(cor_mat, linewidths=.5, fmt='.1f', ax=ax, square=True, cbar=True, annot=True)
ax.set(title="Pearson correlation between numeric features")

### Analisando a variável target 'Yards'

In [None]:
# Yards: the yardage gained on the play (this is the prediction target)
train["Yards"].describe()

In [None]:
# Distribution of the target variable 'Yards'
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(12, 8))

# Normal fit of the target; mu/std feed both the title and the reference curve
mu, std = norm.fit(train["Yards"])

# sns.distplot is deprecated (seaborn >= 0.11) and slated for removal.
# histplot with stat="density" and kde=True draws the equivalent normalised
# histogram plus kernel density estimate.
sns.histplot(train["Yards"], stat="density", kde=True, color="b", ax=ax)

# histplot has no `fit=` option, so overlay the fitted normal density by hand
yards_grid = np.linspace(train["Yards"].min(), train["Yards"].max(), 200)
ax.plot(yards_grid, norm.pdf(yards_grid, mu, std), color="k")

ax.xaxis.grid(False)
# The histogram is normalised, so the y-axis shows a density, not a count
ax.set(ylabel="Density")
ax.set(xlabel="Yards")
ax.set(title="Yards distribution: mu = %.2f,  std = %.2f" % (mu, std))
sns.despine(trim=True, left=True)

# Annotate skewness and kurtosis near the top-right corner (axes coordinates)
ax.text(x=1.1, y=1, transform=ax.transAxes, s="Skewness: %f" % train["Yards"].skew(),
        fontweight='demibold', fontsize=10, verticalalignment='top', horizontalalignment='right',
        backgroundcolor='white', color='xkcd:poo brown')
ax.text(x=1.1, y=0.95, transform=ax.transAxes, s="Kurtosis: %f" % train["Yards"].kurt(),
        fontweight='demibold', fontsize=10, verticalalignment='top', horizontalalignment='right',
        backgroundcolor='white', color='xkcd:dried blood')

plt.show()

### Analisando as variáveis gerais do JOGO

In [None]:
# year of the season
train['Season'].value_counts()

In [None]:
# week into the season
train['Week'].value_counts()

In [None]:
# description of the stadium environment
train['StadiumType'].value_counts()

In [None]:
# stadium where the game is being played
train['Stadium'].value_counts()

In [None]:
# city where the game is being player
train['Location'].value_counts()

In [None]:
# description of the field surface
train['Turf'].value_counts()

In [None]:
# a unique game identifier
train['GameId'].value_counts()

In [None]:
# a unique play identifier
train['PlayId'].value_counts()

In [None]:
# home or away
train['Team'].value_counts()

### Analisando as variáveis do JOGO

In [None]:
# game quarter (1-5, 5 == overtime)
train['Quarter'].value_counts()

In [None]:
# time on the game clock
train['GameClock'].value_counts()

In [None]:
# home team abbreviation 
train['HomeTeamAbbr'].value_counts()

In [None]:
# home team score before play started
train['HomeScoreBeforePlay'].value_counts()

In [None]:
# visitor team abbreviation
train['VisitorTeamAbbr'].value_counts()

In [None]:
# visitor team score before play started
train['VisitorScoreBeforePlay'].value_counts()

In [None]:
# team with possession
train['PossessionTeam'].value_counts()

In [None]:
# which side of the field the play is happening on
train['FieldPosition'].value_counts()

### Analisando as variáveis de AÇÕES no JOGO

In [None]:
# offensive team positional grouping
train['OffensePersonnel'].value_counts()

In [None]:
# defensive team positional grouping
train['DefensePersonnel'].value_counts()

In [None]:
# direction the play is headed
train['PlayDirection'].value_counts()

In [None]:
# offense formation
train['OffenseFormation'].value_counts()

In [None]:
# number of defenders lined up near the line of scrimmage, spanning the width of the offensive line
train['DefendersInTheBox'].value_counts()

In [None]:
# UTC time of the handoff
train['TimeHandoff'].value_counts()

In [None]:
# UTC time of the snap
train['TimeSnap'].value_counts()

### Analisando as variáveis de TEMPERATURA

In [None]:
# wind direction
train['WindDirection'].value_counts()

In [None]:
# wind speed in miles/hour
train['WindSpeed'].value_counts()

In [None]:
# temperature (deg F)
train['Temperature'].value_counts()

In [None]:
# description of the game weather
train['GameWeather'].value_counts()

In [None]:
# humidity
train['Humidity'].value_counts()

### Analisando as variáveis do JOGADOR

In [None]:
# a unique identifier of the player
train['NflId'].value_counts()

In [None]:
# the NflId of the rushing player
train['NflIdRusher'].value_counts()

In [None]:
# player's name
train['DisplayName'].value_counts()

In [None]:
# player height (ft-in)
train['PlayerHeight'].value_counts()

In [None]:
# player weight (lbs)
train['PlayerWeight'].value_counts()

In [None]:
# birth date (mm/dd/yyyy)
train['PlayerBirthDate'].value_counts()

In [None]:
# where the player attended college
train['PlayerCollegeName'].value_counts()

### Analisando as variáveis de POSICIONAMENTO DO JOGADOR

In [None]:
# player position along the long axis of the field. See figure below.
train['X'].value_counts()

In [None]:
# player position along the short axis of the field. See figure below.
train['Y'].value_counts()

In [None]:
# speed in yards/second
train['S'].value_counts()

In [None]:
# acceleration in yards/second^2
train['A'].value_counts()

In [None]:
# distance traveled from prior time point, in yards
train['Dis'].value_counts()

In [None]:
# orientation of player (deg)
train['Orientation'].value_counts()

In [None]:
# angle of player motion (deg)
train['Dir'].value_counts()

In [None]:
# the yard line of the line of scrimmage
train['YardLine'].value_counts()

### Analisando as variáveis DESCONHECIDAS

In [None]:
# jersey number
train['JerseyNumber'].value_counts()

In [None]:
# the down (1-4)
train['Down'].value_counts()

In [None]:
# yards needed for a first down
train['Distance'].value_counts()

In [None]:
train['Position'].value_counts()