## 3 Preprocessing

In [1]:
import pandas as pd

data = pd.read_csv("data/kobe.csv")
data = data[data.shot_made_flag.isnull()==False] # 같은 표현 : data = data.dropna()

In [2]:
X = data.ix[:,data.columns != 'shot_made_flag'].copy() # 'shot_made_flag'를 제외한 모든 columns를 선택
y = data.shot_made_flag.copy()

### 3-1 불필요한 feature 제거

In [3]:
for col in X.columns:
    print(col)

action_type
combined_shot_type
game_event_id
game_id
lat
loc_x
loc_y
lon
minutes_remaining
period
playoffs
season
seconds_remaining
shot_distance
shot_type
shot_zone_area
shot_zone_basic
shot_zone_range
team_id
team_name
game_date
matchup
opponent
shot_id


In [4]:
X.drop('game_id', axis=1, inplace=True) # Independent
X.drop('game_event_id', axis=1, inplace=True) # Independent

X.drop('lat', axis=1, inplace=True) # Correlated with loc_x
X.drop('lon', axis=1, inplace=True) # Correlated with loc_y

X.drop('team_id', axis=1, inplace=True) # Always one number
X.drop('team_name', axis=1, inplace=True) # Always LA Lakers

## 3-2 Data Transformation

In [5]:
# Remaining time
X['seconds_from_period_end'] = 60 * X['minutes_remaining'] + X['seconds_remaining']
X['last_5_sec_in_period'] = X['seconds_from_period_end'] < 5

X.drop('minutes_remaining', axis=1, inplace=True)
X.drop('seconds_remaining', axis=1, inplace=True)
X.drop('seconds_from_period_end', axis=1, inplace=True)

## Matchup - (away/home)
X['home_play'] = X['matchup'].str.contains('vs').astype('int')
X.drop('matchup', axis=1, inplace=True)

# Game date
X['game_date'] = pd.to_datetime(X['game_date'])
X['game_year'] = X['game_date'].dt.year
X['game_month'] = X['game_date'].dt.month
X.drop('game_date', axis=1, inplace=True)

# Loc_x, and loc_y binning
X['loc_x'] = pd.cut(X['loc_x'], 25)
X['loc_y'] = pd.cut(X['loc_y'], 25)

# Replace 20 least common action types with value 'Other'
rare_action_types = X['action_type'].value_counts().sort_values().index.values[:20]
X.loc[X['action_type'].isin(rare_action_types), 'action_type'] = 'Other'

## 3-3 Categorical variable to dummies

In [6]:
categorial_cols = [
    'action_type', 'combined_shot_type', 'period', 'season', 'shot_type',
    'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'game_year',
    'game_month', 'opponent', 'loc_x', 'loc_y']

for cc in categorial_cols:
    dummies = pd.get_dummies(X[cc])
    dummies = dummies.add_prefix("{}-".format(cc))
    X.drop(cc, axis=1, inplace=True)
    X = X.join(dummies)

In [7]:
X.head()

Unnamed: 0,playoffs,shot_distance,shot_id,last_5_sec_in_period,home_play,action_type-Alley Oop Dunk Shot,action_type-Alley Oop Layup shot,action_type-Driving Dunk Shot,action_type-Driving Finger Roll Layup Shot,action_type-Driving Finger Roll Shot,...,"loc_y-(457, 490.4]","loc_y-(490.4, 523.8]","loc_y-(523.8, 557.2]","loc_y-(557.2, 590.6]","loc_y-(590.6, 624]","loc_y-(624, 657.4]","loc_y-(657.4, 690.8]","loc_y-(690.8, 724.2]","loc_y-(724.2, 757.6]","loc_y-(757.6, 791]"
1,0,15,2,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,16,3,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,22,4,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,5,False,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,14,6,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- 총 207개 features

## 3-3 Feature selection
- 가장 대표적인 feature selection 방법으로는 Lasso penalty(i.e. 'L1' penalty)를 이용한 방법과 Random Forest를 이용한 방법이 있음
- Lasso : http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
- Random forest : http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

### [1] Lasso

In [8]:
import numpy as np

def get_important_features(model, n_features):
    # coefficient가 0이 아닌 변수들만 고름
    index_list = np.where(model.coef_ != 0)[1]
    coef_list = []
    
    for index in index_list:
        coef_list.append(model.coef_[0][index])
    
    # coefficient가 큰 값으로 정렬하고 변수의 index도 함께 정렬
    coef_list, index_list = (list(t) for t in zip(*sorted(zip(coef_list, index_list), reverse=True)))

    # coefficient가 큰 값을 갖는 순서대로 변수명을 저장
    important_features = []
    for coef, index in zip(coef_list, index_list):
        important_features.append(X.columns[index])
    return important_features[:n_features]

In [9]:
from sklearn.linear_model import LogisticRegression

# C 값을 조정하여 coefficient값이 0인 변수 수를 조절 할 수 있음
model = LogisticRegression(penalty='l1',C=0.1)
model.fit(X,y)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
Lasso_selected_features_ = get_important_features(model, 20)

# 선택된 feature
for feature in Lasso_selected_features_:
    print(feature)

combined_shot_type-Dunk
action_type-Jump Bank Shot
action_type-Running Jump Shot
action_type-Slam Dunk Shot
combined_shot_type-Bank Shot
shot_zone_range-24+ ft.
loc_y-(123, 156.4]
shot_zone_range-16-24 ft.
loc_y-(156.4, 189.8]
action_type-Pullup Jump shot
opponent-NYK
shot_zone_range-8-16 ft.
loc_y-(189.8, 223.2]
action_type-Driving Layup Shot
game_year-2000
game_year-2006
game_month-5
shot_zone_basic-Left Corner 3
shot_zone_basic-Restricted Area
game_year-1999


In [11]:
LR_X = X[Lasso_selected_features_]
LR_X.head()

Unnamed: 0,combined_shot_type-Dunk,action_type-Jump Bank Shot,action_type-Running Jump Shot,action_type-Slam Dunk Shot,combined_shot_type-Bank Shot,shot_zone_range-24+ ft.,"loc_y-(123, 156.4]",shot_zone_range-16-24 ft.,"loc_y-(156.4, 189.8]",action_type-Pullup Jump shot,opponent-NYK,shot_zone_range-8-16 ft.,"loc_y-(189.8, 223.2]",action_type-Driving Layup Shot,game_year-2000,game_year-2006,game_month-5,shot_zone_basic-Left Corner 3,shot_zone_basic-Restricted Area,game_year-1999
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0


### [2] Random forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
num_trees = 100
num_features = 10

model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features)
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [14]:
rf_index_list = np.arange(len(model.feature_importances_))

In [15]:
# Lasso와는 coefficient form이 달라 같은 함수를 적용할 수 없음
# Logic은 동일

rf_index_list = np.arange(len(model.feature_importances_))
rf_coef_list = []

for index in rf_index_list:
    rf_coef_list.append(model.feature_importances_[index])

rf_coef_list, rf_index_list = (list(t) for t in zip(*sorted(zip(rf_coef_list, rf_index_list), reverse=True)))

important_features = []
for coef, index in zip(rf_coef_list, rf_index_list):
    important_features.append(X.columns[index])

RF_selected_features = important_features[:20]

In [20]:
for feature in RF_selected_features:
    print(feature)

shot_id
shot_distance
action_type-Jump Shot
home_play
period-3
period-1
period-2
period-4
action_type-Layup Shot
game_month-3
game_month-1
combined_shot_type-Dunk
game_month-2
game_month-12
game_month-4
game_month-11
action_type-Driving Layup Shot
loc_y-(-10.6, 22.8]
opponent-HOU
opponent-SAS


In [21]:
RF_X = X[RF_selected_features]
RF_X.head()

Unnamed: 0,shot_id,shot_distance,action_type-Jump Shot,home_play,period-3,period-1,period-2,period-4,action_type-Layup Shot,game_month-3,game_month-1,combined_shot_type-Dunk,game_month-2,game_month-12,game_month-4,game_month-11,action_type-Driving Layup Shot,"loc_y-(-10.6, 22.8]",opponent-HOU,opponent-SAS
1,2,15,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,16,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,22,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
5,6,14,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 4 Model

### 4-1 Base model
- 가장 기본적인 모델, Logistic regression, LDA, KNN, Decision Tree, Naive Bayes, SVM을 사용하여 평가해보자

In [22]:
from sklearn.cross_validation import KFold, cross_val_score

# setting parameters
seed = 7
processors=1
num_folds=5
num_instances=len(X)
scoring='log_loss'

kfold = KFold(n=num_instances, n_folds=num_folds, random_state=seed)

In [23]:
# base model를 활용하기 위한 package
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [24]:
# Prepare some basic models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('K-NN', KNeighborsClassifier(n_neighbors=5)))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))

results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors) # accuracy를 보려면 scoring을 지우면 default처리
    results.append(cv_results)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))   
print('\n')

for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors) # accuracy를 보려면 scoring을 지우면 default처리
    results.append(cv_results)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))
print('\n')

for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors) # accuracy를 보려면 scoring을 지우면 default처리
    results.append(cv_results)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))
print('\n')    

LR: (0.664) +/- (0.015)




LDA: (0.680) +/- (0.003)
K-NN: (0.489) +/- (0.047)
CART: (0.582) +/- (0.018)
NB: (0.628) +/- (0.013)


LR: (0.664) +/- (0.015)




LDA: (0.680) +/- (0.003)
K-NN: (0.489) +/- (0.047)
CART: (0.577) +/- (0.016)
NB: (0.628) +/- (0.013)


LR: (0.664) +/- (0.015)




LDA: (0.680) +/- (0.003)
K-NN: (0.489) +/- (0.047)
CART: (0.582) +/- (0.020)
NB: (0.628) +/- (0.013)




### 4-2 Ensemble

In [25]:
# 대표적인 ensemble models
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier

### Bagging (Bootstrap Aggregation)
- Involves taking multiple samples from the training dataset (with replacement) and training a model for each sample. The final output prediction is averaged across the predictions of all of the sub-models.

In [26]:
cart = DecisionTreeClassifier()
num_trees = 100

model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)

results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

(0.651) +/- (0.016)


### Random Forest
- An extension to bagged decision trees. Samples of the training dataset are taken with replacement, but the trees are constructed in a way that reduces the correlation between individual classifiers. Also the tree size is much slowe due to max_features

In [27]:
num_trees = 100
num_features = 10

model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features)

results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

(0.671) +/- (0.004)


## Boosting
- Boosting ensembles creates a sequence of models that attemtp to correct the mistakes of the models before them in the sequence. Once created, the models make predictions which may be weighted by their demonstrated accuracy and the results are combined to create a final output prediction.

In [28]:
model = AdaBoostClassifier(n_estimators=100, random_state=seed)

results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

(0.661) +/- (0.036)


In [30]:
import time

In [34]:
time.time()

1495453836.899004

In [35]:
current_time = time.time()

time.sleep(4)

print(time.time() - current_time)

4.000493049621582


In [36]:
%time
time.sleep(4)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
