# Modeling

In [1]:
from function import *

In [2]:
# load data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

### 1. Select columns & Encoding

In [3]:
# Select columns
category = ['Home']
continuous = ['Possession', 'Shots', 'Touches', 'Passes',
              'Tackles', 'Clearances','SOT', 'Corners', 'Offsides', 'Goal', ]

In [4]:
# make train/test data

train_cols, test_cols = [], []

# category
for cat in category:
    train_tok, test_tok = category_to_ohe(train[cat],test[cat])
    train_cols.append(train_tok)
    test_cols.append(test_tok)    

# continuous
for con in continuous:
    train_cols.append(train[con].values.reshape(len(train),1))
    test_cols.append(test[con].values.reshape(len(test),1))
 

In [5]:
# stack train/test data
X_train = np.hstack(tuple(each for each in train_cols))
X_test = np.hstack(tuple(each for each in test_cols))
y_train = train['Result']

In [6]:
X_train

array([[ 1. , 70.1, 23. , ...,  7. ,  2. ,  0. ],
       [ 0. , 29.9,  4. , ...,  0. ,  1. ,  0. ],
       [ 1. , 59.8, 15. , ...,  6. ,  2. ,  5. ],
       ...,
       [ 0. , 34.7, 16. , ...,  4. ,  0. ,  1. ],
       [ 1. , 32.9,  6. , ...,  4. ,  0. ,  0. ],
       [ 0. , 67.1, 15. , ...,  5. ,  2. ,  5. ]])

In [7]:
X_test

array([[ 1. , 70. , 27. , ...,  9. ,  5. ,  4. ],
       [ 0. , 30. ,  6. , ...,  4. ,  3. ,  3. ],
       [ 1. , 21.8,  6. , ...,  3. ,  6. ,  0. ],
       ...,
       [ 0. , 35.4, 16. , ...,  4. ,  4. ,  4. ],
       [ 1. , 56.6, 15. , ...,  6. ,  4. ,  3. ],
       [ 0. , 43.4, 14. , ...,  6. ,  1. ,  1. ]])

In [8]:
y_train.tail()

3795    1
3796    1
3797    0
3798    0
3799    1
Name: Result, dtype: int64

---

### 2. Modeling

 - 조건부 확률 모형 : 각 클래스가 정답일 조건부 확률을 계산

    - 조건부 확률기반 생성모형 : 베이즈 정리를 사용

        - LDA (linear discriminant analysis)
        - QDA (Quadratic Discriminanat Analysis)
        - 나이브 베이지안 (Naive Bayes)
    
    - 조건부 확률기반 판별모형 :  직접 조건부 확률 함수를 추정
    
        - 로지스틱 회귀 (Logistic Regression)
        - 의사결정나무 (Descision Tree)
        - KNN (K Nearest Neighbor)
        
        
- 판별함수 모형 : 경계면을 찾아서 데이터가 어느 위치에 있는지 계산

    - 퍼셉트론 (Perceptron)
    - 서포트 벡터 머신 (Support Vector Machine)
    - 신경망 (Neural Network)  
    
    
- 모형결합 (Ensemble) : 복수의 예측모형을 결합하여 더 나은 성능을 예측하려는 시도

    - 취합 방법론 : 사용할 모형의 집합이 이미 결정되어 있음
        
        - 다수결 (Majority voting)
        - 배깅 (Bagging)
        - 랜덤 포레스트 (Random Forest)
        
    - 부스팅 방법론 : 사용할 모형을 점진적으로 늘림
    
        - 에이다 부스트 (AdaBoost)
        - 그레디언트 부스트 (Gradient Boost)

---

#### 2.1 조건부 확률모형

##### 2.1.1 조건부 확률기반 생성 모형

In [9]:
# LDA (linear discriminant analysis)
model = LinearDiscriminantAnalysis(n_components=3, solver="svd", 
        store_covariance=True).fit(X_train, y_train)
predict_proba = model.predict_proba(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[261  17   3]
 [ 48 211  22]
 [135  46  17]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.59      0.93      0.72       281
        Win       0.77      0.75      0.76       281
       Draw       0.40      0.09      0.14       198

avg / total       0.61      0.64      0.58       760



In [10]:
# QDA (Quadratic Discriminanat Analysis)
model = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
predict_proba = model.predict_proba(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[246  19  16]
 [ 42 212  27]
 [119  43  36]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.60      0.88      0.72       281
        Win       0.77      0.75      0.76       281
       Draw       0.46      0.18      0.26       198

avg / total       0.63      0.65      0.61       760



In [11]:
# Naive bayesian - Multinomial
model = MultinomialNB().fit(X_train, y_train)
predict_proba = model.predict_proba(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[205  29  47]
 [ 53 176  52]
 [ 93  52  53]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.58      0.73      0.65       281
        Win       0.68      0.63      0.65       281
       Draw       0.35      0.27      0.30       198

avg / total       0.56      0.57      0.56       760



---

##### 2.1.2 조건부 확률기반 판별모형

In [12]:
# Logistic Regression : 사용 X (종속변수가 이항분포를 따라야함)

In [13]:
# Descision Tree
model = DecisionTreeClassifier(criterion='entropy', 
        max_depth=7, min_samples_leaf=5).fit(X_train, y_train)
predict_proba = model.predict_proba(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[226  24  31]
 [ 32 214  35]
 [114  50  34]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.61      0.80      0.69       281
        Win       0.74      0.76      0.75       281
       Draw       0.34      0.17      0.23       198

avg / total       0.59      0.62      0.59       760



In [14]:
# KNN (K Nearest Neighbor)

---

#### 2.2 모형결합 (Ensemble)

##### 2.2.1 취합 방법론

In [15]:
# 다수결 (Majority voting)

# 취합할 모델 생성
model1 = LinearDiscriminantAnalysis(n_components=3, solver="svd", store_covariance=True)
model2 = QuadraticDiscriminantAnalysis()
model3 = GaussianNB()
model4 = MultinomialNB()

# ensemble 생성
ensemble = VotingClassifier(estimators=[('lda', model1), ('qda', model2), ('gnb', model3), ('mul', model4)], 
                            voting='soft', weights=[1, 1, 1, 1])

predict_proba = [c.fit(X_train, y_train).predict_proba(X_test) for c in (model1, model2, model3, model4, ensemble)]

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[4][i])) # ensemble index

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[252  19  10]
 [ 47 204  30]
 [127  47  24]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.59      0.90      0.71       281
        Win       0.76      0.73      0.74       281
       Draw       0.38      0.12      0.18       198

avg / total       0.60      0.63      0.59       760



In [16]:
# 배깅 (Bagging)
model1 = DecisionTreeClassifier().fit(X_train, y_train)
model2 = BaggingClassifier(DecisionTreeClassifier(), bootstrap_features=True, random_state=0).fit(X_train, y_train)
predict_proba = model2.predict_proba(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[207  39  35]
 [ 43 212  26]
 [113  54  31]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.57      0.74      0.64       281
        Win       0.70      0.75      0.72       281
       Draw       0.34      0.16      0.21       198

avg / total       0.56      0.59      0.56       760



In [17]:
# 랜덤포레스트 (RandomForest)
clf = RandomForestClassifier(n_estimators=1000, max_depth=10, min_samples_split = 10, criterion = 'entropy')
model = clf.fit(X_train, y_train)
predict_proba = model.predict_proba(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[245  26  10]
 [ 35 232  14]
 [119  60  19]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.61      0.87      0.72       281
        Win       0.73      0.83      0.77       281
       Draw       0.44      0.10      0.16       198

avg / total       0.61      0.65      0.59       760



---

##### 2.2.2 부스팅 방법론

In [18]:
# 에이다 부스트 (Ada Boost)
model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5, random_state=0), 
                               algorithm="SAMME", n_estimators=100).fit(X_train, y_train)
predict_proba = model.predict_proba(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[200  29  52]
 [ 23 228  30]
 [ 88  66  44]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.64      0.71      0.68       281
        Win       0.71      0.81      0.75       281
       Draw       0.35      0.22      0.27       198

avg / total       0.59      0.62      0.60       760



In [19]:
# 그레디언트 부스트 (Gradient Boost)
model = GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=0).fit(X_train, y_train)
predict_proba = model.predict_proba(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[246  25  10]
 [ 43 227  11]
 [118  57  23]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.60      0.88      0.72       281
        Win       0.73      0.81      0.77       281
       Draw       0.52      0.12      0.19       198

avg / total       0.63      0.65      0.60       760



In [20]:
# XG boost
model = xgboost.XGBClassifier(n_estimators=100, max_depth=2).fit(X_train, y_train)
predict_proba = model.predict_proba(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[252  26   3]
 [ 42 236   3]
 [126  61  11]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.60      0.90      0.72       281
        Win       0.73      0.84      0.78       281
       Draw       0.65      0.06      0.10       198

avg / total       0.66      0.66      0.58       760



---

#### 2.3 판별함수 모형

In [21]:
# 퍼셉트론 (Perceptron) - perceptron
model = Perceptron(max_iter=500, eta0=0.1, random_state=1).fit(X_train, y_train)
predict_proba = model.predict(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[281   0   0]
 [281   0   0]
 [198   0   0]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.37      1.00      0.54       281
        Win       0.00      0.00      0.00       281
       Draw       0.00      0.00      0.00       198

avg / total       0.14      0.37      0.20       760



  'precision', 'predicted', average, warn_for)


In [22]:
# 퍼셉트론 (Perceptron) - SGD
model = SGDClassifier(loss="hinge", max_iter=3, random_state=1).fit(X_train, y_train)
predict_proba = model.predict(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[281   0   0]
 [281   0   0]
 [198   0   0]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.37      1.00      0.54       281
        Win       0.00      0.00      0.00       281
       Draw       0.00      0.00      0.00       198

avg / total       0.14      0.37      0.20       760



  'precision', 'predicted', average, warn_for)


In [23]:
# 서포트 벡터 머신 (Support Vector Machine) - linear
model = SVC(kernel='linear').fit(X_train, y_train)
predict_proba = model.predict(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[281   0   0]
 [281   0   0]
 [198   0   0]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.37      1.00      0.54       281
        Win       0.00      0.00      0.00       281
       Draw       0.00      0.00      0.00       198

avg / total       0.14      0.37      0.20       760



  'precision', 'predicted', average, warn_for)


In [24]:
# 서포트 벡터 머신 (Support Vector Machine) - 다항 커널 (Polynomial Kernel)
model = SVC(kernel="poly", degree=2, gamma=1, coef0=0).fit(X_train, y_train)
predict_proba = model.predict(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[281   0   0]
 [281   0   0]
 [198   0   0]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.37      1.00      0.54       281
        Win       0.00      0.00      0.00       281
       Draw       0.00      0.00      0.00       198

avg / total       0.14      0.37      0.20       760



  'precision', 'predicted', average, warn_for)


In [25]:
# 서포트 벡터 머신 (Support Vector Machine) - RBF(Radial Basis Function)
model = SVC(kernel="rbf").fit(X_train, y_train)
predict_proba = model.predict(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[281   0   0]
 [281   0   0]
 [198   0   0]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.37      1.00      0.54       281
        Win       0.00      0.00      0.00       281
       Draw       0.00      0.00      0.00       198

avg / total       0.14      0.37      0.20       760



  'precision', 'predicted', average, warn_for)


In [26]:
# 서포트 벡터 머신 (Support Vector Machine) - 시그모이드 커널 (Sigmoid Kernel)
model = SVC(kernel="sigmoid", gamma=2, coef0=2).fit(X_train, y_train)
predict_proba = model.predict(X_test)

# comparison
y_true = test['Result']
y_pred = []

for i in range(760) :
    y_pred.append(np.argmax(predict_proba[i]))

target_names = ['Lose', 'Win', 'Draw']
print('Confusion Matrix : \n\n',confusion_matrix(y_true, y_pred))
print('\n\n Classification Report : \n\n', classification_report(y_true, y_pred, target_names=target_names))

Confusion Matrix : 

 [[281   0   0]
 [281   0   0]
 [198   0   0]]


 Classification Report : 

              precision    recall  f1-score   support

       Lose       0.37      1.00      0.54       281
        Win       0.00      0.00      0.00       281
       Draw       0.00      0.00      0.00       198

avg / total       0.14      0.37      0.20       760



  'precision', 'predicted', average, warn_for)


In [27]:
# 신경망 (Neural Network)