In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the pre-split training and test tables (in that order) from the datasets folder.
df_train_final, df_test_final = map(
    pd.read_csv,
    ('../datasets/train.csv', '../datasets/test.csv'),
)

In [None]:
# Display the training DataFrame as this cell's output.
df_train_final

In [None]:
# List the distinct values found in the training set's 'WinningTeam' column.
print(df_train_final['WinningTeam'].unique())


In [None]:
# Display the test DataFrame as this cell's output.
df_test_final

In [None]:
# List the distinct values found in the test set's 'WinningTeam' column.
print(df_test_final['WinningTeam'].unique())

In [None]:
# Add a boolean 'PenaltyOnWinner' column to both splits: True where the value in
# 'PenaltyTeam' equals the value in 'Winner/tie' on the same row.
# A vectorised column comparison is used instead of a row-wise DataFrame.apply:
# it yields the same booleans (missing values still compare as unequal -> False)
# without calling a Python lambda once per row.
# NOTE(review): 'Winner/tie' looks like a post-game outcome column; confirm that
# deriving a model input from it does not leak the label.
df_train_final['PenaltyOnWinner'] = df_train_final['Winner/tie'].eq(df_train_final['PenaltyTeam'])
df_test_final['PenaltyOnWinner'] = df_test_final['Winner/tie'].eq(df_test_final['PenaltyTeam'])

In [None]:
# Candidate input columns for the PCA and the feature ranking built on top of it.
# The target column 'Winner' is intentionally NOT part of this list: it is the
# label (see y_train_pca / y_test below), and keeping it among the inputs would
# let it be ranked and picked as a feature for the downstream classifier, i.e.
# the model would be trained on its own answer (target leakage).
feature_list_pca = [
    'Down', 'ToGo', 'YardLine', 'SeriesFirstDown', 'Yards',
    'IsRush', 'IsPass', 'IsIncomplete', 'IsTouchdown', 'IsSack',
    'IsChallenge', 'IsChallengeReversed', 'IsInterception', 'IsFumble',
    'IsPenalty', 'IsTwoPointConversion', 'IsTwoPointConversionSuccessful',
    'IsPenaltyAccepted', 'PenaltyOnWinner',
]

# Feature matrices and label vectors for the train and test splits.
X_train_pca = df_train_final[feature_list_pca]
y_train_pca = df_train_final['Winner']
X_test = df_test_final[feature_list_pca]
y_test = df_test_final['Winner']

# Principal component analysis (PCA)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [None]:
# Standardise the features. The scaler is fitted on the training matrix only and
# the same fitted scaler is then applied to both the training and test matrices.
ss = StandardScaler().fit(X_train_pca)
X_train_scaled = ss.transform(X_train_pca)
X_test_scaled = ss.transform(X_test)

In [None]:
# Fit a PCA on the scaled training data, keeping as many components as there are
# entries in feature_list_pca.
pca = PCA(n_components=len(feature_list_pca)).fit(X_train_scaled)

# Plot the running total of the explained-variance ratios.
# The fitted estimator is `pca`; the previous reference to `pca1` was an
# undefined name and raised NameError on a fresh kernel.
# The x position k corresponds to the (k + 1)-th principal component.
plt.plot(pca.explained_variance_ratio_.cumsum(), lw=3, color='#087E8B')
plt.title('Cumulative explained variance by number of principal components', size=20)
plt.xlabel('Principal component index (0-based)')
plt.ylabel('Cumulative explained variance ratio')
plt.xticks(range(0,21,2))
plt.show()

In [None]:
# Loadings table: one row per input feature, one column per principal component
# ('PC1', 'PC2', ...). Each entry is the component weight for that feature scaled
# by the square root of that component's explained variance.
loadings = pd.DataFrame(
    pca.components_.T * np.sqrt(pca.explained_variance_),
    index=X_train_pca.columns,
    columns=['PC' + str(k + 1) for k in range(len(X_train_pca.columns))],
)
loadings.head()

In [None]:
# Take the PC1 column of the loadings table, order the features from the lowest
# to the highest loading, and move the feature names out of the index.
pc1_loadings = (
    loadings[['PC1']]
    .sort_values(by='PC1', ascending=True)
    .reset_index()
)
pc1_loadings.columns = ['Attribute', 'CorrelationWithPC1']

# Bar chart of the PC1 loading of every feature, in the sorted order above.
plt.bar(
    pc1_loadings['Attribute'],
    pc1_loadings['CorrelationWithPC1'],
    color='#087E8B',
)
plt.title('PCA loading scores (first principal component)', size=20)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Take the PC2 column of the loadings table, order the features from the lowest
# to the highest loading, and move the feature names out of the index.
pc2_loadings = (
    loadings[['PC2']]
    .sort_values(by='PC2', ascending=True)
    .reset_index()
)
pc2_loadings.columns = ['Attribute', 'CorrelationWithPC2']

# Bar chart of the PC2 loading of every feature, in the sorted order above.
plt.bar(
    pc2_loadings['Attribute'],
    pc2_loadings['CorrelationWithPC2'],
    color='#087E8B',
)
plt.title('PCA loading scores (second principal component)', size=20)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Per-feature importance score: the sum of the absolute loadings of that feature
# across all principal components, sorted from smallest to largest.
# (Despite the variable name this is a plain sum, not a cumulative sum.)
# Computed with vectorised DataFrame operations instead of a row-wise apply with
# a Python lambda; values can differ from the old form only by floating-point
# rounding.
loading_abscumsum = loadings.abs().sum(axis=1).sort_values(ascending=True)

In [None]:
# Horizontal bar chart of the summed absolute loading of each feature.
loading_abscumsum.plot(kind='barh')

In [None]:
# Keep only the features whose summed absolute loading is strictly greater than
# the median (0.5 quantile) of those sums, then show the resulting list.
feature_list_dtc = loading_abscumsum.loc[
    lambda totals: totals > totals.quantile(q=0.5)
].index.tolist()
feature_list_dtc

# Decision tree classifier (DTC)

In [None]:
from sklearn.tree import DecisionTreeClassifier as dtc
# from sklearn.model_selection import train_test_split as tts
from sklearn import tree
from sklearn.metrics import accuracy_score

In [None]:
# Build the decision-tree inputs (the columns named in feature_list_dtc) and the
# 'Winner' label for the training split, then for the test split.
X_train_dtc, y_train_dtc = df_train_final[feature_list_dtc], df_train_final['Winner']
X_test_dtc, y_test_dtc = df_test_final[feature_list_dtc], df_test_final['Winner']

In [None]:
# Train a decision tree that splits on information gain (entropy criterion).
# random_state is pinned so that a Restart & Run All produces the same tree:
# without it, the estimator's internal randomness can resolve equally good
# splits differently from one run to the next.
clf = dtc(criterion='entropy', random_state=42)
clf.fit(X_train_dtc, y_train_dtc)

In [None]:
# Draw the fitted decision tree, labelling split nodes with the selected feature names.
# NOTE(review): plot_tree expects class_names in ascending order of the class
# values in clf.classes_. If 'Winner' is encoded so that the smaller value means
# "not the winner", these two labels are swapped. Confirm against clf.classes_.
tree.plot_tree(clf, feature_names=feature_list_dtc, class_names=['Winner','Loser'])

In [ ]:
# Score the tree on the held-out split.
# The classifier was fitted on the feature_list_dtc columns, so it must be given
# X_test_dtc; X_test holds the larger PCA column set and does not match what the
# tree was trained on. The matching labels are y_test_dtc.
pred = clf.predict(X_test_dtc)
# accuracy_score takes the true labels first, then the predictions.
accuracy = accuracy_score(y_test_dtc, pred)
accuracy