In [None]:
# import necessary libraries
import pandas as pd
import pandas as pd
import numpy as np
import ast
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Read the team-level metrics table from the CSV that sits next to the notebook.
teamAllMetrics = pd.read_csv(filepath_or_buffer='./teamAllMetrics.csv')

# Preview the first five rows to check which columns were loaded.
teamAllMetrics.head(n=5)

### Analysis: does earning more gold always lead to winning? Each team's gold is first normalized by the total gold earned in its match

In [None]:
# Team-level metric -> name of the corresponding per-match total.
_match_total_names = {
    'totalTeamGold': 'matchTotalGold',
    'totalTeamKills': 'matchTotalKills',
    'totalTeamChampExperience': 'matchTotalChampExperience',
    'totalTeamMinionsKilled': 'matchTotalMinionsKilled',
}

# Sum each metric over all rows that share a gameId, then rename the sums to the
# match-level column names (chained rename instead of inplace=True).
matchTotalMetrics = (
    teamAllMetrics
    .groupby(['gameId'])[list(_match_total_names)]
    .sum()
    .reset_index()
    .rename(columns=_match_total_names)
)

# Drop match totals left behind by an earlier run of this cell before merging. Without this,
# re-running the cell merges the totals a second time, pandas suffixes the clashing columns
# (matchTotalGold_x / matchTotalGold_y, ...) and the share computations that follow fail.
teamAllMetrics = (
    teamAllMetrics
    .drop(columns=list(_match_total_names.values()), errors='ignore')
    .merge(matchTotalMetrics, on=['gameId'], how='inner')
)
print(teamAllMetrics.shape)

In [None]:
# Each share is the team's value divided by the match-wide total of the same metric.
# Mapping: new share column -> (team-level column, match-level total column).
_share_definitions = {
    'teamMatchGoldShare': ('totalTeamGold', 'matchTotalGold'),
    'teamMatchExpShare': ('totalTeamChampExperience', 'matchTotalChampExperience'),
    'teamMatchKillsShare': ('totalTeamKills', 'matchTotalKills'),
    'teamMatchMinionsShare': ('totalTeamMinionsKilled', 'matchTotalMinionsKilled'),
}

for _share_col, (_team_col, _match_col) in _share_definitions.items():
    teamAllMetrics[_share_col] = teamAllMetrics[_team_col] / teamAllMetrics[_match_col]

In [None]:
# Show the gold share next to the match outcome for every team row.
teamAllMetrics.loc[:, ['gameId', 'teamId', 'teamMatchGoldShare', 'win']]

### In only 95 games does the team with the lower gold share win

In [None]:
# Rows selected here are teams that held more than half of the match gold and still have
# win == 0. Each such row presumably marks a match won by the lower-gold opponent, which is
# what the variable name refers to (assumes two teams per match -- TODO confirm).
_higher_gold_lost = (teamAllMetrics['teamMatchGoldShare'] > 0.5) & (teamAllMetrics['win'] == 0)
lesserGoldWinningTeams = teamAllMetrics[_higher_gold_lost]

# A notebook cell renders only its last expression, so the summary statistics are printed
# explicitly; as a bare mid-cell expression the describe() result was silently discarded.
print(lesserGoldWinningTeams.describe())

lesserGoldWinningTeams.shape

### Compare match outcomes for both teams in the matches won by the lower-gold team

In [None]:
# Ids of the matches flagged in the previous cell.
lesserGoldWinningTeamsGameIds = lesserGoldWinningTeams['gameId']

# Pull every team row that belongs to one of those matches, so both sides can be compared.
lesserGoldWinningTeamMatches = teamAllMetrics.loc[
    teamAllMetrics['gameId'].isin(lesserGoldWinningTeamsGameIds)
]
print(lesserGoldWinningTeamMatches.head(10))

# Narrow the view to the outcome, the share metrics and the graph metrics.
subsetDF = lesserGoldWinningTeamMatches.loc[:, [
    'gameId', 'teamId', 'win',
    'teamMatchExpShare', 'teamMatchGoldShare', 'teamMatchKillsShare',
    'resistance', 'teamIndegreeCentrality', 'teamOutdegreeCentrality',
]]
subsetDF.head(10)

### Check for Multi-Collinearity Between Variables - Using Variance Inflation Factor

In [None]:
# Kill rate: team kills divided by the gameDuration column, row by row.
teamAllMetrics['totalTeamKillsPerMin'] = teamAllMetrics['totalTeamKills'].div(teamAllMetrics['gameDuration'])

In [None]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Predictors whose collinearity is checked with VIF, in the order they enter the formula.
_vif_predictors = [
    'teamAverageRank', 'totalTeamKillsPerMin', 'totalTeamAllAssistsPerMin',
    'totalTeamEpicMonsterKills', 'totalTeamTurretKills',
    'teamMatchGoldShare', 'goldPerMin', 'totalTeamChampExpPerMin',
    'teamMatchMinionsShare', 'totalTeamVisionPerMin',
    'resistance', 'teamIndegreeCentrality', 'teamOutdegreeCentrality',
]

# Modelling frame: identifiers, the outcome, the VIF predictors, and
# teamWeightCentralization (kept in teamDF but not part of the VIF formula below).
teamDF = teamAllMetrics[
    ['gameId', 'teamId', 'win'] + _vif_predictors + ['teamWeightCentralization']
]

# Build the patsy design matrices; X is the design matrix handed to the VIF computation.
y, X = dmatrices('win~' + '+'.join(_vif_predictors), data=teamDF, return_type='dataframe')

In [None]:
# One VIF per column of the design matrix X (every column of X is included).
vif_df = pd.DataFrame({
    'variable': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
})
vif_df

### Check for multi-collinearity using the Pearson correlation coefficient

In [None]:
from scipy import stats

# Correlation matrix over all candidate predictors (computed before any column is dropped).
teamDF_features = teamDF[['teamAverageRank', 'totalTeamKillsPerMin', 'totalTeamAllAssistsPerMin', 'totalTeamEpicMonsterKills', 'totalTeamTurretKills',
                        'teamMatchGoldShare', 'goldPerMin', 'totalTeamChampExpPerMin', 'teamMatchMinionsShare', 'totalTeamVisionPerMin',
                        'resistance', 'teamIndegreeCentrality',
                        'teamOutdegreeCentrality', 'teamWeightCentralization'
                    ]]

correlation_matrix = teamDF_features.corr()


def correlation_test(x, y):
    """Run scipy.stats.pearsonr on two samples and return its result unchanged.

    The result is indexed below with [1] to obtain the p-value.
    """
    return stats.pearsonr(x, y)


# Pairwise p-values. The frame starts as float NaN, so the diagonal needs no special value
# and the table has a numeric dtype instead of object.
p_values = pd.DataFrame(np.nan, index=teamDF_features.columns, columns=teamDF_features.columns, dtype=float)

for col1 in teamDF_features.columns:
    for col2 in teamDF_features.columns:
        if col1 == col2:
            continue  # diagonal stays NaN
        corr_test = correlation_test(teamDF_features[col1], teamDF_features[col2])
        # Single .loc assignment (row col2, column col1). The chained form
        # p_values[col1][col2] = ... writes to a temporary under pandas copy-on-write
        # and leaves p_values untouched.
        p_values.loc[col2, col1] = corr_test[1]  # Extract the p-value

# print(p_values)

# Remove the predictors excluded from the later analysis. Rebinding the name instead of
# dropping in place avoids mutating a slice of teamDF (SettingWithCopyWarning).
teamDF_features = teamDF_features.drop(columns=['goldPerMin', 'teamMatchGoldShare', 'teamWeightCentralization'])

print("Correlation Matrix:")
correlation_matrix

## Fitting and prediction using in-game + graph metrics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Split at the game level: game ids are divided 80/20 and each team row follows its game.
game_ids = teamDF['gameId'].unique()
train_games, test_games = train_test_split(game_ids, test_size=0.2, random_state=245)

train_df = teamDF.loc[teamDF['gameId'].isin(train_games)]
test_df = teamDF.loc[teamDF['gameId'].isin(test_games)]

# In-game and graph predictors used by the models of this section.
_all_features = [
    'teamAverageRank', 'totalTeamKillsPerMin', 'totalTeamAllAssistsPerMin',
    'totalTeamEpicMonsterKills', 'totalTeamTurretKills', 'totalTeamChampExpPerMin',
    'teamMatchMinionsShare', 'totalTeamVisionPerMin',
    'resistance', 'teamIndegreeCentrality', 'teamOutdegreeCentrality',
]

X_train, y_train = train_df[_all_features], train_df['win']
X_test, y_test = test_df[_all_features], test_df['win']

features_names = X_train.columns

### Binary Logistic Regression

In [None]:
# Standardise the predictors so results can be compared with and without scaling.
# The scaler is fitted on the training rows only and then applied to both partitions.
# NOTE: the model cells that follow fit on X_train / predict on X_test, so these scaled
# matrices are only used if they are swapped in by hand.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Binary logistic regression on the unscaled training features
# (pass X_train_scaled / X_test_scaled instead to evaluate the scaled variant).
model = LogisticRegression(solver='lbfgs', max_iter=400).fit(X_train, y_train)

# Pair each feature name with its fitted coefficient (first row of coef_).
features_names = X_train.columns
coefficients = model.coef_
importances = coefficients[0]
blr_feature_importances = {name: weight for name, weight in zip(features_names, importances)}

# Predictions on the held-out games, scored in a later cell.
y_pred = model.predict(X_test)

In [None]:
# Refit the same predictors with statsmodels, whose summary reports coefficient p-values.
import statsmodels.api as sm

# Add the constant intercept column in front of the predictors.
# NOTE: this rebinds X, which earlier held the patsy design matrix used for the VIF table.
X = sm.add_constant(X_train, prepend=True)
logit_model = sm.Logit(endog=y_train, exog=X).fit()
print(logit_model.summary())

In [None]:
# Score the logistic-regression predictions: accuracy, confusion matrix and the
# per-class classification report, all against the held-out labels.
accuracy, conf_matrix, class_report = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
)

print(blr_feature_importances)
print('Accuracy:', accuracy)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn permutes the candidate features at every split, so an unseeded
# tree can change between runs and the metrics printed below would not be reproducible.
dtmodel = DecisionTreeClassifier(random_state=245)
dtmodel.fit(X_train, y_train)

# Impurity-based importance of each feature, keyed by feature name.
dt_importances = dtmodel.feature_importances_
feature_importances = dict(zip(features_names, dt_importances))

dt_y_pred = dtmodel.predict(X_test)

# Accuracy, confusion matrix and classification report on the held-out games.
accuracy = accuracy_score(y_test, dt_y_pred)
conf_matrix = confusion_matrix(y_test, dt_y_pred)
class_report = classification_report(y_test, dt_y_pred)

print(feature_importances)
print('Accuracy:', accuracy)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: bootstrap sampling and per-split feature sub-sampling are random, so
# without random_state every run reports different importances and metrics.
rfmodel = RandomForestClassifier(n_estimators=16, max_depth=8, bootstrap=True, random_state=245)
rfmodel.fit(X_train, y_train)

# Importance of each feature, keyed by feature name.
rf_importances = rfmodel.feature_importances_
rf_feature_importances = dict(zip(features_names, rf_importances))
rf_y_pred = rfmodel.predict(X_test)

# Accuracy, confusion matrix and classification report on the held-out games.
accuracy = accuracy_score(y_test, rf_y_pred)
conf_matrix = confusion_matrix(y_test, rf_y_pred)
class_report = classification_report(y_test, rf_y_pred)

print(rf_feature_importances)
print('RF Accuracy:', accuracy)
print('RF Confusion Matrix:\n', conf_matrix)
print('RF Classification Report:\n', class_report)

### XGBoost Classifier

In [None]:
import xgboost as xgb

# Gradient-boosted trees: 100 boosting rounds at learning rate 0.05, log-loss evaluation.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    objective='binary:logistic',
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)

# Importance of each feature, keyed by feature name.
xgb_importances = xgb_model.feature_importances_
xgb_feature_importances = {name: score for name, score in zip(features_names, xgb_importances)}

xgb_y_pred = xgb_model.predict(X_test)

# Accuracy, confusion matrix and classification report on the held-out games.
accuracy, conf_matrix, class_report = (
    metric(y_test, xgb_y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

print(xgb_feature_importances)
print('XGB Accuracy:', accuracy)
print('XGB Confusion Matrix:\n', conf_matrix)
print('XGB Classification Report:\n', class_report)

## Fitting and prediction using graph metrics only

In [None]:
# Split at the game level: game ids are divided 80/20 and each team row follows its game.
# NOTE(review): this section seeds the split with 25 while the other sections use 245, so
# the held-out games differ between sections -- confirm this is intentional.
game_ids = teamDF['gameId'].unique()
train_games, test_games = train_test_split(game_ids, test_size=0.2, random_state=25)

train_df = teamDF.loc[teamDF['gameId'].isin(train_games)]
test_df = teamDF.loc[teamDF['gameId'].isin(test_games)]

# Graph-derived predictors only.
_graph_features = ['resistance', 'teamIndegreeCentrality', 'teamOutdegreeCentrality']

X_train, y_train = train_df[_graph_features], train_df['win']
X_test, y_test = test_df[_graph_features], test_df['win']

features_names = X_train.columns

### Binary Logistic Regression

In [None]:
# Binary logistic regression on the unscaled graph features.
# NOTE: X_train_scaled is only computed in the first modelling section, for that section's
# feature set; it would have to be recomputed before it could be used for this one.
model = LogisticRegression(solver='lbfgs', max_iter=400).fit(X_train, y_train)

# Pair each feature name with its fitted coefficient (first row of coef_).
features_names = X_train.columns
coefficients = model.coef_
importances = coefficients[0]
blr_feature_importances = {name: weight for name, weight in zip(features_names, importances)}

# Predictions on the held-out games, scored in a later cell.
y_pred = model.predict(X_test)

In [None]:
# Refit the graph-only predictors with statsmodels to read off coefficient p-values.
import statsmodels.api as sm

# Add the constant intercept column in front of the predictors.
X = sm.add_constant(X_train, prepend=True)
logit_model = sm.Logit(endog=y_train, exog=X).fit()
print(logit_model.summary())

In [None]:
# Score the logistic-regression predictions against the held-out labels.
accuracy, conf_matrix, class_report = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

print(blr_feature_importances)
print('Accuracy:', accuracy)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn permutes the candidate features at every split, so an unseeded
# tree can change between runs and the metrics printed below would not be reproducible.
dtmodel = DecisionTreeClassifier(random_state=245)
dtmodel.fit(X_train, y_train)

# Impurity-based importance of each feature, keyed by feature name.
dt_importances = dtmodel.feature_importances_
feature_importances = dict(zip(features_names, dt_importances))

dt_y_pred = dtmodel.predict(X_test)

# Accuracy, confusion matrix and classification report on the held-out games.
accuracy = accuracy_score(y_test, dt_y_pred)
conf_matrix = confusion_matrix(y_test, dt_y_pred)
class_report = classification_report(y_test, dt_y_pred)

print(feature_importances)
print('Accuracy:', accuracy)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: bootstrap sampling and per-split feature sub-sampling are random, so
# without random_state every run reports different importances and metrics.
rfmodel = RandomForestClassifier(n_estimators=16, max_depth=8, bootstrap=True, random_state=245)
rfmodel.fit(X_train, y_train)

# Importance of each feature, keyed by feature name.
rf_importances = rfmodel.feature_importances_
rf_feature_importances = dict(zip(features_names, rf_importances))
rf_y_pred = rfmodel.predict(X_test)

# Accuracy, confusion matrix and classification report on the held-out games.
accuracy = accuracy_score(y_test, rf_y_pred)
conf_matrix = confusion_matrix(y_test, rf_y_pred)
class_report = classification_report(y_test, rf_y_pred)

print(rf_feature_importances)
print('RF Accuracy:', accuracy)
print('RF Confusion Matrix:\n', conf_matrix)
print('RF Classification Report:\n', class_report)

### XGBoost Classifier

In [None]:
import xgboost as xgb

# Gradient-boosted trees: 100 boosting rounds at learning rate 0.05, log-loss evaluation.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    objective='binary:logistic',
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)

# Importance of each feature, keyed by feature name.
xgb_importances = xgb_model.feature_importances_
xgb_feature_importances = {name: score for name, score in zip(features_names, xgb_importances)}

xgb_y_pred = xgb_model.predict(X_test)

# Accuracy, confusion matrix and classification report on the held-out games.
accuracy, conf_matrix, class_report = (
    metric(y_test, xgb_y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

print(xgb_feature_importances)
print('XGB Accuracy:', accuracy)
print('XGB Confusion Matrix:\n', conf_matrix)
print('XGB Classification Report:\n', class_report)

## Fitting and Prediction with In-Game Metrics only

In [None]:
# Split at the game level: game ids are divided 80/20 and each team row follows its game.
game_ids = teamDF['gameId'].unique()
train_games, test_games = train_test_split(game_ids, test_size=0.2, random_state=245)

train_df = teamDF.loc[teamDF['gameId'].isin(train_games)]
test_df = teamDF.loc[teamDF['gameId'].isin(test_games)]

# In-game predictors only (no graph metrics).
_ingame_features = [
    'teamAverageRank', 'totalTeamKillsPerMin', 'totalTeamAllAssistsPerMin',
    'totalTeamEpicMonsterKills', 'totalTeamTurretKills', 'totalTeamChampExpPerMin',
    'teamMatchMinionsShare', 'totalTeamVisionPerMin',
]

X_train, y_train = train_df[_ingame_features], train_df['win']
X_test, y_test = test_df[_ingame_features], test_df['win']

features_names = X_train.columns

### Binary Logistic Regression

In [None]:
# Binary logistic regression on the unscaled in-game features.
# NOTE: X_train_scaled is only computed in the first modelling section, for that section's
# feature set; it would have to be recomputed before it could be used for this one.
model = LogisticRegression(solver='lbfgs', max_iter=400).fit(X_train, y_train)

# Pair each feature name with its fitted coefficient (first row of coef_).
features_names = X_train.columns
coefficients = model.coef_
importances = coefficients[0]
blr_feature_importances = {name: weight for name, weight in zip(features_names, importances)}

# Predictions on the held-out games, scored in a later cell.
y_pred = model.predict(X_test)

In [None]:
# Refit the in-game predictors with statsmodels to read off coefficient p-values.
import statsmodels.api as sm

# Add the constant intercept column in front of the predictors.
X = sm.add_constant(X_train, prepend=True)
logit_model = sm.Logit(endog=y_train, exog=X).fit()
print(logit_model.summary())

In [None]:
# Score the logistic-regression predictions against the held-out labels.
accuracy, conf_matrix, class_report = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
)

print(blr_feature_importances)
print('Accuracy:', accuracy)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn permutes the candidate features at every split, so an unseeded
# tree can change between runs and the metrics printed below would not be reproducible.
dtmodel = DecisionTreeClassifier(random_state=245)
dtmodel.fit(X_train, y_train)

# Impurity-based importance of each feature, keyed by feature name.
dt_importances = dtmodel.feature_importances_
feature_importances = dict(zip(features_names, dt_importances))

dt_y_pred = dtmodel.predict(X_test)

# Accuracy, confusion matrix and classification report on the held-out games.
accuracy = accuracy_score(y_test, dt_y_pred)
conf_matrix = confusion_matrix(y_test, dt_y_pred)
class_report = classification_report(y_test, dt_y_pred)

print(feature_importances)
print('Accuracy:', accuracy)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

### Random Forest Classifier

In [None]:
# Seed the forest: bootstrap sampling and per-split feature sub-sampling are random, so
# without random_state every run reports different importances and metrics.
rfmodel = RandomForestClassifier(n_estimators=16, max_depth=8, bootstrap=True, random_state=245)
rfmodel.fit(X_train, y_train)

# Importance of each feature, keyed by feature name.
rf_importances = rfmodel.feature_importances_
rf_feature_importances = dict(zip(features_names, rf_importances))
rf_y_pred = rfmodel.predict(X_test)

# Accuracy, confusion matrix and classification report on the held-out games.
accuracy = accuracy_score(y_test, rf_y_pred)
conf_matrix = confusion_matrix(y_test, rf_y_pred)
class_report = classification_report(y_test, rf_y_pred)

print(rf_feature_importances)
print('RF Accuracy:', accuracy)
print('RF Confusion Matrix:\n', conf_matrix)
print('RF Classification Report:\n', class_report)

### XGBoost Classifier

In [None]:
# Gradient-boosted trees: 100 boosting rounds at learning rate 0.05, log-loss evaluation.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    objective='binary:logistic',
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)

# Importance of each feature, keyed by feature name.
xgb_importances = xgb_model.feature_importances_
xgb_feature_importances = {name: score for name, score in zip(features_names, xgb_importances)}

xgb_y_pred = xgb_model.predict(X_test)

# Accuracy, confusion matrix and classification report on the held-out games.
accuracy, conf_matrix, class_report = (
    metric(y_test, xgb_y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

print(xgb_feature_importances)
print('XGB Accuracy:', accuracy)
print('XGB Confusion Matrix:\n', conf_matrix)
print('XGB Classification Report:\n', class_report)

## Dimensionality Reduction

### Scree Analysis

In [None]:
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Standardise every retained feature before PCA.
scaler = StandardScaler()
teamDF_features_scaled = scaler.fit_transform(teamDF_features)

# Fit PCA with all components; the scree plot below uses explained_variance_.
pca = PCA()
pca.fit(teamDF_features_scaled)

# Scree plot: eigenvalue (explained variance) against the 1-based component number.
_eigenvalues = pca.explained_variance_
_component_numbers = np.arange(1, len(_eigenvalues) + 1)

plt.figure(figsize=(8,6))
plt.plot(_component_numbers, _eigenvalues, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Eigenvalue')
plt.title('Scree Plot')
plt.grid()
plt.show()

### Factor Analysis; scree plot suggests number of latent components to be 3

In [None]:
# Fit a 3-factor model on the standardised features (adjust the count to the scree plot).
fa = FactorAnalysis(n_components=3)
fa.fit(teamDF_features_scaled)

# Transpose components_ so that each row of the loadings corresponds to one feature.
factor_loadings = fa.components_.T

# Loadings table: features as rows, Factor1..FactorK as columns.
loadings_df = pd.DataFrame(
    data=factor_loadings,
    index=teamDF_features.columns,
    columns=[f'Factor{k}' for k in range(1, factor_loadings.shape[1] + 1)],
)

# Factor scores: the fitted model applied to every row of the scaled features.
factor_scores = fa.transform(teamDF_features_scaled)
factor_scores_df = pd.DataFrame(
    data=factor_scores,
    columns=[f'Factor{k}' for k in range(1, factor_scores.shape[1] + 1)],
)

In [None]:
# Display the factor loadings table: one row per column of teamDF_features and one
# column per extracted factor (Factor1, Factor2, ...).
loadings_df

In [None]:
# Use BLR on computed factor scores
# factor_scores_df carries a fresh positional index, so the target is re-indexed the same
# way to keep predictors and labels aligned row for row.
y = teamDF['win'].reset_index(drop=True)
X = factor_scores_df

# Split by game, as in the other modelling sections. A plain row-wise train_test_split can
# place the two teams of one match on opposite sides of the split, so the test set would
# contain matches whose other team was seen during training.
game_ids = teamDF['gameId'].unique()
train_games, test_games = train_test_split(game_ids, test_size=0.2, random_state=42)

# Positional boolean masks: row i of teamDF corresponds to row i of factor_scores_df.
_train_rows = teamDF['gameId'].isin(train_games).to_numpy()
_test_rows = teamDF['gameId'].isin(test_games).to_numpy()

X_train, X_test = X.loc[_train_rows], X.loc[_test_rows]
y_train, y_test = y.loc[_train_rows], y.loc[_test_rows]

# Fit the model on the factor scores.
model = LogisticRegression(solver='lbfgs', max_iter=400)
model.fit(X_train, y_train)

# Pair each factor name with its fitted coefficient (first row of coef_).
features_names = X_train.columns
coefficients = model.coef_
importances = coefficients[0]
blr_feature_importances = dict(zip(features_names, importances))

y_pred = model.predict(X_test)

In [None]:
# Refit on the factor scores with statsmodels to read off coefficient p-values.
import statsmodels.api as sm

# Add the constant intercept column in front of the factor scores.
X = sm.add_constant(X_train, prepend=True)
logit_model = sm.Logit(endog=y_train, exog=X).fit()
print(logit_model.summary())

In [None]:
# Score the factor-score logistic regression against the held-out labels.
accuracy, conf_matrix, class_report = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

print(blr_feature_importances)
print('Accuracy:', accuracy)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)