In [2]:
# Mount Google Drive at /content/drive; the data and output paths used by the
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Parametric

In [3]:
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Read the per-match, per-team statistics table from the mounted Drive folder
file_path = '/content/drive/My Drive/CODES_REPO/LoL/Significance Tests_LoL/Data/perMatchperTeamStats.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# "Total" columns that are rescaled with min-max scaling before the parametric tests
columns_to_normalize = [
    'totalTeamKills', 'totalTeamDeaths', 'totalTeamTurretKills',
    'totalTeamEpicMonsterKills', 'totalTeamGold', 'totalTeamGPM',
]

# One scaler instance, re-fitted on every column in turn
scaler = MinMaxScaler()

# Each column is fitted and scaled on its own. The scaled values go into a new
# `<column>_normalized` column appended to `data`, so the raw values stay intact.
for raw_column in columns_to_normalize:
    scaled_values = scaler.fit_transform(data[[raw_column]])
    data[f'{raw_column}_normalized'] = scaled_values

In [6]:
# Preview the first five rows, including the newly added *_normalized columns
data.head()

Unnamed: 0,gameId,teamId,gameDuration,win,teamAverageRank,matchAverageRank,totalTeamKills,totalTeamDeaths,totalTeamTurretKills,totalTeamEpicMonsterKills,...,averageTeamMinionsKilled,teamIndegreeCentrality,teamOutdegreeCentrality,resistance,totalTeamKills_normalized,totalTeamDeaths_normalized,totalTeamTurretKills_normalized,totalTeamEpicMonsterKills_normalized,totalTeamGold_normalized,totalTeamGPM_normalized
0,EUW1_7056642171,100,31.816667,0,13.0,13.8,39,41,7,1,...,140.8,0.118852,0.094262,2.5417,0.469136,0.493827,0.466667,0.090909,0.321725,0.527083
1,EUW1_7056642171,200,31.816667,1,14.6,13.8,41,39,8,4,...,128.2,0.239691,0.139175,9.2691,0.493827,0.469136,0.533333,0.363636,0.356908,0.604167
2,EUW1_7003965976,100,32.316667,0,8.6,8.0,28,30,3,2,...,133.6,0.100877,0.048246,0.7889,0.333333,0.358025,0.2,0.181818,0.248835,0.348611
3,EUW1_7003965976,200,32.316667,1,7.4,8.0,30,28,9,2,...,141.6,0.074675,0.178571,1.9585,0.358025,0.333333,0.6,0.181818,0.325127,0.5125
4,EUW1_6984979109,100,26.616667,1,5.6,6.0,44,19,11,3,...,105.0,0.245283,0.129717,0.4124,0.530864,0.222222,0.733333,0.272727,0.265923,0.655556


In [7]:
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression

# Define dependent and independent variables
dependent_var = data['win']
independent_vars = data[['averageTeamChampExperience', 'averageTeamVisionScore', 'averageTeamMinionsKilled',
                         'teamIndegreeCentrality', 'teamOutdegreeCentrality',
                         'totalTeamKills_normalized', 'totalTeamDeaths_normalized',
                         'totalTeamTurretKills_normalized', 'totalTeamEpicMonsterKills_normalized',
                         'totalTeamGold_normalized', 'totalTeamGPM_normalized']]

# `win` is a binary outcome, so the group-comparison tests (ANOVA, t-test) must
# compare each feature between winning and losing teams. Passing the feature and
# the 0/1 `win` column as the two samples would only test whether the feature's
# mean equals the win rate, which says nothing about their association.
# NOTE(review): assumes `win` is coded 1 for a win and 0 for a loss - confirm.
won_mask = dependent_var == 1
n_obs = len(dependent_var)

# Initialize the results dictionary (one entry per feature, in column order)
results = {
    "TEST STATISTICS": [f"win - {col}" for col in independent_vars.columns],
    "ANOVA": {"F-STATISTIC": [], "P-VALUE": []},
    "LINEAR REGRESSION": {"F-STATISTIC": [], "P-VALUE": []},
    "T-TEST": {"T-STATISTIC": [], "P-VALUE": []},
    "SPEARMAN": {"CORRELATION COEFFICIENT": [], "P-VALUE": []}
}

for col in independent_vars.columns:
    feature = independent_vars[col]
    won_values = feature[won_mask]
    lost_values = feature[~won_mask]

    # One-way ANOVA: does the feature's mean differ between winners and losers?
    f_stat, p_value = stats.f_oneway(won_values, lost_values)
    results["ANOVA"]["F-STATISTIC"].append(f_stat)
    results["ANOVA"]["P-VALUE"].append(p_value)

    # Simple linear regression of `win` on the feature. `model.score` returns R^2,
    # not F; with one predictor the overall F statistic is R^2 / (1 - R^2) * (n - 2).
    model = LinearRegression().fit(independent_vars[[col]], dependent_var)
    r_squared = model.score(independent_vars[[col]], dependent_var)
    if r_squared < 1:
        f_stat = r_squared / (1 - r_squared) * (n_obs - 2)
    else:
        f_stat = float('inf')  # perfect fit: F is unbounded
    # For a single predictor the regression F-test p-value equals the two-sided
    # Pearson correlation p-value.
    p_value = stats.pearsonr(feature, dependent_var)[1]
    results["LINEAR REGRESSION"]["F-STATISTIC"].append(f_stat)
    results["LINEAR REGRESSION"]["P-VALUE"].append(p_value)

    # Independent two-sample t-test between winners and losers
    t_stat, p_value = stats.ttest_ind(won_values, lost_values)
    results["T-TEST"]["T-STATISTIC"].append(t_stat)
    results["T-TEST"]["P-VALUE"].append(p_value)

    # Spearman rank correlation between the feature and the outcome
    corr_coeff, p_value = stats.spearmanr(feature, dependent_var)
    results["SPEARMAN"]["CORRELATION COEFFICIENT"].append(corr_coeff)
    results["SPEARMAN"]["P-VALUE"].append(p_value)

# Convert results to DataFrame
results_df = pd.DataFrame({
    "TEST STATISTICS": results["TEST STATISTICS"],
    "ANOVA F-STATISTIC": results["ANOVA"]["F-STATISTIC"],
    "ANOVA P-VALUE": results["ANOVA"]["P-VALUE"],
    "LINEAR REGRESSION F-STATISTIC": results["LINEAR REGRESSION"]["F-STATISTIC"],
    "LINEAR REGRESSION P-VALUE": results["LINEAR REGRESSION"]["P-VALUE"],
    "T-TEST T-STATISTIC": results["T-TEST"]["T-STATISTIC"],
    "T-TEST P-VALUE": results["T-TEST"]["P-VALUE"],
    "SPEARMAN CORRELATION COEFFICIENT": results["SPEARMAN"]["CORRELATION COEFFICIENT"],
    "SPEARMAN P-VALUE": results["SPEARMAN"]["P-VALUE"]
})

# Save the results to an Excel file
output_file_path = '/content/drive/My Drive/CODES_REPO/LoL/Significance Tests_LoL/Outputs/parametric_statistical_analysis_results.xlsx'
results_df.to_excel(output_file_path, index=False)


# Non-Parametric

### Setup: imports and data loading

In [8]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import kruskal, pearsonr, kendalltau, mannwhitneyu, tiecorrect
from sklearn.utils import resample
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [9]:
# Reload the per-match, per-team statistics into `df` for the non-parametric analysis
file_path = '/content/drive/My Drive/CODES_REPO/LoL/Significance Tests_LoL/Data/perMatchperTeamStats.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

## Significance test

### Helper function

In [10]:
# Function to perform the permutation test
def permutation_test(x, y, n_permutations=10000, random_state=None):
    """Two-sided Monte Carlo permutation test for a difference in means.

    The pooled observations are shuffled `n_permutations` times; each shuffle is
    split into pseudo-samples of the original sizes and the difference of their
    means is compared (in absolute value) with the observed difference.

    Parameters
    ----------
    x, y : array-like
        The two samples to compare. Each must hold at least one observation.
    n_permutations : int, default 10000
        Number of random relabellings to draw.
    random_state : None, int, or numpy.random.Generator, default None
        Seed or generator for reproducible results. When None, the global
        `np.random` state is used, exactly as before this parameter existed.

    Returns
    -------
    observed_diff : float
        `mean(x) - mean(y)` on the original samples.
    p_value : float
        `(b + 1) / (n_permutations + 1)`, where `b` counts permutations at least
        as extreme as the observed difference. The +1 terms count the observed
        labelling itself, so the estimate can never be exactly zero.

    Raises
    ------
    ValueError
        If either sample is empty or `n_permutations` is less than 1.
    """
    x_values = np.asarray(x, dtype=float).ravel()
    y_values = np.asarray(y, dtype=float).ravel()
    if x_values.size == 0 or y_values.size == 0:
        raise ValueError("x and y must each contain at least one observation")
    if n_permutations < 1:
        raise ValueError("n_permutations must be a positive integer")

    # Keep the legacy global-state behaviour unless the caller asks for a seed.
    if random_state is None:
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation

    observed_diff = x_values.mean() - y_values.mean()
    combined = np.concatenate([x_values, y_values])
    n_x = x_values.size

    # Preallocate instead of growing a Python list inside the loop.
    perm_diffs = np.empty(n_permutations)
    for i in range(n_permutations):
        permuted = permute(combined)
        perm_diffs[i] = permuted[:n_x].mean() - permuted[n_x:].mean()

    n_extreme = np.count_nonzero(np.abs(perm_diffs) >= np.abs(observed_diff))
    p_value = (n_extreme + 1) / (n_permutations + 1)
    return observed_diff, p_value

# Function to check for ties in the data
def check_ties(series):
    """Return True when at least one value occurs more than once in `series`."""
    _, occurrences = np.unique(series, return_counts=True)
    for occurrence_count in occurrences:
        if occurrence_count > 1:
            return True
    return False


### Average columns (normalization not required)

In [11]:

# Define the dependent and independent variables
dependent_var = df['win']
independent_vars = [
    "averageTeamChampExperience", "averageTeamVisionScore", "averageTeamMinionsKilled",
    "teamIndegreeCentrality", "teamOutdegreeCentrality"
]

# The two-sample tests (Kruskal-Wallis, Mann-Whitney U, permutation test) must
# compare each feature between winning and losing teams. Passing the feature and
# the `win` indicator as the two samples would compare the feature's distribution
# with a column of 0/1 values, which is not a test of association.
# NOTE(review): assumes `win` is coded 1 for a win and 0 for a loss - confirm.
won_mask = dependent_var == 1

# Initialize a list to store the results
results = []

# Perform the tests
for var in independent_vars:
    won_values = df.loc[won_mask, var]
    lost_values = df.loc[~won_mask, var]

    # Kruskal-Wallis Test (winners vs. losers)
    kruskal_stat, kruskal_p = kruskal(won_values, lost_values)

    # Pearson Correlation (feature vs. outcome)
    pearson_corr, pearson_p = pearsonr(df[var], dependent_var)

    # Kendall Tau Correlation (feature vs. outcome)
    kendall_corr, kendall_p = kendalltau(df[var], dependent_var)

    # Mann-Whitney U Test (winners vs. losers)
    mannwhitney_stat, mannwhitney_p = mannwhitneyu(won_values, lost_values, alternative='two-sided')

    # Permutation Test on the difference in means (winners minus losers)
    perm_stat, perm_p = permutation_test(won_values, lost_values)

    # Store the results
    results.append({
        'Dependent Variable': 'win',
        'Independent Variable': var,
        'Pearson Correlation (Test Statistic)': pearson_corr,
        'Pearson Correlation (p-value)': pearson_p,
        'Kendall Tau (Test Statistic)': kendall_corr,
        'Kendall Tau (p-value)': kendall_p,
        'Kruskal-Wallis (Test Statistic)': kruskal_stat,
        'Kruskal-Wallis (p-value)': kruskal_p,
        'Mann-Whitney U (Test Statistic)': mannwhitney_stat,
        'Mann-Whitney U (p-value)': mannwhitney_p,
        'Permutation Test (Test Statistic)': perm_stat,
        'Permutation Test (p-value)': perm_p
    })

# Checking for ties in the data
print("\nChecking for ties in the data for the non-parametric models\n")

# Only report ties. scipy's kruskal and mannwhitneyu already apply a tie correction,
# so re-running them here would reproduce the same numbers.
for var in independent_vars:
    if check_ties(df[var]):
        print(f"Warning: Ties detected in {var}. Results may be affected.")

# Final results
results_df_avg = pd.DataFrame(results)
print("Final Results for Average Columns (No Normalization):")
print(results_df_avg)


Results for Average Columns (No Normalization):
  Dependent Variable        Independent Variable  \
0                win  averageTeamChampExperience   
1                win      averageTeamVisionScore   
2                win    averageTeamMinionsKilled   
3                win      teamIndegreeCentrality   
4                win     teamOutdegreeCentrality   

   Pearson Correlation (Test Statistic)  Pearson Correlation (p-value)  \
0                              0.299947                  4.915636e-145   
1                              0.169485                   3.967741e-46   
2                              0.064403                   7.264298e-08   
3                             -0.131624                   2.416780e-28   
4                             -0.076077                   1.976445e-10   

   Kendall Tau (Test Statistic)  Kendall Tau (p-value)  \
0                      0.267918          2.317426e-165   
1                      0.150771           2.641131e-53   
2                   

### Total columns (Normalized)

In [12]:
# Define the dependent and independent variables
dependent_var = df['win']
independent_vars = [
    "totalTeamKills", "totalTeamDeaths", "totalTeamTurretKills",
    "totalTeamEpicMonsterKills", "totalTeamGold", "totalTeamGPM"
]

# Normalize the independent variables.
# Note: this overwrites the raw columns of `df` with their standardized values.
scaler = StandardScaler()
df[independent_vars] = scaler.fit_transform(df[independent_vars])

# The two-sample tests (Kruskal-Wallis, Mann-Whitney U, permutation test) must
# compare each feature between winning and losing teams. Passing the feature and
# the `win` indicator as the two samples would compare the feature's distribution
# with a column of 0/1 values, which is not a test of association.
# NOTE(review): assumes `win` is coded 1 for a win and 0 for a loss - confirm.
won_mask = dependent_var == 1

# Initialize a list to store the results
results = []

# Perform the tests
for var in independent_vars:
    won_values = df.loc[won_mask, var]
    lost_values = df.loc[~won_mask, var]

    # Kruskal-Wallis Test (winners vs. losers)
    kruskal_stat, kruskal_p = kruskal(won_values, lost_values)

    # Pearson Correlation (feature vs. outcome)
    pearson_corr, pearson_p = pearsonr(df[var], dependent_var)

    # Kendall Tau Correlation (feature vs. outcome)
    kendall_corr, kendall_p = kendalltau(df[var], dependent_var)

    # Mann-Whitney U Test (winners vs. losers)
    mannwhitney_stat, mannwhitney_p = mannwhitneyu(won_values, lost_values, alternative='two-sided')

    # Permutation Test on the difference in means (winners minus losers)
    perm_stat, perm_p = permutation_test(won_values, lost_values)

    # Store the results
    results.append({
        'Dependent Variable': 'win',
        'Independent Variable': var,
        'Pearson Correlation (Test Statistic)': pearson_corr,
        'Pearson Correlation (p-value)': pearson_p,
        'Kendall Tau (Test Statistic)': kendall_corr,
        'Kendall Tau (p-value)': kendall_p,
        'Kruskal-Wallis (Test Statistic)': kruskal_stat,
        'Kruskal-Wallis (p-value)': kruskal_p,
        'Mann-Whitney U (Test Statistic)': mannwhitney_stat,
        'Mann-Whitney U (p-value)': mannwhitney_p,
        'Permutation Test (Test Statistic)': perm_stat,
        'Permutation Test (p-value)': perm_p
    })

# Convert results to DataFrame. The dict keys are inserted with the two identifier
# columns first, so the column order needs no further adjustment.
results_df_total = pd.DataFrame(results)

print("Results for Total Columns (With Normalization):")
print(results_df_total)


Results for Total Columns (With Normalization):
  Dependent Variable       Independent Variable  \
0                win             totalTeamKills   
1                win            totalTeamDeaths   
2                win       totalTeamTurretKills   
3                win  totalTeamEpicMonsterKills   
4                win              totalTeamGold   
5                win               totalTeamGPM   

   Pearson Correlation (Test Statistic)  Pearson Correlation (p-value)  \
0                              0.511435                   0.000000e+00   
1                             -0.510386                   0.000000e+00   
2                              0.825232                   0.000000e+00   
3                              0.632290                   0.000000e+00   
4                              0.357805                  8.036335e-210   
5                              0.752261                   0.000000e+00   

   Kendall Tau (Test Statistic)  Kendall Tau (p-value)  \
0                

### Export results to Excel

In [13]:
# Stack the "average" and "total" result tables into a single table
results_df = pd.concat([results_df_avg, results_df_total], ignore_index=True)

# Keep the path in one place so the confirmation message names the file that
# was actually written (the old message referred to 'non_parametric.xlsx').
non_parametric_output_path = '/content/drive/My Drive/CODES_REPO/LoL/Significance Tests_LoL/Outputs/non_parametric_statistical_analysis_results.xlsx'
results_df.to_excel(non_parametric_output_path, index=False)
print(f"Results have been saved to '{non_parametric_output_path}'")


Results have been saved to 'non_parametric.xlsx'


In [14]:
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import PatternFill

# Paths of the two result workbooks written by the earlier cells, and of the combined output
paramteric_test_path = '/content/drive/My Drive/CODES_REPO/LoL/Significance Tests_LoL/Outputs/parametric_statistical_analysis_results.xlsx'
model_statistics_results_path = '/content/drive/My Drive/CODES_REPO/LoL/Significance Tests_LoL/Outputs/non_parametric_statistical_analysis_results.xlsx'
combined_output_path = '/content/drive/My Drive/CODES_REPO/LoL/Significance Tests_LoL/Outputs/combined_statistical_analysis_results.xlsx'

# p-values strictly below this threshold are highlighted
SIGNIFICANCE_LEVEL = 0.05

# Read the Excel sheets
paramteric_test_df = pd.read_excel(paramteric_test_path)
model_statistics_results_df = pd.read_excel(model_statistics_results_path)

# Remove the first column from paramteric_test (the "win - <feature>" label);
# the non-parametric table already identifies each row.
paramteric_test_df = paramteric_test_df.iloc[:, 1:]

# The tables are joined side by side purely by row position, so they must have
# the same number of rows; otherwise statistics would be attached to the wrong feature.
# NOTE(review): this also assumes both tables list the features in the same order - confirm.
if len(paramteric_test_df) != len(model_statistics_results_df):
    raise ValueError(
        "Cannot combine results: parametric table has "
        f"{len(paramteric_test_df)} rows but non-parametric table has "
        f"{len(model_statistics_results_df)} rows"
    )

# Add the columns from paramteric_test to the Model_Statistics_Results
combined_df = pd.concat([model_statistics_results_df, paramteric_test_df], axis=1)

# Create a new Excel file with combined results
with pd.ExcelWriter(combined_output_path, engine='openpyxl') as writer:
    combined_df.to_excel(writer, index=False, sheet_name='Results')

    # Access the workbook and sheet
    workbook = writer.book
    sheet = workbook['Results']

    # Define a fill for highlighting cells
    highlight_fill = PatternFill(start_color='FFFF99', end_color='FFFF99', fill_type='solid')

    # The sheet is written without an index column, so DataFrame column i lands in
    # worksheet column i + 1; row 1 holds the header.
    for col_idx, col_name in enumerate(combined_df.columns, start=1):
        if 'p-value' not in str(col_name).lower():
            continue
        for (cell,) in sheet.iter_rows(min_row=2, max_row=sheet.max_row,
                                       min_col=col_idx, max_col=col_idx):
            value = cell.value
            # Only numeric cells are compared. bool is excluded because it is an
            # int subclass. NaN and empty cells never satisfy the comparison.
            is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
            if is_number and value < SIGNIFICANCE_LEVEL:
                cell.fill = highlight_fill

print("Combined and highlighted successfully.")


Combined and highlighted successfully.
