In [3]:
import pandas as pd

def clean_data(df):
    # Drop columns: 'gameid', 'datacompleteness' and 50 other columns
    df = df.drop(columns=['gameid', 'datacompleteness', 'url', 'league', 'split', 'year', 'playoffs', 'date', 'game', 'patch', 'participantid', 'side', 'teamid', 'champion', 'ban1', 'ban2', 'ban3', 'ban4', 'ban5', 'gamelength', 'doublekills', 'triplekills', 'quadrakills', 'pentakills', 'firstbloodassist', 'firstbloodkill', 'firstbloodvictim', 'firstdragon', 'dragons', 'opp_dragons', 'elementaldrakes', 'infernals', 'opp_elementaldrakes', 'mountains', 'clouds', 'oceans', 'chemtechs', 'hextechs', 'dragons (type unknown)', 'elders', 'opp_elders', 'firstherald', 'heralds', 'opp_heralds', 'firstbaron', 'firsttower', 'towers', 'opp_towers', 'firstmidtower', 'firsttothreetowers', 'turretplates', 'opp_turretplates'])
    # Drop rows with missing data in columns: 'playername', 'result'
    df = df.dropna(subset=['playername', 'result'])
    # Drop column: 'gspd'
    df = df.drop(columns=['gspd'])
    # Drop column: 'monsterkillsenemyjungle'
    df = df.drop(columns=['monsterkillsenemyjungle'])
    # Drop column: 'monsterkillsownjungle'
    df = df.drop(columns=['monsterkillsownjungle'])
    # Drop rows with missing data across all columns
    df = df.dropna()
    # Drop column: 'playerid'
    df = df.drop(columns=['playerid'])
    # Export DataFrame to an Excel file
    df.to_excel("output.xlsx", index=False)
    return df

# Loaded variable 'df' from URI: c:\Users\ssjed\OneDrive\Documents\GitHub\Fall2024\STAT 335\Final\2024_LoL_esports_match_data_from_OraclesElixir.xlsx
df = pd.read_excel(r'c:\Users\ssjed\OneDrive\Documents\GitHub\Fall2024\STAT 335\Final\2024_LoL_esports_match_data_from_OraclesElixir.xlsx')

df_clean = clean_data(df.copy())
df_clean.head()

Unnamed: 0,position,playername,teamname,result,kills,deaths,assists,teamkills,teamdeaths,firstblood,...,opp_csat15,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15
216,top,Adam,Team BDS,0,3,3,3,10,14,1.0,...,135.0,1690.0,-346.0,-1.0,2.0,1.0,1.0,0.0,1.0,2.0
217,jng,Sheo,Team BDS,0,2,2,2,10,14,1.0,...,95.0,-378.0,74.0,3.0,1.0,2.0,0.0,2.0,2.0,0.0
218,mid,nuc,Team BDS,0,2,3,1,10,14,0.0,...,152.0,279.0,-330.0,-16.0,1.0,0.0,0.0,0.0,1.0,0.0
219,bot,Ice,Team BDS,0,2,2,2,10,14,0.0,...,109.0,213.0,1319.0,17.0,2.0,1.0,2.0,3.0,1.0,2.0
220,sup,Labrov,Team BDS,0,1,4,3,10,14,0.0,...,23.0,-75.0,-1313.0,-8.0,0.0,3.0,2.0,0.0,3.0,2.0


In [4]:
import numpy as np
import random

# Set the seed for reproducibility
seed = 42
np.random.seed(seed)
random.seed(seed)

In [5]:
# Select only numeric columns
numeric_df_clean = df_clean.select_dtypes(include=['number'])

# Calculate the correlation matrix
correlation_matrix = numeric_df_clean.corr()

# Get the absolute values of the correlation matrix
abs_correlation_matrix = correlation_matrix.abs()

# Unstack the matrix and sort by correlation value
sorted_correlations = abs_correlation_matrix.unstack().sort_values(ascending=False)

# Drop the duplicate pairs (correlation of a variable with itself)
sorted_correlations = sorted_correlations[sorted_correlations != 1]

# Display the sorted correlations
sorted_correlations

# Calculate the correlation of 'result' with all other columns
result_correlations = abs_correlation_matrix['result']

# Sort the correlations by absolute value in descending order
sorted_result_correlations = result_correlations.sort_values(ascending=False)

# Display the sorted correlations
sorted_result_correlations

result                      1.000000
team kpm                    0.687156
teamkills                   0.668952
teamdeaths                  0.667032
assists                     0.527882
                              ...   
wardsplaced                 0.021534
damagemitigatedperminute    0.020690
damageshare                 0.000685
earnedgoldshare             0.000041
ckpm                        0.000017
Name: result, Length: 64, dtype: float64

In [6]:
# Select only numeric columns
numeric_df_clean = df_clean.select_dtypes(include=['number'])

# Calculate the correlation matrix
correlation_matrix = numeric_df_clean.corr()

# Get the absolute values of the correlation matrix
abs_correlation_matrix = correlation_matrix.abs()

# Unstack the matrix and sort by correlation value
sorted_correlations = abs_correlation_matrix.unstack().sort_values(ascending=False)

# Drop the duplicate pairs (correlation of a variable with itself)
sorted_correlations = sorted_correlations[sorted_correlations != 1]

# Display the sorted correlations
sorted_correlations

csat15           csat10             0.983146
csat10           csat15             0.983146
opp_csat10       opp_csat15         0.983115
opp_csat15       opp_csat10         0.983115
totalgold        earnedgold         0.982773
                                      ...   
earnedgoldshare  team kpm           0.000105
result           earnedgoldshare    0.000041
earnedgoldshare  result             0.000041
ckpm             result             0.000017
result           ckpm               0.000017
Length: 4032, dtype: float64

In [7]:
# Create datasets based on position
positions = ['top', 'jng', 'mid', 'bot', 'sup']
datasets = {}

for position in positions:
    datasets[position] = df_clean[df_clean['position'] == position]

# Display the first few rows of each dataset
for position, dataset in datasets.items():
    print(f"Dataset for position: {position}")
    print(dataset.head())
    print("\n")

Dataset for position: top
    position   playername      teamname  result  kills  deaths  assists  \
216      top         Adam      Team BDS       0      3       3        3   
221      top  BrokenBlade    G2 Esports       1      4       3        2   
228      top     Szygenda         Rogue       0      0       3        2   
233      top   Irrelevant     SK Gaming       1      6       1        6   
240      top    Cabochard  Karmine Corp       0      0       4        4   

     teamkills  teamdeaths  firstblood  ...  opp_csat15  golddiffat15  \
216         10          14         1.0  ...       135.0        1690.0   
221         14          10         0.0  ...       134.0       -1690.0   
228          4          16         0.0  ...       133.0        -136.0   
233         16           4         0.0  ...       118.0         136.0   
240          9          20         0.0  ...       142.0        -175.0   

     xpdiffat15  csdiffat15  killsat15  assistsat15  deathsat15  \
216      -346.0  

In [8]:
import pandas as pd
import os

# Create an Excel file for each role
for position, dataset in datasets.items():
    filename = f"{position}_dataset.xlsx"
    dataset.to_excel(filename, index=False)
    print(f"Created {filename}")

Created top_dataset.xlsx
Created jng_dataset.xlsx
Created mid_dataset.xlsx
Created bot_dataset.xlsx
Created sup_dataset.xlsx


In [9]:
# Create the 'roles' directory if it doesn't exist
if not os.path.exists('roles'):
    os.makedirs('roles')

# Move the Excel files to the 'roles' directory
for position, dataset in datasets.items():
    filename = f"{position}_dataset.xlsx"
    destination = os.path.join('roles', filename)
    if os.path.exists(destination):
        os.remove(destination)
    os.rename(filename, destination)
    print(f"Moved {filename} to roles/{filename}")

Moved top_dataset.xlsx to roles/top_dataset.xlsx
Moved jng_dataset.xlsx to roles/jng_dataset.xlsx
Moved mid_dataset.xlsx to roles/mid_dataset.xlsx
Moved bot_dataset.xlsx to roles/bot_dataset.xlsx
Moved sup_dataset.xlsx to roles/sup_dataset.xlsx


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Define the feature columns and target variable
features = numeric_df_clean.drop(columns=['result'])
target = numeric_df_clean['result']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression(max_iter=8334)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.9610077984403119
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       846
           1       0.96      0.96      0.96       821

    accuracy                           0.96      1667
   macro avg       0.96      0.96      0.96      1667
weighted avg       0.96      0.96      0.96      1667



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
import numpy as np

import statsmodels.api as sm

# Add a constant to the features
X_train_const = sm.add_constant(X_train)

# Fit the initial model with all features
model = sm.Logit(y_train, X_train_const).fit()

# Perform backward selection
def backward_selection(data, target, significance_level=0.05):
    features = data.columns.tolist()
    while len(features) > 0:
        X = sm.add_constant(data[features])
        model = sm.Logit(target, X).fit(disp=0)
        p_values = model.pvalues[1:]  # Exclude the intercept
        max_p_value = p_values.max()
        if max_p_value > significance_level:
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
        else:
            break
    return features

# Run backward selection
selected_features = backward_selection(X_train, y_train)

# Fit the final model with selected features
X_train_selected = sm.add_constant(X_train[selected_features])
final_model = sm.Logit(y_train, X_train_selected).fit()

# Evaluate the final model
X_test_selected = sm.add_constant(X_test[selected_features])
y_pred = final_model.predict(X_test_selected)
y_pred_class = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred_class)
report = classification_report(y_test, y_pred_class)

print(f'Selected Features: {selected_features}')
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

         Current function value: 0.053815
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.056230
         Iterations 12
Selected Features: ['assists', 'teamkills', 'teamdeaths', 'team kpm', 'ckpm', 'inhibitors', 'opp_inhibitors', 'wardsplaced', 'visionscore', 'earned gpm', 'earnedgoldshare', 'goldspent', 'minionkills', 'cspm', 'csat15', 'opp_goldat15', 'opp_csat15', 'killsat15', 'assistsat15', 'opp_killsat15', 'opp_assistsat15']
Accuracy: 0.976004799040192
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       846
           1       0.98      0.98      0.98       821

    accuracy                           0.98      1667
   macro avg       0.98      0.98      0.98      1667
weighted avg       0.98      0.98      0.98      1667



In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Iterate through each dataset in the roles folder
for position, dataset in datasets.items():
    # Select only numeric columns
    numeric_dataset = dataset.select_dtypes(include=['number'])
    
    # Define the feature columns and target variable
    features = numeric_dataset.drop(columns=['result'])
    target = numeric_dataset['result']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    
    # Create and train the logistic regression model
    model = LogisticRegression(max_iter=8334)
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f'Position: {position}')
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('\n')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Position: top
Accuracy: 0.972972972972973
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       165
           1       0.96      0.98      0.97       168

    accuracy                           0.97       333
   macro avg       0.97      0.97      0.97       333
weighted avg       0.97      0.97      0.97       333





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Position: jng
Accuracy: 0.9670658682634731
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       169
           1       0.97      0.96      0.97       165

    accuracy                           0.97       334
   macro avg       0.97      0.97      0.97       334
weighted avg       0.97      0.97      0.97       334





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Position: mid
Accuracy: 0.9640718562874252
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97       169
           1       0.98      0.95      0.96       165

    accuracy                           0.96       334
   macro avg       0.96      0.96      0.96       334
weighted avg       0.96      0.96      0.96       334





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Position: bot
Accuracy: 0.9579579579579579
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       181
           1       0.96      0.95      0.95       152

    accuracy                           0.96       333
   macro avg       0.96      0.96      0.96       333
weighted avg       0.96      0.96      0.96       333



Position: sup
Accuracy: 0.9520958083832335
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       164
           1       0.94      0.96      0.95       170

    accuracy                           0.95       334
   macro avg       0.95      0.95      0.95       334
weighted avg       0.95      0.95      0.95       334





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Iterate through each dataset in the roles folder
for position, dataset in datasets.items():
    # Select only numeric columns
    numeric_dataset = dataset.select_dtypes(include=['number'])
    
    # Define the feature columns and target variable
    features = numeric_dataset.drop(columns=['result'])
    target = numeric_dataset['result']
    
    # Run backward selection
    selected_features = backward_selection(features, target)
    
    # Fit the final model with selected features
    X_train_selected = sm.add_constant(features[selected_features])
    final_model = sm.Logit(target, X_train_selected).fit()
    
    # Display the selected features and model summary
    print(f'Position: {position}')
    print(f'Selected Features: {selected_features}')
    print(final_model.summary())
    print('\n')



Optimization terminated successfully.
         Current function value: 0.049551
         Iterations 12
Position: top
Selected Features: ['assists', 'teamdeaths', 'team kpm', 'ckpm', 'barons', 'opp_inhibitors', 'damagetochampions', 'dpm', 'damageshare', 'earned gpm', 'earnedgoldshare', 'goldspent', 'opp_xpat10', 'opp_xpat15']
                           Logit Regression Results                           
Dep. Variable:                 result   No. Observations:                 1664
Model:                          Logit   Df Residuals:                     1649
Method:                           MLE   Df Model:                           14
Date:                Mon, 25 Nov 2024   Pseudo R-squ.:                  0.9285
Time:                        14:44:08   Log-Likelihood:                -82.452
converged:                       True   LL-Null:                       -1153.4
Covariance Type:            nonrobust   LLR p-value:                     0.000
                        coef    std err  



Optimization terminated successfully.
         Current function value: 0.037739
         Iterations 12
Position: jng
Selected Features: ['assists', 'team kpm', 'ckpm', 'inhibitors', 'opp_inhibitors', 'damagemitigatedperminute', 'wardsplaced', 'visionscore', 'earnedgold', 'earned gpm', 'earnedgoldshare', 'goldspent', 'csdiffat10', 'opp_killsat10', 'opp_xpat15', 'opp_csat15', 'killsat15', 'assistsat15', 'opp_killsat15', 'opp_assistsat15']
                           Logit Regression Results                           
Dep. Variable:                 result   No. Observations:                 1668
Model:                          Logit   Df Residuals:                     1647
Method:                           MLE   Df Model:                           20
Date:                Mon, 25 Nov 2024   Pseudo R-squ.:                  0.9456
Time:                        14:44:12   Log-Likelihood:                -62.949
converged:                       True   LL-Null:                       -1156.2
Covari



Optimization terminated successfully.
         Current function value: 0.039345
         Iterations 13
Position: mid
Selected Features: ['assists', 'teamdeaths', 'team kpm', 'ckpm', 'opp_barons', 'inhibitors', 'opp_inhibitors', 'totalgold', 'earnedgold', 'earned gpm', 'earnedgoldshare', 'goldspent', 'total cs', 'monsterkills', 'cspm', 'csdiffat10', 'golddiffat15', 'assistsat15', 'opp_killsat15', 'opp_assistsat15']
                           Logit Regression Results                           
Dep. Variable:                 result   No. Observations:                 1668
Model:                          Logit   Df Residuals:                     1647
Method:                           MLE   Df Model:                           20
Date:                Mon, 25 Nov 2024   Pseudo R-squ.:                  0.9432
Time:                        14:44:16   Log-Likelihood:                -65.627
converged:                       True   LL-Null:                       -1156.2
Covariance Type:            n



Optimization terminated successfully.
         Current function value: 0.041358
         Iterations 13
Position: bot
Selected Features: ['teamkills', 'teamdeaths', 'team kpm', 'ckpm', 'opp_inhibitors', 'damagetochampions', 'wardsplaced', 'wardskilled', 'wcpm', 'earned gpm', 'earnedgoldshare', 'goldspent', 'xpat10', 'opp_goldat10', 'opp_xpat10', 'golddiffat10', 'csdiffat10', 'killsat10', 'assistsat10', 'opp_killsat10', 'opp_assistsat10', 'opp_goldat15', 'golddiffat15', 'xpdiffat15', 'csdiffat15']
                           Logit Regression Results                           
Dep. Variable:                 result   No. Observations:                 1664
Model:                          Logit   Df Residuals:                     1638
Method:                           MLE   Df Model:                           25
Date:                Mon, 25 Nov 2024   Pseudo R-squ.:                  0.9403
Time:                        14:44:21   Log-Likelihood:                -68.819
converged:               



Optimization terminated successfully.
         Current function value: 0.034819
         Iterations 14
Position: sup
Selected Features: ['teamdeaths', 'team kpm', 'ckpm', 'damageshare', 'damagetakenperminute', 'controlwardsbought', 'earned gpm', 'earnedgoldshare', 'goldspent', 'total cs', 'goldat10', 'opp_csat10', 'xpdiffat10', 'deathsat10', 'opp_killsat10', 'opp_assistsat10', 'opp_deathsat10', 'opp_xpat15', 'golddiffat15', 'csdiffat15', 'killsat15', 'assistsat15', 'opp_killsat15', 'opp_assistsat15', 'opp_deathsat15']
                           Logit Regression Results                           
Dep. Variable:                 result   No. Observations:                 1670
Model:                          Logit   Df Residuals:                     1644
Method:                           MLE   Df Model:                           25
Date:                Mon, 25 Nov 2024   Pseudo R-squ.:                  0.9498
Time:                        14:44:25   Log-Likelihood:                -58.148
co

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define the feature columns and target variable
features = numeric_df_clean.drop(columns=['result'])
target = numeric_df_clean['result']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.9556088782243551
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       846
           1       0.95      0.96      0.96       821

    accuracy                           0.96      1667
   macro avg       0.96      0.96      0.96      1667
weighted avg       0.96      0.96      0.96      1667



In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Function to perform backward selection
def backward_selection_rf(X_train, y_train, X_test, y_test, significance_level=0.05):
    features = X_train.columns.tolist()
    best_accuracy = 0
    best_features = features.copy()
    
    while len(features) > 0:
        # Train the Random Forest model
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_model.fit(X_train[features], y_train)
        
        # Make predictions on the test set
        y_pred = rf_model.predict(X_test[features])
        
        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_features = features.copy()
        
        # Get feature importances
        importances = rf_model.feature_importances_
        
        # Find the least important feature
        least_important_feature = features[np.argmin(importances)]
        
        # Remove the least important feature
        features.remove(least_important_feature)
    
    return best_features, best_accuracy

# Run backward selection
selected_features_rf, best_accuracy_rf = backward_selection_rf(X_train, y_train, X_test, y_test)

# Fit the final Random Forest model with selected features
rf_model_final = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_final.fit(X_train[selected_features_rf], y_train)

# Make predictions on the test set
y_pred_rf_final = rf_model_final.predict(X_test[selected_features_rf])

# Evaluate the final model
accuracy_rf_final = accuracy_score(y_test, y_pred_rf_final)
report_rf_final = classification_report(y_test, y_pred_rf_final)

print(f'Selected Features: {selected_features_rf}')
print(f'Best Accuracy: {best_accuracy_rf}')
print('Final Model Accuracy:', accuracy_rf_final)
print('Classification Report:')
print(report_rf_final)

Selected Features: ['teamkills', 'teamdeaths', 'team kpm', 'ckpm']
Best Accuracy: 0.9868026394721056
Final Model Accuracy: 0.9868026394721056
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       846
           1       0.98      0.99      0.99       821

    accuracy                           0.99      1667
   macro avg       0.99      0.99      0.99      1667
weighted avg       0.99      0.99      0.99      1667



In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Iterate through each dataset in the roles folder
for position, dataset in datasets.items():
    # Select only numeric columns
    numeric_dataset = dataset.select_dtypes(include=['number'])
    
    # Define the feature columns and target variable
    features = numeric_dataset.drop(columns=['result'])
    target = numeric_dataset['result']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    
    # Create and train the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = rf_model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f'Position: {position}')
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('\n')

Position: top
Accuracy: 0.9579579579579579
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       165
           1       0.95      0.96      0.96       168

    accuracy                           0.96       333
   macro avg       0.96      0.96      0.96       333
weighted avg       0.96      0.96      0.96       333



Position: jng
Accuracy: 0.9431137724550899
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       169
           1       0.94      0.95      0.94       165

    accuracy                           0.94       334
   macro avg       0.94      0.94      0.94       334
weighted avg       0.94      0.94      0.94       334



Position: mid
Accuracy: 0.9640718562874252
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       169
           1       0.97      0.96  

In [17]:
# Iterate through each dataset in the roles folder
for position, dataset in datasets.items():
    # Select only numeric columns
    numeric_dataset = dataset.select_dtypes(include=['number'])
    
    # Define the feature columns and target variable
    features = numeric_dataset.drop(columns=['result'])
    target = numeric_dataset['result']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    
    # Run backward selection
    selected_features_rf, best_accuracy_rf = backward_selection_rf(X_train, y_train, X_test, y_test)
    
    # Fit the final Random Forest model with selected features
    rf_model_final = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model_final.fit(X_train[selected_features_rf], y_train)
    
    # Make predictions on the test set
    y_pred_rf_final = rf_model_final.predict(X_test[selected_features_rf])
    
    # Evaluate the final model
    accuracy_rf_final = accuracy_score(y_test, y_pred_rf_final)
    report_rf_final = classification_report(y_test, y_pred_rf_final)
    
    print(f'Position: {position}')
    print(f'Selected Features: {selected_features_rf}')
    print(f'Best Accuracy: {best_accuracy_rf}')
    print('Final Model Accuracy:', accuracy_rf_final)
    print('Classification Report:')
    print(report_rf_final)
    print('\n')

Position: top
Selected Features: ['kills', 'deaths', 'assists', 'teamkills', 'teamdeaths', 'team kpm', 'ckpm', 'inhibitors', 'opp_inhibitors', 'damagetochampions', 'dpm', 'damageshare', 'damagetakenperminute', 'damagemitigatedperminute', 'wardsplaced', 'wpm', 'wardskilled', 'wcpm', 'controlwardsbought', 'visionscore', 'vspm', 'totalgold', 'earnedgold', 'earned gpm', 'earnedgoldshare', 'goldspent', 'total cs', 'minionkills', 'monsterkills', 'cspm', 'goldat10', 'xpat10', 'csat10', 'opp_goldat10', 'opp_xpat10', 'opp_csat10', 'golddiffat10', 'xpdiffat10', 'csdiffat10', 'deathsat10', 'goldat15', 'xpat15', 'csat15', 'opp_goldat15', 'opp_xpat15', 'opp_csat15', 'golddiffat15', 'xpdiffat15', 'csdiffat15', 'killsat15', 'assistsat15', 'deathsat15', 'opp_killsat15', 'opp_assistsat15', 'opp_deathsat15']
Best Accuracy: 0.9669669669669669
Final Model Accuracy: 0.9669669669669669
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97  