In [1]:
import glob
import pandas as pd

# Get a list of all files matching the pattern
file_list = glob.glob('data/parsed_data/heroes_matches_csv/heroes_matches_batch_*.csv')

# Initialize an empty dataframe to store the results
result_df = pd.DataFrame()

# Iterate through each file
for file in file_list:
    # Load the data
    temp_df = pd.read_csv(file)
    
    # Filter columns starting with 'A' and 'B' and include 'duration'
    filtered_df = temp_df.filter(regex='^(duration|[AE])')
    
    # Append the filtered data to the result dataframe
    result_df = pd.concat([result_df, filtered_df], ignore_index=True)

# Display the result dataframe
print(result_df)
# Initialize an empty list to store the new rows
new_rows = []

# Iterate through each match (row in the original dataset)
for _, match_row in result_df.iterrows():
    for prefix in ['A', 'E']:  # Iterate through teams (A and B)
        for i in range(1, 6):  # Iterate through players (1 to 5)
            # Create a new row for each player
            player_data = {}
            for col in match_row.index:
                if col.startswith(f'{prefix}{i}_'):  # Filter columns for the specific player
                    # Rename column by removing prefix (e.g., A1_kills -> kills)
                    new_col_name = col[len(f'{prefix}{i}_'):]
                    player_data[new_col_name] = match_row[col]
            
            # Add team information
            player_data['duration'] = match_row['duration']
            new_rows.append(player_data)

# Create a new dataframe from the new rows
final_df = pd.DataFrame(new_rows)

# Display the final dataframe
print(final_df)
print(len(file_list))

      duration  A1_hero_id  A1_kills  A1_deaths  A1_assists  A1_gold_per_min  \
0         3103          84         9         12          12              448   
1         1827          98         5          5          16             1088   
2         1728          47         7          8          13              884   
3         1242          37         1          1          32              993   
4         1318          63         9          5           6              961   
...        ...         ...       ...        ...         ...              ...   
4685      2567          23         4         13           4              394   
4686      1454          37         3          8           6              677   
4687      2468          38         4         13           8              444   
4688      2005         108         1          7           7              405   
4689      1338          67         5          2          12             1080   

      A1_xp_per_min  A1_win  A1_perform

In [2]:
final_df.head(20)   

Unnamed: 0,hero_id,kills,deaths,assists,gold_per_min,xp_per_min,win,performance,normalized_performance,level,net_worth,team_score,opponent_score,tower_damage,hero_damage,duration
0,84.0,9.0,12.0,12.0,448.0,533.0,1.0,0.2818,0.194156,22.0,19970.0,42.0,55.0,399.0,26295.0,3103.0
1,42.0,6.0,5.0,5.0,797.0,1072.0,1.0,0.171,0.103175,28.0,37014.0,42.0,55.0,16667.0,22606.0,3103.0
2,45.0,11.0,9.0,13.0,414.0,742.0,1.0,0.4078,0.217121,26.0,21103.0,42.0,55.0,478.0,28962.0,3103.0
3,78.0,11.0,13.0,18.0,594.0,1164.0,1.0,0.4541,0.276931,29.0,25591.0,42.0,55.0,5777.0,99410.0,3103.0
4,36.0,5.0,16.0,23.0,430.0,626.0,1.0,0.3758,0.199169,24.0,19313.0,42.0,55.0,320.0,41941.0,3103.0
5,34.0,6.0,6.0,20.0,384.0,698.0,0.0,0.3299,0.151948,25.0,16695.0,55.0,42.0,730.0,50621.0,3103.0
6,111.0,6.0,8.0,21.0,701.0,879.0,0.0,0.3004,0.190628,27.0,30501.0,55.0,42.0,1208.0,67604.0,3103.0
7,72.0,4.0,11.0,19.0,410.0,591.0,0.0,0.1563,0.088857,23.0,18238.0,55.0,42.0,424.0,32475.0,3103.0
8,131.0,4.0,9.0,36.0,440.0,980.0,0.0,0.513,0.288545,28.0,15445.0,55.0,42.0,51.0,21188.0,3103.0
9,8.0,34.0,8.0,14.0,845.0,1187.0,0.0,0.6823,0.362193,29.0,35566.0,55.0,42.0,6188.0,141596.0,3103.0


In [3]:
import numpy as np
def log_mse_calculate(y_true, y_pred):
  """
  Calculate the Log Mean Squared Error (Log MSE) for like counts (log(like_count + 1)).

  Parameters:
  - y_true: array-like, actual like counts
  - y_pred: array-like, predicted like counts

  Returns:
  - log_mse: float, Log Mean Squared Error
  """
  # Ensure inputs are numpy arrays
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)

  # Log transformation: log(like_count + 1)
  log_y_true = np.log1p(y_true)
  log_y_pred = np.log1p(y_pred)

  # Compute squared errors
  squared_errors = (log_y_true - log_y_pred) ** 2

  # Return the mean of squared errors
  return np.mean(squared_errors)

## FIRST PART
-Now I will form a model which will predict tower_damage

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocess the data
# Assuming 'tower_damage' is the target variable and the rest are features
X = final_df.drop(columns=['tower_damage', 'hero_id', 'normalized_performance', 'kills', 'deaths', 'assists', 'hero_damage']) # Drop columns that are not needed to train the model. 
#hero_damage is dropped because it's not present in my matches(I am trying to predict tower damage in my matches).

y = final_df['tower_damage']

# Convert y values to log form in order not to be skewed by outliers
y_log = np.log1p(y)

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.2, random_state=42)


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1,
                           verbose=2,
                           scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Score: {best_score}')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   8.1s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   8.2s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   8.2s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   6.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   6.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.2s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.3s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.4s
[CV] EN



[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  13.2s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  27.9s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  13.3s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  13.4s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  28.0s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   6.8s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   6.4s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   6.4s
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=1, min_s

In [6]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Train a RandomForestRegressor model with the best parameters from GridSearchCV
model = RandomForestRegressor(**best_params, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred_log = model.predict(X_val)

# Convert predictions back to original scale
y_pred = np.expm1(y_pred_log)

# Evaluate the model using log mean squared error
log_mse = log_mse_calculate(y_val, y_pred_log)
print(f'Log Mean Squared Error: {log_mse}')

# Calculate and print R^2 score
r2 = r2_score(np.expm1(y_val), y_pred)
print(f'R^2 Score: {r2}')


Log Mean Squared Error: 0.4084544279414639
R^2 Score: 0.4912709308532357


In [25]:
from sklearn.model_selection import cross_val_score

# Initialize the RandomForestRegressor model with the best parameters from GridSearchCV
model = RandomForestRegressor(**best_params, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(model, X, y_log, cv=5, scoring='neg_mean_squared_error')

# Calculate the mean and standard deviation of the cross-validation scores
mean_cv_score = -np.mean(cv_scores)
std_cv_score = np.std(cv_scores)

print(f'Mean Cross-Validation Score: {mean_cv_score}')
print(f'Standard Deviation of Cross-Validation Score: {std_cv_score}')

Mean Cross-Validation Score: -4.576466982960329
Standard Deviation of Cross-Validation Score: 0.09236216954869056


## SECOND PART
-Now I will form a model which will predict hero_damage

In [9]:
from sklearn.model_selection import train_test_split

# Preprocess the data
# Assuming 'tower_damage' is the target variable and the rest are features
X_damage = final_df.drop(columns=['tower_damage', 'hero_id', 'normalized_performance','hero_damage']) # Drop columns that are not needed to train the model. 
#hero_damage is dropped because it's not present in my matches(I am trying to predict tower damage in my matches).

y_damage = final_df['hero_damage']

# Convert y values to log form
y_damage_log = np.log1p(y_damage)

# Split the data into training and testing sets
X_damage_train, X_damage_val, y_damage_train, y_damage_val = train_test_split(X_damage, y_damage_log, test_size=0.2, random_state=42)


In [10]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Train a RandomForestRegressor model with the best parameters from GridSearchCV
model_damage = RandomForestRegressor(**best_params, random_state=42)
model_damage.fit(X_damage_train, y_damage_train)

# Make predictions
y_damage_pred_log = model_damage.predict(X_damage_val)

# Convert predictions back to original scale
y_damage_pred = np.expm1(y_damage_pred_log)

# Evaluate the model using log mean squared error
log_mse_damage = log_mse_calculate(y_damage_val, y_damage_pred_log)
print(f'Log Mean Squared Error: {log_mse_damage}')

# Calculate and print R^2 score
r2_damage = r2_score(np.expm1(y_damage_val), y_damage_pred)
print(f'R^2 Score: {r2_damage}')


Log Mean Squared Error: 0.002229542990831553
R^2 Score: 0.7706174336317602


# PREDICTION FOR MY MATCHES
- There's no way to find out how the model performs for my matches since tower_damage and hero_damage features are missing for most of them. But the purpose is just to fill these empty features in my matches, this will not be used elsewhere in this project. 