In [None]:
import pandas as pd
import re

# Load the per-timestamp feature rows and the per-match target.
timestamps_data = pd.read_csv('data_csv/timestamps_ext.csv')

# The target file is relabelled to exactly two columns; a different column
# count raises instead of being silently accepted.
target = pd.read_csv('data_csv/first_win.csv').set_axis(
    ['matchId', 'first_win'], axis=1
)

# Default (inner) merge: only matchIds present in both files are kept.
data = timestamps_data.merge(target, on='matchId')

In [None]:
# Columns that identify or time-stamp a row; they are excluded from the
# model features further down in the notebook.
others = ['matchId', 'timeStamp', 'percentTimeStamp', "gameDuration"]

# Player columns are named p<N>_<feature>: p1-p5 are grouped as "blue",
# p6-p10 as "red". The feature name is the last capture group.
blue_team_pattern = re.compile(r'p[1-5]_(.*)')
red_team_pattern = re.compile(r'p(6|7|8|9|10)_(.*)')

# feature name -> list of the per-player columns carrying that feature
blue_team_columns = {}
red_team_columns = {}

for col in data.columns:
    blue_hit = blue_team_pattern.match(col)
    if blue_hit:
        blue_team_columns.setdefault(blue_hit.group(1), []).append(col)
        continue

    red_hit = red_team_pattern.match(col)
    if red_hit:
        red_team_columns.setdefault(red_hit.group(2), []).append(col)

def aggregate_team_columns(data, team_columns, team_prefix):
    """Collapse per-player columns into one summed column per feature.

    For every ``feature -> [columns]`` entry in ``team_columns`` a new column
    named ``f'{team_prefix}_{feature}'`` holds the row-wise sum of the listed
    columns, and the listed columns themselves are removed.

    Unlike an in-place implementation, the frame passed in is left untouched;
    a new DataFrame is returned, so re-running a calling cell cannot corrupt
    the input.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``team_columns``.
    team_columns : dict[str, list[str]]
        Mapping of feature name to the columns that should be summed.
    team_prefix : str
        Prefix for the generated column names (e.g. ``'blue'``).

    Returns
    -------
    pandas.DataFrame
        The untouched columns in their original order, followed by the
        aggregated columns in ``team_columns`` order.

    Raises
    ------
    KeyError
        If a listed column is not present in ``data``.
    """
    if not team_columns:
        return data.copy()

    # Build all totals first and attach them with a single concat; inserting
    # columns one at a time fragments the frame when there are many features.
    totals = pd.concat(
        {
            f'{team_prefix}_{feature}': data[columns].sum(axis=1)
            for feature, columns in team_columns.items()
        },
        axis=1,
    )

    # Drop the summed source columns, plus any pre-existing column that shares
    # a name with a new total so the result never has duplicate labels.
    stale = {col for columns in team_columns.values() for col in columns}
    stale.update(totals.columns)
    kept = data.loc[:, [col not in stale for col in data.columns]]

    return pd.concat([kept, totals], axis=1)

# Aggregate the blue columns first, then the red ones, on the resulting frame.
data = aggregate_team_columns(
    aggregate_team_columns(data, blue_team_columns, 'blue'),
    red_team_columns,
    'red',
)

In [None]:
# Per-column count of missing values.
missing_values = data.isnull().sum()

print("Columns with missing values:")
print(missing_values[missing_values > 0])

# Count the number of rows with at least one missing value
missing_rows = data.isnull().any(axis=1).sum()
total_rows = data.shape[0]

print(f"\nTotal number of rows: {total_rows}")
print(f"Number of rows with missing values: {missing_rows}")

# Every row containing a missing value is dropped; no size threshold is
# applied, so check the counts printed above before trusting the result.
if missing_rows > 0:
    data_cleaned = data.dropna()
    print(f"\nNumber of rows after dropping missing values: {data_cleaned.shape[0]}")
else:
    data_cleaned = data

# Identify and remove constant columns. The target and the identifier/time
# columns in `others` are never removed: they are dropped by name right below,
# and removing them here would turn that into a KeyError.
protected_columns = {'first_win', *others}
constant_columns = [
    col
    for col in data_cleaned.columns
    if col not in protected_columns and data_cleaned[col].nunique() <= 1
]
print(f"\nConstant columns to be removed: {constant_columns}")

data_cleaned = data_cleaned.drop(columns=constant_columns)

# Feature matrix / target over all remaining rows (every timestamp).
X = data_cleaned.drop(columns=['first_win'] + others)
y = data_cleaned['first_win']

In [None]:
def _rows_closest_to(frame, fraction):
    """Return, per matchId, the row whose percentTimeStamp is nearest `fraction`.

    Ties go to the first such row within the match (idxmin semantics).
    percentTimeStamp is presumably the elapsed share of the game in [0, 1] —
    TODO confirm against the data export.
    """
    distance = (frame['percentTimeStamp'] - fraction).abs()
    return frame.loc[distance.groupby(frame['matchId']).idxmin()]


# All snapshots are taken from data_cleaned so they share the same rows and
# columns (no missing values, constant columns removed). The starting snapshot
# previously came from the uncleaned `data` frame.
timestamps_starting = data_cleaned[data_cleaned['timeStamp'] == 0]
timestamps_30p = _rows_closest_to(data_cleaned, 0.3)
timestamps_60p = _rows_closest_to(data_cleaned, 0.6)
timestamps_80p = _rows_closest_to(data_cleaned, 0.8)
timestamps_100p = _rows_closest_to(data_cleaned, 1.0)

In [None]:
# Columns that are never model inputs: the target plus the columns in `others`.
non_feature_columns = ['first_win'] + others

# One (features, target) pair per snapshot.
X_0, y_0 = timestamps_starting.drop(columns=non_feature_columns), timestamps_starting['first_win']
X_30, y_30 = timestamps_30p.drop(columns=non_feature_columns), timestamps_30p['first_win']
X_60, y_60 = timestamps_60p.drop(columns=non_feature_columns), timestamps_60p['first_win']
X_80, y_80 = timestamps_80p.drop(columns=non_feature_columns), timestamps_80p['first_win']
X_100, y_100 = timestamps_100p.drop(columns=non_feature_columns), timestamps_100p['first_win']

# (X, y, label) triples in increasing order of game progress; the label is
# the text printed by the evaluation cells.
timestamps = [
    (X_0, y_0, '0'),
    (X_30, y_30, '30'),
    (X_60, y_60, '60'),
    (X_80, y_80, '80'),
    (X_100, y_100, '100'),
]

In [None]:
# Correlation of every feature column with the target, largest first.
correlations = X.corrwith(y).sort_values(ascending=False)

# Show both ends of the ranking: strongest positive, then strongest negative.
for heading, ranked_slice in (
    ("\nTop 10 features most positively correlated with 'first_win':", correlations.head(10)),
    ("\nTop 10 features most negatively correlated with 'first_win':", correlations.tail(10)),
):
    print(heading)
    print(ranked_slice)


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate_model(model, X, y):
    """Fit `model` on a 70/30 hold-out split of (X, y) and report test accuracy.

    The split uses ``test_size=0.3`` and a fixed ``random_state=42``. The
    model instance passed in is fitted in place. The class name and the
    accuracy (4 decimals) are printed, and the accuracy is also returned so
    callers can collect and compare results instead of reading the output.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X : array-like
        Feature rows.
    y : array-like
        Target values aligned with ``X``.

    Returns
    -------
    float
        Accuracy on the held-out part, as computed by ``accuracy_score``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Model: {model.__class__.__name__}")
    print(f"Accuracy: {accuracy:.4f}")

    return accuracy

# Every estimator that accepts a seed gets the same one.
shared_seed = 42

# Candidate classifiers compared on each snapshot, in evaluation order.
models = [
    LogisticRegression(n_jobs=-1, random_state=shared_seed),
    DecisionTreeClassifier(random_state=shared_seed),
    RandomForestClassifier(n_jobs=-1, random_state=shared_seed),
    HistGradientBoostingClassifier(random_state=shared_seed),
]

In [None]:
from sklearn.model_selection import cross_val_score

def cross_val_model(model, X, y):
    """Print 5-fold cross-validated accuracy for `model` on (X, y).

    Three lines are written: the model's class name, the per-fold accuracy
    scores, and their mean formatted to 4 decimals followed by a blank line.
    Nothing is returned.
    """
    fold_accuracies = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='accuracy')

    report = (
        f"Model: {model.__class__.__name__}",
        f"Cross-validation scores (Accuracy): {fold_accuracies}",
        f"Mean cross-validation score (Accuracy): {fold_accuracies.mean():.4f}\n",
    )
    for line in report:
        print(line)

In [None]:
def test_timestamp(timestamp):
    X, y, name = timestamp
    print('timestamp at {} percent'.format(name))
    for model in models:
        evaluate_model(model, X, y)
    
    # uncomment to also test with cross_validation - takes more time and results are similar so I left it commented out for now
    # for model in models:
    #     cross_val_model(model, X, y)

In [None]:
# Evaluate all models on the first snapshot: the rows with timeStamp == 0 (label '0').
test_timestamp(timestamps[0])

In [None]:
# Evaluate all models on the per-match rows whose percentTimeStamp is closest to 0.3 (label '30').
test_timestamp(timestamps[1])

In [None]:
# Evaluate all models on the per-match rows whose percentTimeStamp is closest to 0.6 (label '60').
test_timestamp(timestamps[2])

In [None]:
# Evaluate all models on the per-match rows whose percentTimeStamp is closest to 0.8 (label '80').
test_timestamp(timestamps[3])

In [None]:
# Evaluate all models on the per-match rows whose percentTimeStamp is closest to 1.0 (label '100').
test_timestamp(timestamps[4])