In [22]:
#importing important libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression, RFE
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils import resample
from itertools import combinations

In [23]:
#Creating/reading in external CSV files
players_df = pd.read_csv('grid_collector/data/players.csv')
display(players_df.head())

players_stats_df = pd.read_csv('grid_collector/data/player_stats_20241107_1832.csv')
display(players_stats_df.tail())

Unnamed: 0,id,nickname,title,team_id,team_name,private
0,18932,AG Nuke,Counter Strike 2,52314.0,CS2-1,False
1,18933,BadjoSP,Counter Strike 2,52314.0,CS2-1,False
2,18934,wunder,Counter Strike 2,52314.0,CS2-1,False
3,18935,Bulletito,Counter Strike 2,52315.0,CS2-2,False
4,18936,AGmurdercore,Counter Strike 2,52315.0,CS2-2,False


Unnamed: 0,player_id,series_count,game_count,total_kills,avg_kills,max_kills,min_kills,total_deaths,avg_deaths,max_deaths,min_deaths,kd_ratio,kills_per_game,deaths_per_game,id,nickname,team_name
478,119496,1,2,20,20.0,20,20,21,21.0,21,21,0.952381,10.0,10.5,119496,dobbo,Verdant
479,119696,1,2,31,31.0,31,31,33,33.0,33,33,0.939394,15.5,16.5,119696,MAT1-_-,kubixPulse
480,120255,1,2,23,23.0,23,23,19,19.0,19,19,1.210526,11.5,9.5,120255,redi,Hype E-sports
481,120298,1,2,14,14.0,14,14,30,30.0,30,30,0.466667,7.0,15.0,120298,★ ⑲ iogazeraXD,JOGA DE TERNO
482,120300,1,2,13,13.0,13,13,28,28.0,28,28,0.464286,6.5,14.0,120300,★ ⑳ lulu,JOGA DE TERNO


In [24]:
#Display unique data
display(players_stats_df.nunique)

<bound method DataFrame.nunique of      player_id  series_count  game_count  total_kills  avg_kills  max_kills  \
0        18932             0           0            0        0.0          0   
1        18933             0           0            0        0.0          0   
2        18934             0           0            0        0.0          0   
3        18935             0           0            0        0.0          0   
4        18936             0           0            0        0.0          0   
..         ...           ...         ...          ...        ...        ...   
478     119496             1           2           20       20.0         20   
479     119696             1           2           31       31.0         31   
480     120255             1           2           23       23.0         23   
481     120298             1           2           14       14.0         14   
482     120300             1           2           13       13.0         13   

     min_kills  

In [25]:
#Compare data-types of the dataframes
display(players_df.dtypes)
display(players_stats_df.dtypes)
#players_stats_df holds content that players_df contains

id             int64
nickname      object
title         object
team_id      float64
team_name     object
private         bool
dtype: object

player_id            int64
series_count         int64
game_count           int64
total_kills          int64
avg_kills          float64
max_kills            int64
min_kills            int64
total_deaths         int64
avg_deaths         float64
max_deaths           int64
min_deaths           int64
kd_ratio           float64
kills_per_game     float64
deaths_per_game    float64
id                   int64
nickname            object
team_name           object
dtype: object

In [26]:
#Drop duplicate/non-necessary columns
players_stats_df = players_stats_df.drop(columns=['id', 'team_name'])
#Set index as 'nickname'
players_stats_df = players_stats_df.set_index('nickname')
players_stats_df.head()

Unnamed: 0_level_0,player_id,series_count,game_count,total_kills,avg_kills,max_kills,min_kills,total_deaths,avg_deaths,max_deaths,min_deaths,kd_ratio,kills_per_game,deaths_per_game
nickname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AG Nuke,18932,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0.0,0.0
BadjoSP,18933,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0.0,0.0
wunder,18934,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0.0,0.0
Bulletito,18935,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0.0,0.0
AGmurdercore,18936,0,0,0,0.0,0,0,0,0.0,0,0,0.0,0.0,0.0


In [27]:
#Displlay dataframe data-types
players_stats_df.dtypes

player_id            int64
series_count         int64
game_count           int64
total_kills          int64
avg_kills          float64
max_kills            int64
min_kills            int64
total_deaths         int64
avg_deaths         float64
max_deaths           int64
min_deaths           int64
kd_ratio           float64
kills_per_game     float64
deaths_per_game    float64
dtype: object

In [28]:
#A function that removes players with stats of '0' in all columns saved to 'stat_columns'
def remove_zero_stat_players(df, stat_columns):
    """
    Remove rows from a DataFrame where all specified stat columns have a value of 0.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing player stats
    stat_columns (list): List of column names to check for zeros. 
                        If None, uses all numeric columns except index
    
    Returns:
    pandas.DataFrame: DataFrame with zero-stat players removed
    """
    # If no stat columns specified, use all numeric columns
    if stat_columns is None:
        stat_columns = df.select_dtypes(include=['int64', 'float64']).columns
    
    # Create a boolean mask where True means the row has all zeros in stat columns
    zero_mask = df[stat_columns].eq(0).all(axis=1)
    
    # Return DataFrame with zero-stat players removed
    return df[~zero_mask]

In [29]:
#Columns for the function to cycle through
stat_columns = [
    'total_kills', 'avg_kills', 'max_kills', 'min_kills',
    'total_deaths', 'avg_deaths', 'max_deaths', 'min_deaths',
    'kd_ratio', 'kills_per_game', 'deaths_per_game'
]

In [31]:
#Creating a new DF for cleaned original dataframe
clean_player_stats = remove_zero_stat_players(players_stats_df, stat_columns)
#Display/Print number of removed players
print(f"Removed {len(players_stats_df) - len(clean_player_stats)} players with all zero stats")

Removed 46 players with all zero stats


In [32]:
#Display top 5 players
clean_player_stats.head()

Unnamed: 0_level_0,player_id,series_count,game_count,total_kills,avg_kills,max_kills,min_kills,total_deaths,avg_deaths,max_deaths,min_deaths,kd_ratio,kills_per_game,deaths_per_game
nickname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
marat2k 風速,19538,37,89,1274,34.432432,61,14,1276,34.486486,68,13,0.998433,14.314607,14.337079
2high,19549,22,53,750,34.090909,53,18,784,35.636364,47,18,0.956633,14.150943,14.792453
chawzyyy,43856,4,9,134,33.5,49,22,136,34.0,43,18,0.985294,14.888889,15.111111
arvid,43890,18,42,626,34.777778,63,20,633,35.166667,54,24,0.988942,14.904762,15.071429
ykis,43891,5,10,165,33.0,48,19,160,32.0,42,18,1.03125,16.5,16.0


In [33]:
#Show Unique Data/Numbers
clean_player_stats.nunique

<bound method DataFrame.nunique of                 player_id  series_count  game_count  total_kills  avg_kills  \
nickname                                                                      
marat2k 風速          19538            37          89         1274  34.432432   
2high               19549            22          53          750  34.090909   
chawzyyy            43856             4           9          134  33.500000   
arvid               43890            18          42          626  34.777778   
ykis                43891             5          10          165  33.000000   
...                   ...           ...         ...          ...        ...   
dobbo              119496             1           2           20  20.000000   
MAT1-_-            119696             1           2           31  31.000000   
redi               120255             1           2           23  23.000000   
★ ⑲ iogazeraXD     120298             1           2           14  14.000000   
★ ⑳ lulu         

In [34]:
#Save cleaned data as a new CSV
clean_player_stats.to_csv('grid_collector/data/clean_player_stats.csv')