In [1]:
# Подготовка
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report

import zipfile


# Read the round-snapshot CSV straight out of the archive. Both handles are
# opened in `with` blocks so the archive and the member stream are closed as
# soon as pandas has consumed them. The names `__z` and `df_str_fs` stay bound
# after the blocks, but they refer to closed handles from then on.
with zipfile.ZipFile("dataset.zip") as __z:
    with __z.open("csgo_round_snapshots.csv") as df_str_fs:
        df_src = pd.read_csv(df_str_fs)

# Columns that the later cells convert to categoricals / integer codes.
category_columns = ["map", "bomb_planted", "round_winner"]

# Предварительная обработка данных

Данные [отсюда](https://www.kaggle.com/datasets/christianlillelund/csgo-round-winner-classification):

![](./imgs/table.png)

In [None]:
# Preview the last rows of the raw frame. The column cap is raised to 97 so
# that pandas shows the columns instead of eliding the middle ones.
pd.options.display.max_columns = 97
df_src.tail()

# Убираем дубликаты

In [2]:
# Report the frame size before, the number of repeated rows, and the size after.
print(f"Number of (rows, columns): {df_src.shape}")

# duplicated() flags each row that repeats an earlier one; the first
# occurrence of every row stays unflagged and is therefore kept below.
is_repeat = df_src.duplicated()
duplicate_rows_df = df_src[is_repeat]
print(f"Number of duplicate (rows, columns): {duplicate_rows_df.shape}")

df_src = df_src[~is_repeat]
print(f"Number of (rows, columns) after drop dublicates: {df_src.shape}")

Number of (rows, columns): (122410, 97)
Number of duplicate (rows, columns): (4962, 97)
Number of (rows, columns) after drop dublicates: (117448, 97)


# Проверяем, нет ли пропущенных значений

## Как поступить с такими данными?

![](./imgs/if_missing_data.png)

In [None]:
# Number of non-null values in every column; the row cap is raised to 97 so
# the per-column listing is not cut short.
pd.options.display.max_rows = 97
df_src.count()

In [None]:
pd.options.display.max_rows = 97
# Count the null cells per column to see whether any column has gaps.
print(df_src.isna().sum())
# There are no nulls, so a "df_src = df_src.dropna()" step is not needed.

# Кодируем категориальные столбцы и приводим float к int

In [3]:
# Variant of the data that leaves the numeric columns untouched and stores the
# label columns as pandas categoricals. astype() with a column -> dtype mapping
# returns a new frame, so df_src itself is not modified.
label_columns = category_columns
df_float_cat = df_src.astype({name: "category" for name in label_columns})

pd.options.display.max_rows = 97
df_float_cat.dtypes  # overview of the resulting column types

time_left                        float64
ct_score                         float64
t_score                          float64
map                             category
bomb_planted                    category
ct_health                        float64
t_health                         float64
ct_armor                         float64
t_armor                          float64
ct_money                         float64
t_money                          float64
ct_helmets                       float64
t_helmets                        float64
ct_defuse_kits                   float64
ct_players_alive                 float64
t_players_alive                  float64
ct_weapon_ak47                   float64
t_weapon_ak47                    float64
ct_weapon_aug                    float64
t_weapon_aug                     float64
ct_weapon_awp                    float64
t_weapon_awp                     float64
ct_weapon_bizon                  float64
t_weapon_bizon                   float64
ct_weapon_cz75au

In [4]:
# Integer-only variant of the data: float columns are cast to int and the
# label columns are replaced by their category codes.
df_ints = df_src.copy()

float64_cols = df_ints.select_dtypes(include="float64").columns
df_ints[float64_cols] = df_ints[float64_cols].astype(int)

# Label encoding: every value is replaced by the integer code pandas assigns
# to its category.
label_columns = category_columns
for label_column_i in label_columns:
    as_category = df_ints[label_column_i].astype("category")
    df_ints[label_column_i] = as_category.cat.codes.astype(int)

pd.options.display.max_rows = 97
df_ints.dtypes

time_left                       int64
ct_score                        int64
t_score                         int64
map                             int64
bomb_planted                    int64
ct_health                       int64
t_health                        int64
ct_armor                        int64
t_armor                         int64
ct_money                        int64
t_money                         int64
ct_helmets                      int64
t_helmets                       int64
ct_defuse_kits                  int64
ct_players_alive                int64
t_players_alive                 int64
ct_weapon_ak47                  int64
t_weapon_ak47                   int64
ct_weapon_aug                   int64
t_weapon_aug                    int64
ct_weapon_awp                   int64
t_weapon_awp                    int64
ct_weapon_bizon                 int64
t_weapon_bizon                  int64
ct_weapon_cz75auto              int64
t_weapon_cz75auto               int64
ct_weapon_el

# Графики

![](./imgs/which_visualization.png)

В нашем случае имеют смысл только мозаичные диаграммы и диаграммы размаха.

In [None]:
# Correlation heatmap

# Pairwise correlations of all integer-encoded columns, drawn with per-cell
# annotations on a 100x50 inch canvas.
fig, ax = plt.subplots(figsize=(100, 50))
c = df_ints.corr()
sns.heatmap(c, cmap="BrBG", annot=True, ax=ax)
c

In [None]:
#sns.boxplot(x=df_src['time_left'])

In [None]:
# Box plots

# One figure per float column, with the values grouped by round winner.
float64_cols = df_float_cat.select_dtypes(include="float64").columns

for float_i in float64_cols:
    df_float_cat.boxplot(column=[float_i], by="round_winner", grid=True)

In [None]:
# Mosaic plots

plt.rcParams["figure.figsize"] = (10, 5)
category_cols = df_float_cat.select_dtypes(include="category").columns

# Pair every categorical feature with the target column; the target itself
# is skipped.
for cat_i in category_cols:
    if cat_i == "round_winner":
        continue
    # The return value of mosaic() is stored but not used in this cell.
    null_out = mosaic(df_float_cat, [cat_i, 'round_winner'])


In [None]:
# Histograms

# One histogram figure per column of the integer-encoded frame.
# No figure is created up front: DataFrame.hist opens its own figure on each
# call when no axes are passed, so a figure made here would never be drawn on
# and would only show up as a huge empty canvas.
cols = df_ints.columns
for col_i in cols:
    df_ints.hist(column=col_i)

# Проектирование признаков

In [None]:
# Feature engineering: replace the per-weapon counters with one total per
# weapon class and team. Row-wise sum over selected columns as in
# https://stackoverflow.com/questions/25748683/pandas-sum-dataframe-rows-for-given-columns


def _ct_to_t(ct_columns):
    """Return the "t_" column names matching a list of "ct_"-prefixed names."""
    return ["t_" + column[len("ct_"):] for column in ct_columns]


# Counter-terrorist columns, grouped by weapon class.
weapons_auto_guns_ct = [
    "ct_weapon_ak47", "ct_weapon_aug", "ct_weapon_famas", "ct_weapon_galilar",
    "ct_weapon_m249", "ct_weapon_m4a1s", "ct_weapon_m4a4",
    "ct_weapon_negev", "ct_weapon_sg553",
]
weapons_rifle_ct = [
    "ct_weapon_awp", "ct_weapon_g3sg1",
    "ct_weapon_scar20", "ct_weapon_ssg08",
]
weapons_sub_guns_ct = [
    "ct_weapon_bizon", "ct_weapon_mac10",
    "ct_weapon_mp5sd", "ct_weapon_mp7",
    "ct_weapon_mp9", "ct_weapon_p90", "ct_weapon_ump45",
]
weapons_pistols_ct = [
    "ct_weapon_cz75auto", "ct_weapon_elite",
    "ct_weapon_glock", "ct_weapon_r8revolver",
    "ct_weapon_fiveseven", "ct_weapon_deagle",
    "ct_weapon_usps", "ct_weapon_p250",
    "ct_weapon_p2000", "ct_weapon_tec9",
]
weapons_shutguns_ct = [
    "ct_weapon_mag7", "ct_weapon_nova",
    "ct_weapon_sawedoff", "ct_weapon_xm1014",
]
weapons_grenades_ct = [
    "ct_grenade_hegrenade", "ct_grenade_flashbang", "ct_grenade_smokegrenade",
    "ct_grenade_incendiarygrenade", "ct_grenade_molotovgrenade",
    "ct_grenade_decoygrenade",
]

# Terrorist columns: the same items in the same order, with the "t_" prefix.
weapons_auto_guns_t = _ct_to_t(weapons_auto_guns_ct)
weapons_rifle_t = _ct_to_t(weapons_rifle_ct)
weapons_sub_guns_t = _ct_to_t(weapons_sub_guns_ct)
weapons_pistols_t = _ct_to_t(weapons_pistols_ct)
weapons_shutguns_t = _ct_to_t(weapons_shutguns_ct)
weapons_grenades_t = _ct_to_t(weapons_grenades_ct)

# Name of each total column -> the raw columns summed into it. The insertion
# order fixes the order in which the totals are appended to the frame.
weapon_groups = {
    "weapons_auto_guns_ct": weapons_auto_guns_ct,
    "weapons_rifle_ct": weapons_rifle_ct,
    "weapons_sub_guns_ct": weapons_sub_guns_ct,
    "weapons_pistols_ct": weapons_pistols_ct,
    "weapons_shutguns_ct": weapons_shutguns_ct,
    "weapons_grenades_ct": weapons_grenades_ct,
    "weapons_auto_guns_t": weapons_auto_guns_t,
    "weapons_rifle_t": weapons_rifle_t,
    "weapons_sub_guns_t": weapons_sub_guns_t,
    "weapons_pistols_t": weapons_pistols_t,
    "weapons_shutguns_t": weapons_shutguns_t,
    "weapons_grenades_t": weapons_grenades_t,
}

df_float_cat_round = df_float_cat.copy()

for total_column, member_columns in weapon_groups.items():
    df_float_cat_round[total_column] = df_float_cat_round[member_columns].sum(axis=1)

# Drop the individual counters once their totals exist.
raw_weapon_columns = [
    column
    for member_columns in weapon_groups.values()
    for column in member_columns
]
df_float_cat_round = df_float_cat_round.drop(columns=raw_weapon_columns)

# Reorder the columns so that the target comes last.
l = [column for column in df_float_cat_round.columns if column != "round_winner"]
l.append("round_winner")
df_float_cat_round = df_float_cat_round[l]

pd.options.display.max_columns = 97
df_float_cat_round.head(100)



In [None]:
# Correlation heatmap

# Integer-only variant of the aggregated frame: float columns are cast to int
# and the label columns are replaced by their category codes.
df_ints_round = df_float_cat_round.copy()

float64_cols = df_ints_round.select_dtypes(include="float64").columns
df_ints_round[float64_cols] = df_ints_round[float64_cols].astype(int)

# Label encoding
label_columns = category_columns
for label_column_i in label_columns:
    as_category = df_ints_round[label_column_i].astype("category")
    df_ints_round[label_column_i] = as_category.cat.codes.astype(int)

# Pairwise correlations of the encoded columns, drawn with per-cell annotations.
fig, ax = plt.subplots(figsize=(100, 50))
c = df_ints_round.corr()
sns.heatmap(c, cmap="BrBG", annot=True, ax=ax)
c

# Обучение и прогноз

## Логистическая регрессия

In [10]:
# Logistic regression on the integer-encoded frame.
df = df_ints.copy()

# The target is "round_winner"; every other column is a feature.
Y = df["round_winner"]
X = df.drop(columns="round_winner")

# Resampling with SMOTE is skipped: the two "round_winner" classes are already
# about equally frequent.

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=5051
)

# The scaler is fitted on the training part only; the test part is transformed
# with the same fitted scaler.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

model = LogisticRegression().fit(X_train, Y_train)
Y_pred = model.predict(X_test)

print(classification_report(Y_test, Y_pred))


              precision    recall  f1-score   support

           0       0.75      0.77      0.76     18854
           1       0.77      0.75      0.76     19904

    accuracy                           0.76     38758
   macro avg       0.76      0.76      0.76     38758
weighted avg       0.76      0.76      0.76     38758

