In [1]:
# Подготовка
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from statsmodels.graphics.mosaicplot import mosaic

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report

import zipfile


__z = zipfile.ZipFile("dataset.zip")
#df_str = __z.open("csgo_round_snapshots.csv").read().decode("utf-8")
df_str_fs = __z.open("csgo_round_snapshots.csv")

df_src = pd.read_csv(df_str_fs)

category_columns = ["map", "bomb_planted", "round_winner"]

# Предварительная обработка данных

Данные [отсюда](https://www.kaggle.com/datasets/christianlillelund/csgo-round-winner-classification):

![](./imgs/table.png)

In [None]:
# Look dataframe
#df_src.head()
pd.set_option('display.max_columns', 97)
df_src.tail()

# Убираем дубликаты

In [2]:
print(f"Number of (rows, columns): {df_src.shape}")
duplicate_rows_df = df_src[df_src.duplicated()]
print(f"Number of duplicate (rows, columns): {duplicate_rows_df.shape}")
df_src = df_src.drop_duplicates()
print(f"Number of (rows, columns) after drop dublicates: {df_src.shape}")

Number of (rows, columns): (122410, 97)
Number of duplicate (rows, columns): (4962, 97)
Number of (rows, columns) after drop dublicates: (117448, 97)


# Смотрим, чтобы не было нерелевантных данных

## Как поступить с такими данными?

![](./imgs/if_missing_data.png)

In [None]:
pd.set_option('display.max_rows', 97)
df_src.count() # Кол-во не None значений в каждой колонке

In [None]:
pd.set_option('display.max_rows', 97)
print(df_src.isnull().sum()) # Смотрим есть ли null хотя бы в каком-нибудь столбце
# null'ов нет, поэтому "df_src = df_src.dropna()" не нужно

# Делаем кодирование строк + float->int

In [3]:
df_float_cat = df_src.copy()

label_columns = category_columns
for label_column_i in label_columns:
    df_float_cat[label_column_i] = df_float_cat[label_column_i].astype('category')

pd.set_option('display.max_rows', 97)
df_float_cat.dtypes                            # Смотрим какие вообще есть типы

time_left                        float64
ct_score                         float64
t_score                          float64
map                             category
bomb_planted                    category
ct_health                        float64
t_health                         float64
ct_armor                         float64
t_armor                          float64
ct_money                         float64
t_money                          float64
ct_helmets                       float64
t_helmets                        float64
ct_defuse_kits                   float64
ct_players_alive                 float64
t_players_alive                  float64
ct_weapon_ak47                   float64
t_weapon_ak47                    float64
ct_weapon_aug                    float64
t_weapon_aug                     float64
ct_weapon_awp                    float64
t_weapon_awp                     float64
ct_weapon_bizon                  float64
t_weapon_bizon                   float64
ct_weapon_cz75au

In [4]:
df_ints = df_src.copy()

float64_cols = df_ints.loc[:, df_ints.dtypes == float].columns

df_ints[float64_cols] = df_ints[float64_cols].astype(int)

# Label Encoding
label_columns = category_columns
for label_column_i in label_columns:
    df_ints[label_column_i] = df_ints[label_column_i].astype('category')
    df_ints[label_column_i] = df_ints[label_column_i].cat.codes
    df_ints[label_column_i] = df_ints[label_column_i].astype(int)

pd.set_option('display.max_rows', 97)
df_ints.dtypes

time_left                       int64
ct_score                        int64
t_score                         int64
map                             int64
bomb_planted                    int64
ct_health                       int64
t_health                        int64
ct_armor                        int64
t_armor                         int64
ct_money                        int64
t_money                         int64
ct_helmets                      int64
t_helmets                       int64
ct_defuse_kits                  int64
ct_players_alive                int64
t_players_alive                 int64
ct_weapon_ak47                  int64
t_weapon_ak47                   int64
ct_weapon_aug                   int64
t_weapon_aug                    int64
ct_weapon_awp                   int64
t_weapon_awp                    int64
ct_weapon_bizon                 int64
t_weapon_bizon                  int64
ct_weapon_cz75auto              int64
t_weapon_cz75auto               int64
ct_weapon_el

# Графики

![](./imgs/which_visualization.png)

В нашем случае есть смысл только в мозаичные диаграммы и диаграммы размаха

In [None]:
# Тепловая карта


plt.figure(figsize=(100,50))
c= df_ints.corr()
sns.heatmap(c,cmap="BrBG",annot=True)
c

In [None]:
# Диаграммы размаха

float64_cols = df_float_cat.loc[:, df_float_cat.dtypes == "float64"].columns

for float_i in float64_cols:
    df_float_cat.boxplot(by ='round_winner', column =[float_i], grid = True)

In [None]:
# Мозаичные диаграммы

plt.rcParams["figure.figsize"]=(10, 5)
category_cols = df_float_cat.loc[:, df_float_cat.dtypes == "category"].columns

for cat_i in category_cols:
    if(cat_i != "round_winner"):
        null_out = mosaic(df_float_cat, [cat_i, 'round_winner'])


In [None]:
# Гистограммы

plt.figure(figsize=(100,50))

cols = df_ints.columns
for col_i in cols:
    df_ints.hist(column=col_i)
#df_float_cat.hist()

# Проектирование признаков

In [5]:
#df_src.columns

# https://stackoverflow.com/questions/25748683/pandas-sum-dataframe-rows-for-given-columns

weapons_auto_guns_ct = ["ct_weapon_ak47", "ct_weapon_aug", "ct_weapon_famas", "ct_weapon_galilar", 
                        "ct_weapon_m249", "ct_weapon_m4a1s", "ct_weapon_m4a4", 
                        "ct_weapon_negev", "ct_weapon_sg553"]
weapons_rifle_ct = ["ct_weapon_awp", "ct_weapon_g3sg1", 
                    "ct_weapon_scar20", "ct_weapon_ssg08"]
weapons_sub_guns_ct = ["ct_weapon_bizon", "ct_weapon_mac10",
                      "ct_weapon_mp5sd", "ct_weapon_mp7",
                      "ct_weapon_mp9", "ct_weapon_p90", "ct_weapon_ump45"]
weapons_pistols_ct = ["ct_weapon_cz75auto", "ct_weapon_elite", 
                      "ct_weapon_glock", "ct_weapon_r8revolver",
                     "ct_weapon_fiveseven", "ct_weapon_deagle", 
                     "ct_weapon_usps", "ct_weapon_p250", 
                      "ct_weapon_p2000", "ct_weapon_tec9"]
weapons_shutguns_ct = ["ct_weapon_mag7", "ct_weapon_nova", 
                       "ct_weapon_sawedoff", "ct_weapon_xm1014"]
weapons_grenades_ct = ["ct_grenade_hegrenade", "ct_grenade_flashbang", "ct_grenade_smokegrenade", 
                       "ct_grenade_incendiarygrenade", "ct_grenade_molotovgrenade", 
                       "ct_grenade_decoygrenade"]

weapons_auto_guns_t = ["t_weapon_ak47", "t_weapon_aug", "t_weapon_famas", "t_weapon_galilar", 
                        "t_weapon_m249", "t_weapon_m4a1s", "t_weapon_m4a4", 
                        "t_weapon_negev", "t_weapon_sg553"]
weapons_rifle_t = ["t_weapon_awp", "t_weapon_g3sg1", 
                    "t_weapon_scar20", "t_weapon_ssg08"]
weapons_sub_guns_t = ["t_weapon_bizon", "t_weapon_mac10",
                      "t_weapon_mp5sd", "t_weapon_mp7",
                      "t_weapon_mp9", "t_weapon_p90", "t_weapon_ump45"]
weapons_pistols_t = ["t_weapon_cz75auto", "t_weapon_elite", 
                      "t_weapon_glock", "t_weapon_r8revolver",
                     "t_weapon_fiveseven", "t_weapon_deagle", 
                     "t_weapon_usps", "t_weapon_p250", 
                      "t_weapon_p2000", "t_weapon_tec9"]
weapons_shutguns_t = ["t_weapon_mag7", "t_weapon_nova", 
                       "t_weapon_sawedoff", "t_weapon_xm1014"]
weapons_grenades_t = ["t_grenade_hegrenade", "t_grenade_flashbang", "t_grenade_smokegrenade", 
                       "t_grenade_incendiarygrenade", "t_grenade_molotovgrenade", 
                       "t_grenade_decoygrenade"]


df_float_cat_round = df_float_cat.copy()

df_float_cat_round["weapons_auto_guns_ct"] = df_float_cat_round[weapons_auto_guns_ct].sum(axis=1)
df_float_cat_round["weapons_rifle_ct"] = df_float_cat_round[weapons_rifle_ct].sum(axis=1)
df_float_cat_round["weapons_sub_guns_ct"] = df_float_cat_round[weapons_sub_guns_ct].sum(axis=1)
df_float_cat_round["weapons_pistols_ct"] = df_float_cat_round[weapons_pistols_ct].sum(axis=1)
df_float_cat_round["weapons_shutguns_ct"] = df_float_cat_round[weapons_shutguns_ct].sum(axis=1)
df_float_cat_round["weapons_grenades_ct"] = df_float_cat_round[weapons_grenades_ct].sum(axis=1)

df_float_cat_round["weapons_auto_guns_t"] = df_float_cat_round[weapons_auto_guns_t].sum(axis=1)
df_float_cat_round["weapons_rifle_t"] = df_float_cat_round[weapons_rifle_t].sum(axis=1)
df_float_cat_round["weapons_sub_guns_t"] = df_float_cat_round[weapons_sub_guns_t].sum(axis=1)
df_float_cat_round["weapons_pistols_t"] = df_float_cat_round[weapons_pistols_t].sum(axis=1)
df_float_cat_round["weapons_shutguns_t"] = df_float_cat_round[weapons_shutguns_t].sum(axis=1)
df_float_cat_round["weapons_grenades_t"] = df_float_cat_round[weapons_grenades_t].sum(axis=1)


df_float_cat_round = df_float_cat_round.drop(weapons_auto_guns_ct+weapons_rifle_ct+weapons_sub_guns_ct+weapons_pistols_ct+weapons_shutguns_ct+weapons_grenades_ct+
                        weapons_auto_guns_t+weapons_rifle_t+weapons_sub_guns_t+weapons_pistols_t+weapons_shutguns_t+weapons_grenades_t
                       , axis=1)

l = list(df_float_cat_round.columns)
l.remove("round_winner")
l += ["round_winner"]

df_float_cat_round = df_float_cat_round[l]

pd.set_option('display.max_columns', 97)
df_float_cat_round.head(100)



Unnamed: 0,time_left,ct_score,t_score,map,bomb_planted,ct_health,t_health,ct_armor,t_armor,ct_money,t_money,ct_helmets,t_helmets,ct_defuse_kits,ct_players_alive,t_players_alive,weapons_auto_guns_ct,weapons_rifle_ct,weapons_sub_guns_ct,weapons_pistols_ct,weapons_shutguns_ct,weapons_grenades_ct,weapons_auto_guns_t,weapons_rifle_t,weapons_sub_guns_t,weapons_pistols_t,weapons_shutguns_t,weapons_grenades_t,round_winner
0,175.00,0.0,0.0,de_dust2,False,500.0,500.0,0.0,0.0,4000.0,4000.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,CT
1,156.03,0.0,0.0,de_dust2,False,500.0,500.0,400.0,300.0,600.0,650.0,0.0,0.0,1.0,5.0,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,2.0,CT
2,96.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,500.0,0.0,0.0,1.0,4.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,CT
3,76.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,500.0,0.0,0.0,1.0,4.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,CT
4,174.97,1.0,0.0,de_dust2,False,500.0,500.0,192.0,0.0,18350.0,10750.0,0.0,0.0,1.0,5.0,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,CT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,54.97,4.0,14.0,de_dust2,False,100.0,300.0,100.0,379.0,0.0,15650.0,0.0,4.0,1.0,1.0,4.0,1.0,0.0,0.0,1.0,0.0,1.0,3.0,1.0,0.0,4.0,0.0,5.0,T
96,24.94,4.0,14.0,de_dust2,True,28.0,147.0,93.0,195.0,600.0,6850.0,0.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,4.0,T
97,4.94,4.0,14.0,de_dust2,True,28.0,147.0,93.0,195.0,600.0,6850.0,0.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,4.0,T
98,174.96,4.0,15.0,de_dust2,False,500.0,500.0,91.0,195.0,18000.0,36100.0,0.0,2.0,1.0,5.0,5.0,1.0,0.0,0.0,5.0,0.0,0.0,1.0,1.0,0.0,5.0,0.0,4.0,T


In [None]:
# Тепловая карта

df_ints_round = df_float_cat_round.copy()

float64_cols = df_ints_round.loc[:, df_ints_round.dtypes == float].columns

df_ints_round[float64_cols] = df_ints_round[float64_cols].astype(int)

# Label Encoding
label_columns = category_columns
for label_column_i in label_columns:
    df_ints_round[label_column_i] = df_ints_round[label_column_i].astype('category')
    df_ints_round[label_column_i] = df_ints_round[label_column_i].cat.codes
    df_ints_round[label_column_i] = df_ints_round[label_column_i].astype(int)


plt.figure(figsize=(100,50))
c= df_ints_round.corr()
sns.heatmap(c,cmap="BrBG",annot=True)
c

# Обучение и прогноз

In [8]:

# dataframe is df_src or
# dataframe is df_float_cat or
# dataframe is df_float_cat_round or
# dataframe is df_ints or
# dataframe is df_src df_ints_round
def do_fit_and_predict(model, dataframe) -> "(predicts, class_report)":
    df = dataframe.copy()

    X = df.copy()
    Y = X.pop('round_winner')

    #X, y = SMOTE().fit_resample(X, y) # Не нужен. И так поровну в "round_winner"

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=5051)

    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    #model = LogisticRegression()
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    
    class_report = classification_report(Y_test, Y_pred)
    return (Y_pred, class_report)

## Логистическая регрессия

In [9]:
preds, class_report = do_fit_and_predict(LogisticRegression(), df_ints)

print(class_report)

              precision    recall  f1-score   support

           0       0.75      0.77      0.76     18854
           1       0.77      0.75      0.76     19904

    accuracy                           0.76     38758
   macro avg       0.76      0.76      0.76     38758
weighted avg       0.76      0.76      0.76     38758



## K-ближайших соседей

In [11]:
preds, class_report = do_fit_and_predict(KNeighborsClassifier(), df_ints)

print(class_report)

              precision    recall  f1-score   support

           0       0.81      0.82      0.81     18854
           1       0.82      0.81      0.82     19904

    accuracy                           0.82     38758
   macro avg       0.82      0.82      0.82     38758
weighted avg       0.82      0.82      0.82     38758



## Машина опорных векторов (SVM)

In [16]:
preds, class_report = do_fit_and_predict(SVC(), df_ints)

print(class_report) # оооооочень долго

              precision    recall  f1-score   support

           0       0.76      0.84      0.80     18854
           1       0.83      0.75      0.79     19904

    accuracy                           0.79     38758
   macro avg       0.80      0.79      0.79     38758
weighted avg       0.80      0.79      0.79     38758



## Наивный байесовский классификатор

In [26]:
# shitty
## https://proglib.io/p/izuchaem-naivnyy-bayesovskiy-algoritm-klassifikacii-dlya-mashinnogo-obucheniya-2021-11-12

from scipy.stats import norm

def np_to_pd(np_df) -> "pd_df":
    new_df = pd.DataFrame(StandardScaler().fit_transform(np_df), columns=np_df.columns, index=np_df.index)
    return new_df

df = df_ints.copy()
X = df.copy()
Y = X.pop('round_winner')

#X, y = SMOTE().fit_resample(X, y) # Не нужен. И так поровну в "round_winner"

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=5051)

#X_train, X_test, Y_train, Y_test = np_to_pd(X_train), np_to_pd(X_test), np_to_pd(Y_train), np_to_pd(Y_test)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

means = X_train.groupby(Y_train).apply(np.mean)
stds  = X_train.groupby(Y_train).apply(np.std)

probs = X_train.groupby(Y_train).apply(lambda x: len(x)) / X_train.shape[0]

Y_pred = []

for elem in range(X_test.shape[0]):
    p = {}
    
    for cl in np.unique(y_train):
        p[cl] = probs.iloc[cl]

        for index, param in enumerate(X_test.iloc[elem]):
            p[cl] *= norm.pdf(param, means.iloc[cl, index], stds.iloc[cl, index])

    Y_pred.append(pd.Series(p).values.argmax())


    
class_report = classification_report(Y_test, Y_pred)
return (Y_pred, class_report)




<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


AttributeError: 'numpy.ndarray' object has no attribute 'groupby'

In [31]:
from collections import defaultdict

# https://shwanoff.ru/%D0%BD%D0%B0%D0%B8%D0%B2%D0%BD%D1%8B%D0%B9-%D0%B1%D0%B0%D0%B9%D0%B5%D1%81%D0%BE%D0%B2%D1%81%D0%BA%D0%B8%D0%B9-%D0%BA%D0%BB%D0%B0%D1%81%D1%81%D0%B8%D1%84%D0%B8%D0%BA%D0%B0%D1%82%D0%BE%D1%80-%D0%BD/
class NaiveBayesClassifier(object):

    def __init__(self):  
        self.__class_freq = defaultdict(lambda:0)
        self.__feat_freq = defaultdict(lambda:0)


    def fit(self, X, y):
        # calculate classes and features frequencies
        for feature, label in zip(X, y):
            self.__class_freq[label] += 1
            for value in feature:
                self.__feat_freq[(value, label)] += 1

        # normalizate values
        num_samples = len(X)
        for k in self.__class_freq:
            self.__class_freq[k] /= num_samples

        for value, label in self.__feat_freq:
            self.__feat_freq[(value, label)] /= self.__class_freq[label]

        return self

    def predict(self, X):
        # return argmin of classes 
        return min(self.__class_freq.keys(), 
                   key=lambda c : self.__calculate_class_freq(X, c)) 

    def __calculate_class_freq(self, X, clss):
        # calculate frequence for current class
        freq = - np.log(self.__class_freq[clss])

        for feat in X: 
            freq += - np.log(self.__feat_freq.get((feat, clss), 10 ** (-7)))
        return freq
    
df = df_ints.copy()
X = df.copy()
Y = X.pop('round_winner')

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=5051)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

model = NaiveBayesClassifier()

model.fit(X_train, Y_train)

Y_pred = [model.predict(x) for x in X_test]

class_report = classification_report(Y_test, Y_pred)

print(class_report)


              precision    recall  f1-score   support

           0       0.72      0.76      0.74     18854
           1       0.76      0.71      0.74     19904

    accuracy                           0.74     38758
   macro avg       0.74      0.74      0.74     38758
weighted avg       0.74      0.74      0.74     38758

