In [23]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import pickle
import joblib
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split


In [2]:
# Shared encoder for the "Класс услуги" column: it is fitted when the data is
# loaded and later written to disk with joblib.
label_encoder = LabelEncoder()

In [3]:
# Read the service-code table from the Excel sheet.
df = pd.read_excel("../data/hardcoded/Коды услуг.xlsx")

# One cluster per distinct service class. The count is taken on the raw values,
# before they are replaced by integer codes (dropna=False keeps a missing value
# counted as its own class, exactly like len(unique())).
num_clusters = df["Класс услуги"].nunique(dropna=False)

# Replace the class values with integer codes; this also fits label_encoder.
df["Класс услуги"] = label_encoder.fit_transform(df["Класс услуги"])

In [18]:
# Cluster the services into `num_clusters` groups (one per distinct service class).
#
# The fit input explicitly excludes the "Кластер" column. This cell writes that
# column back into `df`, so fitting on the whole frame would make a second run
# of the cell cluster on the previous run's labels. On a fresh kernel the column
# does not exist yet and errors="ignore" makes the drop a no-op, so the first
# run is unchanged.
kmeans = KMeans(n_clusters=num_clusters, random_state=0, init='k-means++').fit(
    df.drop(columns=["Кластер"], errors="ignore")
)

# Store the cluster index assigned to each row.
df["Кластер"] = kmeans.labels_

In [29]:
# Print the frame summary (columns, non-null counts, dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431 entries, 0 to 430
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   ID услуги     431 non-null    int64
 1   Класс услуги  431 non-null    int32
 2   Кластер       431 non-null    int32
dtypes: int32(2), int64(1)
memory usage: 6.9 KB


In [19]:

# 3-D scatter of the clustering result: service ID on x, encoded service class
# on y, assigned cluster on z. Markers are coloured by cluster as well.
fig = go.Figure(data=[
    go.Scatter3d(
        x=df["ID услуги"],
        y=df["Класс услуги"],
        z=df["Кластер"],
        mode='markers',
        marker=dict(
            color=df["Кластер"],
            size=5,
            colorbar=dict(title='Cluster')
        )
    )
])

# Label the figure so it can be read on its own; without this the axes are
# shown with the generic names x / y / z.
fig.update_layout(
    title="KMeans clusters of service codes",
    scene=dict(
        xaxis_title="ID услуги",
        yaxis_title="Класс услуги",
        zaxis_title="Кластер",
    ),
)

fig.show()

In [22]:
# Persist the fitted artefacts next to the notebook (paths are relative to the
# current working directory).
# The KMeans model is written with pickle.
with open("service_class_cluster_model.pkl", "wb") as f:
    pickle.dump(kmeans, f)
# The label encoder is written with joblib. This call is the last expression of
# the cell, so its return value (the list of written file names) is displayed.
joblib.dump(label_encoder, 'label_encoder.joblib')

['label_encoder.joblib']

In [33]:
# Features are every column except the cluster label; the label is the target.
train_data = df.drop(columns="Кластер")
train_labels = df["Кластер"]

# Show the structure of both before splitting.
train_data.info()
train_labels.info()

# NOTE(review): train_test_split returns (X_train, X_test, y_train, y_test), so
# the names below are swapped relative to their contents: x_test / y_test
# receive the larger training part, x_train / y_train the 33% hold-out part.
# The names are kept as they are because the cells that fit and score the model
# rely on exactly this assignment.
x_test, x_train, y_test, y_train = train_test_split(
    train_data,
    train_labels,
    test_size=0.33,
    random_state=42,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431 entries, 0 to 430
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   ID услуги     431 non-null    int64
 1   Класс услуги  431 non-null    int32
dtypes: int32(1), int64(1)
memory usage: 5.2 KB
<class 'pandas.core.series.Series'>
RangeIndex: 431 entries, 0 to 430
Series name: Кластер
Non-Null Count  Dtype
--------------  -----
431 non-null    int32
dtypes: int32(1)
memory usage: 1.8 KB


In [39]:
# Gradient-boosted classifier that learns to reproduce the cluster assignment
# from the feature columns.
model = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    verbose=100,  # report the training loss every 100 iterations
)

# NOTE(review): despite their names, x_test / y_test hold the larger partition
# of the split and serve as the training data here.
model.fit(X=x_test, y=y_test)

0:	learn: 2.2145938	total: 2.3ms	remaining: 2.29s
100:	learn: 0.1061237	total: 274ms	remaining: 2.44s
200:	learn: 0.0316115	total: 471ms	remaining: 1.87s
300:	learn: 0.0173281	total: 658ms	remaining: 1.53s
400:	learn: 0.0117825	total: 836ms	remaining: 1.25s
500:	learn: 0.0087745	total: 1.01s	remaining: 1s
600:	learn: 0.0070133	total: 1.18s	remaining: 785ms
700:	learn: 0.0058261	total: 1.36s	remaining: 582ms
800:	learn: 0.0049585	total: 1.55s	remaining: 385ms
900:	learn: 0.0043602	total: 1.74s	remaining: 191ms
999:	learn: 0.0038296	total: 1.92s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x218a4dbbad0>

In [48]:
# Evaluate model performance.
# NOTE(review): x_train / y_train are the hold-out partition of the split
# (143 rows), not the data the model was fitted on.

# Show the structure and contents of the evaluation features.
x_train.info()
print(x_train)

# Predicted cluster for every hold-out row.
xd = model.predict(x_train)

# Mean accuracy of the predictions against the true cluster labels.
accuracy = model.score(x_train, y_train)
print(f"Accuracy: {accuracy}")

<class 'pandas.core.frame.DataFrame'>
Index: 143 entries, 423 to 211
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   ID услуги     143 non-null    int64
 1   Класс услуги  143 non-null    int32
dtypes: int32(1), int64(1)
memory usage: 2.8 KB
     ID услуги  Класс услуги
423  800007501             1
75   800007549             6
296  800007417             4
30   800007313             1
362  800000599             0
..         ...           ...
353  800006282             1
153  800006760             1
181  800007318             1
109  800008831             1
211  800007535             6

[143 rows x 2 columns]
Accuracy: 0.986013986013986


In [41]:
# Write the trained CatBoost classifier to a .cbm file in the parent directory
# of the current working directory.
model.save_model("../service_class_cluster_model.cbm")