In [2]:
import pathlib
import random
import pandas as pd
import numpy as np
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict

from sklearn.metrics import (
    f1_score, 
    accuracy_score,
    classification_report, 
)

# Notebook-wide configuration.
# RANDOM_SEED is the seed handed to randomized steps further down.
RANDOM_SEED = 42
# ROOT_DIR is the parent of the current working directory; the CSV files are
# looked up under <ROOT_DIR>/data/cleared_df_final.
ROOT_DIR = pathlib.Path.cwd().parent
DATA_DIR = ROOT_DIR.joinpath("data", "cleared_df_final")

## Загрузка и обзор данных

In [3]:
# Read the two input tables from DATA_DIR: the trends description file and the
# training file (same order as the names on the left-hand side).
df_trends, df = (
    pd.read_csv(DATA_DIR.joinpath(file_name))
    for file_name in ("trends_description.csv", "train.csv")
)

In [4]:
# Preview the first five rows of the training table.
df.head(n=5)

Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,trend_id_res5,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","–ú–∞–ª–µ–Ω—å–∫–∏–π –≤—ã–±–æ—Ä —Ç–æ–≤–∞—Ä–æ–≤, —Ö–æ—Ç–µ–ª–æ—Å—å –±—ã –∞—Å—Å–æ—Ä—Ç–∏–º–µ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",–ë—ã—Å—Ç—Ä–æ,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",–î–æ—Å—Ç–∞–≤–∫–∞ –ø–æ—Å—Ç–æ—è–Ω–Ω–æ –∑–∞–¥–µ—Ä–∂–∏–≤–∞–µ—Ç—Å—è,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",–ù–∞—Ü–µ–Ω–∫–∞ –∏ –∞—Å—Å–æ—Ä—Ç–∏–º–µ–Ω—Ç —Ä–∞—Å—Å—Ç—Ä–∞–∏–≤–∞—é—Ç,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,15237,5.0,"{ASSORTMENT,PRODUCTS_QUALITY,PROMOTIONS,CATALO...",–î–æ—Å—Ç–∞–≤–∫–∞ –ø—Ä–æ—Å—Ç–æ üëç,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Обучение моделей

### Предобработка данных

In [5]:
# First five rows of the training table.
# NOTE(review): same preview as the cell right after loading; `df` is not
# modified in between, so this cell looks redundant.
df.iloc[:5]

Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,trend_id_res5,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","–ú–∞–ª–µ–Ω—å–∫–∏–π –≤—ã–±–æ—Ä —Ç–æ–≤–∞—Ä–æ–≤, —Ö–æ—Ç–µ–ª–æ—Å—å –±—ã –∞—Å—Å–æ—Ä—Ç–∏–º–µ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",–ë—ã—Å—Ç—Ä–æ,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",–î–æ—Å—Ç–∞–≤–∫–∞ –ø–æ—Å—Ç–æ—è–Ω–Ω–æ –∑–∞–¥–µ—Ä–∂–∏–≤–∞–µ—Ç—Å—è,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",–ù–∞—Ü–µ–Ω–∫–∞ –∏ –∞—Å—Å–æ—Ä—Ç–∏–º–µ–Ω—Ç —Ä–∞—Å—Å—Ç—Ä–∞–∏–≤–∞—é—Ç,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,15237,5.0,"{ASSORTMENT,PRODUCTS_QUALITY,PROMOTIONS,CATALO...",–î–æ—Å—Ç–∞–≤–∫–∞ –ø—Ä–æ—Å—Ç–æ üëç,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Features are the single `text` column cast to strings; targets are the 50
# `trend_id_res*` indicator columns.
label_names = [f"trend_id_res{i}" for i in range(50)]
X = df[["text"]].astype("str")  # astype already returns a new frame
y = df[label_names]

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

print(f"X_train.shape is {X_train.shape}")
print(f"y_train.shape is {y_train.shape}")
print(f"X_test.shape is {X_test.shape}")
print(f"y_test.shape is {y_test.shape}")

X_train.shape is (6966, 1)
y_train.shape is (6966, 50)
X_test.shape is (1742, 1)
y_test.shape is (1742, 50)


### Проверка качества на тренировочном датасете

In [7]:
# TF-IDF over character n-grams (lengths 1 to 3, "char_wb" analyzer) of the
# `text` column; any other input column would be passed through unchanged.
text_vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(1, 3))
preprocessor = ColumnTransformer(
    transformers=[("vetorizer", text_vectorizer, "text")],
    remainder="passthrough",
)

# One logistic regression per target column, wrapped by MultiOutputClassifier.
multilabel_clf = MultiOutputClassifier(LogisticRegression(max_iter=10_000))

pipeline_multiout = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("clf", multilabel_clf),
    ]
)
display(pipeline_multiout)

In [8]:
# 5-fold cross-validation of the pipeline on the training split, scored with
# accuracy; n_jobs=-1 runs the folds on all available cores.
cross_valid = cross_validate(
    estimator=pipeline_multiout,
    X=X_train,
    y=y_train,
    scoring=["accuracy"],
    cv=5,
    n_jobs=-1,
)
mean_cv_accuracy = cross_valid["test_accuracy"].mean()
print("test_accuracy:", mean_cv_accuracy)



test_accuracy: 0.5044498986014311


In [9]:
# Out-of-fold predictions for every training row, produced with 2 folds.
# NOTE(review): the cross_validate cell uses cv=5 while this one uses cv=2, so
# the two sets of scores come from different protocols. Confirm this is
# intentional (e.g. chosen for speed).
y_pred = cross_val_predict(
    estimator=pipeline_multiout,
    X=X_train,
    y=y_train,
    cv=2,
)

In [10]:
# Look at several metrics: per-label precision / recall / F1 for the
# cross-validated predictions. zero_division=0 reports 0 for labels that were
# never predicted.
print(classification_report(y_true=y_train, y_pred=y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.78      0.28      0.41       662
           1       0.65      0.05      0.09       278
           2       0.66      0.21      0.32       473
           3       0.72      0.13      0.23       268
           4       0.00      0.00      0.00        98
           5       0.00      0.00      0.00        37
           6       0.00      0.00      0.00        18
           7       0.00      0.00      0.00        26
           8       0.00      0.00      0.00       115
           9       0.00      0.00      0.00         8
          10       0.00      0.00      0.00        84
          11       0.00      0.00      0.00        89
          12       0.60      0.17      0.27       492
          13       0.00      0.00      0.00        28
          14       0.00      0.00      0.00        57
          15       0.00      0.00      0.00        62
          16       0.00      0.00      0.00       160
          17       0.00    

In [11]:
# Look at the target metric. With 2-D label indicators accuracy_score is the
# subset accuracy: a row counts as correct only if all of its labels match.
accuracy_score(y_true=y_train, y_pred=y_pred)

0.5007177720356015

### Тренировка окончательной модели

In [12]:
# Fit the final pipeline on the training split.
pipeline_multiout.fit(X_train, y_train)

# The hold-out split (X_test / y_test) is not scored in any other visible cell
# of this notebook, so report its accuracy here: it is the only estimate made
# on rows the fitted model has never seen.
holdout_pred = pipeline_multiout.predict(X_test)
print("holdout_accuracy:", accuracy_score(y_test, holdout_pred))

# Keep the fitted pipeline as the cell's displayed value, as before.
pipeline_multiout

## Предсказание и загрузка решения

In [13]:
# Read the table to predict on from DATA_DIR.
test = pd.read_csv(DATA_DIR.joinpath("test.csv"))

In [14]:
# Predict all labels for the test rows; the `text` column is cast to strings
# the same way as the training features.
test_features = test.loc[:, ["text"]].astype(str)
pred_test = pipeline_multiout.predict(test_features)

In [15]:
# Assemble the submission table: the `index` id column followed by one column
# per predicted label.
# The frame is built column-wise instead of with np.hstack: hstack puts the ids
# and the predictions into one array with a single common dtype, so a
# non-integer id column would silently change the dtype of every prediction
# column (and vice versa). Here each column keeps its own dtype.
label_columns = [f"trend_id_res{i}" for i in range(50)]
res = pd.DataFrame(pred_test, columns=label_columns)
# to_numpy() drops the pandas index so ids are attached by position.
res.insert(0, "index", test["index"].to_numpy())

In [16]:
# Preview the first five rows of the submission table.
res.head(n=5)

Unnamed: 0,index,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,trend_id_res5,trend_id_res6,trend_id_res7,trend_id_res8,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,5905,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3135,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9285,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4655,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,16778,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Column sums of the prediction columns (everything except the leading
# `index` id column), i.e. how many rows were assigned each label.
res.drop(columns="index").sum()

trend_id_res0     529
trend_id_res1     145
trend_id_res2     516
trend_id_res3     171
trend_id_res4       0
trend_id_res5       0
trend_id_res6       0
trend_id_res7       0
trend_id_res8      18
trend_id_res9       0
trend_id_res10      1
trend_id_res11     16
trend_id_res12    526
trend_id_res13      0
trend_id_res14      0
trend_id_res15      5
trend_id_res16      9
trend_id_res17      0
trend_id_res18      0
trend_id_res19    180
trend_id_res20     79
trend_id_res21      0
trend_id_res22      0
trend_id_res23      0
trend_id_res24      0
trend_id_res25      0
trend_id_res26      0
trend_id_res27    219
trend_id_res28    223
trend_id_res29      0
trend_id_res30    155
trend_id_res31      0
trend_id_res32      0
trend_id_res33      0
trend_id_res34      0
trend_id_res35      4
trend_id_res36      0
trend_id_res37      0
trend_id_res38      0
trend_id_res39      0
trend_id_res40      0
trend_id_res41      0
trend_id_res42      0
trend_id_res43      0
trend_id_res44      0
trend_id_r

In [19]:
# Distribution of predicted values for the first label column.
res.loc[:, "trend_id_res0"].value_counts()

0    16470
1      529
Name: trend_id_res0, dtype: int64

In [21]:
# Write the submission file: `index` first, then the 50 label columns, without
# the DataFrame's own row index.
submission_columns = ["index", *(f"trend_id_res{i}" for i in range(50))]
res.loc[:, submission_columns].to_csv(DATA_DIR / "submission.csv", index=False)