In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from lightgbm import LGBMRegressor, log_evaluation
from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import gc
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import f_regression
import tqdm

%matplotlib inline

import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
warnings.simplefilter(action = 'ignore', category = DeprecationWarning)
warnings.simplefilter(action = 'ignore', category = UserWarning)
warnings.simplefilter(action = 'ignore', category = RuntimeWarning)
warnings.filterwarnings("ignore", message = "numpy.dtype size changed")
warnings.filterwarnings("ignore", message = "numpy.ufunc size changed")
pd.options.mode.chained_assignment = None

from warnings import simplefilter
simplefilter(action = "ignore", category = pd.errors.PerformanceWarning)

target = 'object_img'

In [2]:
dtrain = pd.read_csv(
    "../input/train.csv",
)
dtrain.sort_values(by = [target], inplace = True)
dtrain.reset_index(drop = True, inplace = True)
dtrain.head(10)

Unnamed: 0,id,description,object_img
0,725,Монета ДМ/Н-1104 ПОКМ-3873/6 серебро; штамповк...,1
1,134,Монета ДМ/Н-1101 ПОКМ-3873/3 серебро; штамповк...,2
2,1617,Монета ДМ/Н-1100 ПОКМ-3873/2 серебро; штамповк...,3
3,646,Монета ДМ/Н-1103 ПОКМ-3873/5 серебро; штамповк...,4
4,705,Монета ДМ/Н-850 ПОКМ-11035/17 серебро; штампов...,5
5,2478,Монета ДМ/Н-1114 ПОКМ-3873/17 серебро; штампов...,6
6,1166,Монета ДМ/Н-1508 ПОКМ-16405/9 серебро; чеканка...,7
7,1843,Монета ДМ/Н-678 ПОКМ-10727/20 серебро; штампов...,10
8,2691,Монета ДМ/Н-168 ПОКМ-17597 серебро; штамповка ...,12
9,1967,Монета ДМ/Н-1203 ПОКМ-3882/6 серебро; штамповк...,15


In [3]:
targets = sorted(dtrain[target].unique())

In [4]:
dtrain.shape

(2098, 3)

In [5]:
len(dtrain[target].unique())

2098

In [6]:
dtest = pd.read_csv('../input/test.csv')

# Векторизация данных (преобразование текста в частотные векторы)

In [7]:
vectorizer = TfidfVectorizer(
    analyzer = 'word',
    lowercase = True, 
    preprocessor = None, 
    tokenizer = None, 
    stop_words = None, 
    #token_pattern = None, 
    ngram_range = (1, 3), 
    #max_df = 1.0, 
    #min_df = 1, 
    max_features = 1500,
)

corpus = list(dtrain['description'].values)
train = vectorizer.fit_transform(corpus)
test  = vectorizer.transform(dtest['description'].values)
#vectorizer.get_feature_names_out()
train.shape

(2098, 1500)

In [8]:
for i in range(train.shape[1]):
    dtrain['v_' + str(i)] = train[:, i].toarray()
    dtest['v_' + str(i)] = test[:, i].toarray()

In [9]:
use = [f for f in dtrain.columns if f not in [
    'id', 'description', target]
      ]

In [10]:
temp = dtrain.copy()
temp = pd.concat([dtrain, temp], ignore_index = True)
temp = pd.concat([dtrain, temp], ignore_index = True)
temp = pd.concat([dtrain, temp], ignore_index = True)
temp = pd.concat([dtrain, temp], ignore_index = True)

dtrain = temp
dtrain.reset_index(drop = True, inplace = True)
gc.collect()

0

In [11]:
X_train, X_val = train_test_split(
    dtrain,
    test_size = 0.1707, 
    random_state = 0,
)
X_train.shape, X_val.shape

((8699, 1503), (1791, 1503))

# Модель (Light Gradient Boosted Machine или LightGBM)

In [12]:
def lgb_metric(y_true, y_pred):
    sc = r2_score(y_true, y_pred)
    return 'metric', sc, True

SEEDS = 3
models = []

params = {
    'max_depth': 5, 
    'num_leaves': 31, 
    'learning_rate': 0.1, 
    'reg_alpha': 0.1, 
    'reg_lambda': 0.1, 
    'n_estimators': 5500, 
    'subsample': 0.99, 
    'subsample_freq': 5, 
    'colsample_bytree': 0.99, 
    'random_state': 0, 
    'verbose': -1, 
    'metric': 'custom',
}

for seed in range(SEEDS):
    params['random_state'] = seed
    model = LGBMRegressor(**params)
    model.fit(
        X_train[use].values,
        X_train[target],
        eval_set = (X_val[use].values, X_val[target]),
        callbacks = [
            early_stopping(100), 
            log_evaluation(1500)
        ],
        eval_metric = lgb_metric,
    )
    models.append(model)

Training until validation scores don't improve for 100 rounds
[1500]	valid_0's metric: 0.999158
[3000]	valid_0's metric: 0.999652
[4500]	valid_0's metric: 0.999723
Early stopping, best iteration is:
[4870]	valid_0's metric: 0.999732
Training until validation scores don't improve for 100 rounds
[1500]	valid_0's metric: 0.999192
[3000]	valid_0's metric: 0.999667
Early stopping, best iteration is:
[4155]	valid_0's metric: 0.999724
Training until validation scores don't improve for 100 rounds
[1500]	valid_0's metric: 0.99918
[3000]	valid_0's metric: 0.999664
[4500]	valid_0's metric: 0.999728
Early stopping, best iteration is:
[4680]	valid_0's metric: 0.999733


In [13]:
from scipy.stats import mode

preds = []

for model in models:
    pr = model.predict(dtest[use])
    preds.append(pr)
    
dtest[target] = mode(preds)[0][0]

In [14]:
dtest[target] = dtest[target].astype(int)

dtest[['id', 'object_img']].to_csv('final.csv', index = False)

In [15]:
dtest[['id', 'object_img']].head()

Unnamed: 0,id,object_img
0,486,1202
1,813,1761
2,2980,1467
3,13,1203
4,2467,700


# Подготовка решения согласно требований от 05.10.2022 (остутсвие дублей и пересечения с тренировочными данными)

In [16]:
sub = pd.read_csv('final.csv')
sub[['id', 'object_img']].head()

Unnamed: 0,id,object_img
0,486,1202
1,813,1761
2,2980,1467
3,13,1203
4,2467,700


# 1. Избавление от дублей.

In [17]:
sub[target] = sub[target] - sub[target].min() # делаем все значения положительными в диапазоне от '0' и выше

sub.sort_values(by = [target], inplace = True)
sub.reset_index(drop = True, inplace = True)
sub

Unnamed: 0,id,object_img
0,2059,0
1,30,84
2,264,89
3,1106,90
4,1776,93
...,...,...
895,1504,3041
896,843,3050
897,2824,3056
898,2226,3421


In [18]:
sub[target].value_counts() # количество дублей в решении

882     6
2958    5
2425    4
2357    4
1542    4
       ..
1104    1
1109    1
1111    1
1114    1
3535    1
Name: object_img, Length: 750, dtype: int64

In [19]:
sub.index

RangeIndex(start=0, stop=900, step=1)

In [20]:
sub['object_img'] = sub['object_img'] + sub.index

In [21]:
sub['object_img'].value_counts()

0       1
2546    1
2695    1
2697    1
2699    1
       ..
1402    1
1403    1
1405    1
1408    1
4434    1
Name: object_img, Length: 900, dtype: int64

# 2. Избавление от пересечения с тренировочными данными.

In [22]:
sub[sub['object_img'].isin(dtrain['object_img'])] # пересечения

Unnamed: 0,id,object_img
2,264,91
3,1106,93
4,1776,97
5,832,117
6,142,124
...,...,...
660,461,2961
663,440,2978
664,2381,2981
665,756,2987


# Процесс удаления пересечений

In [24]:
for i in tqdm.tqdm(sub.index):
    img = sub.loc[i, 'object_img']
    if img in dtrain['object_img'].unique():
        
        for y in range(2500):
            img2 = img - y
            if (img2 not in dtrain['object_img'].unique()) and (img2 not in sub['object_img'].unique()):
                if img2 > 0:
                    sub.loc[i, 'object_img'] = img2

100%|█████████████████████████████████████████████| 900/900 [01:20<00:00, 11.23it/s]


In [25]:
sub[sub['object_img'].isin(dtrain['object_img'])] # проверяю на дубли

Unnamed: 0,id,object_img


In [26]:
sub['object_img'].value_counts() # проверяю на пересечения

0       1
1731    1
2695    1
1816    1
2699    1
       ..
925     1
926     1
928     1
930     1
4434    1
Name: object_img, Length: 900, dtype: int64

In [27]:
sub

Unnamed: 0,id,object_img
0,2059,0
1,30,85
2,264,8
3,1106,9
4,1776,11
...,...,...
895,1504,3936
896,843,3946
897,2824,3953
898,2226,4319


In [28]:
sub[['id', 'object_img']].to_csv('final2.csv', index = False)

In [29]:
# 0.650572

https://lk.hacks-ai.ru/758290/champ