In [1]:
import pandas as pd

In [2]:
train_csv = pd.read_csv('train.csv')
test_csv = pd.read_csv('test.csv')
sample_sub_csv = pd.read_csv('sample_submission.csv').drop(columns=['Unnamed: 0'])

In [3]:
def get_cols_by_substr(df, substr):
    return [col for col in df.columns if substr in col]

target = "successful_utilization"
mb_cols = get_cols_by_substr(train_csv, 'mb')
app_cols = get_cols_by_substr(train_csv, 'application')
bki_cols = get_cols_by_substr(train_csv, 'bki')
partner_cols = get_cols_by_substr(train_csv, 'partner')
graph_cols = get_cols_by_substr(train_csv, 'graph')
feature_cols = get_cols_by_substr(train_csv, 'feature')

In [4]:
train_csv['cc_2'].fillna('missing', inplace=True)
b = (train_csv['cc_2'].value_counts() / train_csv['cc_2'].shape[0]).reset_index()
train_csv['cc_2_num'] = train_csv['cc_2'].apply(lambda x: b[b['cc_2'] == x]['count'].values[0])

In [5]:
train_csv['cc_4'].fillna('missing', inplace=True)
b = (train_csv['cc_4'].value_counts() / train_csv['cc_4'].shape[0]).reset_index()
train_csv['cc_4_num'] = train_csv['cc_4'].apply(lambda x: b[b['cc_4'] == x]['count'].values[0])

In [6]:
train_csv['cc_6'].fillna('missing', inplace=True)
b = (train_csv['cc_6'].value_counts() / train_csv['cc_6'].shape[0]).reset_index()
train_csv['cc_6_num'] = train_csv['cc_6'].apply(lambda x: b[b['cc_6'] == x]['count'].values[0])

In [7]:
test_csv['cc_2'].fillna('missing', inplace=True)
b = (train_csv['cc_2'].value_counts() / train_csv['cc_2'].shape[0]).reset_index()
test_csv['cc_2_num'] = test_csv['cc_2'].apply(lambda x: b[b['cc_2'] == x]['count'].values[0])

test_csv['cc_4'].fillna('missing', inplace=True)
b = (train_csv['cc_4'].value_counts() / train_csv['cc_4'].shape[0]).reset_index()
test_csv['cc_4_num'] = test_csv['cc_4'].apply(lambda x: b[b['cc_4'] == x]['count'].values[0])

test_csv['cc_6'].fillna('missing', inplace=True)
b = (train_csv['cc_6'].value_counts() / train_csv['cc_6'].shape[0]).reset_index()
test_csv['cc_6_num'] = test_csv['cc_6'].apply(lambda x: b[b['cc_6'] == x]['count'].values[0])

In [8]:
cc_cols = list(set(get_cols_by_substr(train_csv, 'cc')) - {'cc_2', 'cc_4', 'cc_6'})

In [9]:
good_cols = list((set(mb_cols) | set(app_cols) | set(bki_cols) | set(partner_cols) | set(graph_cols) | set(feature_cols) | set(cc_cols) | {'treatment'}))

In [10]:
obj_cols = ['cc_2', 'cc_4', 'cc_6']

In [11]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(train_csv[obj_cols])

train_df = train_csv[good_cols].copy(deep=True)
test_df = test_csv[list(set(good_cols) - {'treatment', target})].copy(deep=True)

train_df = pd.concat([train_df, pd.DataFrame(ohe.transform(train_csv[obj_cols]), columns=ohe.get_feature_names_out(obj_cols))], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(ohe.transform(test_csv[obj_cols]), columns=ohe.get_feature_names_out(obj_cols))], axis=1)

In [12]:
a = train_df.isna().sum().sort_values(ascending=False)
cols_with_na = list(a[a > 0].index)

for col in cols_with_na:
    na_count = train_df[col].isna().sum()
    min_val = train_df[col].min()
    if min_val >= 0:
        train_df[col].fillna(-1, inplace=True)
    else:
        train_df[col].fillna(train_df[col].mean(), inplace=True)

bin_cols = []

train_df["cc_1"] = train_df["cc_1"].apply(lambda x: 1 if x == 1 else 0)
train_df["feature_6"] = train_df["feature_6"].apply(lambda x: 1 if x == 1 else 0)

for col in train_df.columns:
    if train_df[col].unique().size == 2:
        bin_cols.append(col)



for col in cols_with_na:
    na_count = test_df[col].isna().sum()
    min_val = test_df[col].min()
    if min_val >= 0:
        test_df[col].fillna(-1, inplace=True)
    else:
        test_df[col].fillna(test_df[col].mean(), inplace=True)


test_df["cc_1"] = test_df["cc_1"].apply(lambda x: 1 if x == 1 else 0)
test_df["feature_6"] = train_df["feature_6"].apply(lambda x: 1 if x == 1 else 0)

---

Генерим фичи

In [13]:
from sklearn.preprocessing import PolynomialFeatures

In [14]:
good_cols_without_target = list(set(good_cols) - {target})

In [15]:
test_df_0 = test_df.copy(deep=True)
test_df_0['treatment'] = 0

test_df_1 = test_df.copy(deep=True)
test_df_1['treatment'] = 1

In [16]:
train_df_extra_features = train_df.copy(deep=True)

In [17]:
import random
random.seed(52)

random.shuffle(good_cols_without_target)

batches = [good_cols_without_target[i:i + 10] for i in range(0, len(good_cols_without_target), 10)]

# for i, batch in enumerate(batches):
#     print(f"Batch {i + 1}: {len(batch)}")

In [18]:
from tqdm.notebook import tqdm

for i, batch in tqdm(enumerate(batches), total=15):
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly.fit(train_df[batch])

    train_poly = poly.transform(train_df_extra_features[batch])
    test_poly_0 = poly.transform(test_df_0[batch])
    test_poly_1 = poly.transform(test_df_1[batch])

    assert train_poly.shape[1] == test_poly_0.shape[1] == test_poly_1.shape[1]

    col_names = poly.get_feature_names_out(batch)

    train_df_extra_features.drop(columns=batch, inplace=True)
    test_df_0.drop(columns=batch, inplace=True)
    test_df_1.drop(columns=batch, inplace=True)

    train_df_extra_features = pd.concat([train_df_extra_features, pd.DataFrame(train_poly, columns=col_names)], axis=1)
    test_df_0 = pd.concat([test_df_0, pd.DataFrame(test_poly_0, columns=col_names)], axis=1)
    test_df_1 = pd.concat([test_df_1, pd.DataFrame(test_poly_1, columns=col_names)], axis=1)

  0%|          | 0/15 [00:00<?, ?it/s]

In [19]:
from sklearn.decomposition import PCA

n = 5

pca = PCA(n_components=n)
X_train_pca = pca.fit_transform(train_df)

pca = PCA(n_components=n)
X_test_pca = pca.fit_transform(test_df)

col_names_pca = [f'pca_{i}' for i in range(n)]

train_df_extra_features = pd.concat([train_df_extra_features, pd.DataFrame(X_train_pca, columns=col_names_pca)], axis=1)
test_df_0 = pd.concat([test_df_0, pd.DataFrame(X_test_pca, columns=col_names_pca)], axis=1)
test_df_1 = pd.concat([test_df_1, pd.DataFrame(X_test_pca, columns=col_names_pca)], axis=1)

In [20]:
from sklearn.cluster import KMeans

In [21]:
# import matplotlib.pyplot as plt

# inertia = []  # Список для хранения значений инерции
# K = range(1, 7)  # Диапазон значений k (от 1 до 10)

# for k in K:
#     kmeans = KMeans(n_clusters=k, random_state=42)
#     kmeans.fit(train_df)
#     inertia.append(kmeans.inertia_)  # Сохраняем инерцию для каждого k

# # Шаг 2: Построение графика метода локтя
# plt.figure(figsize=(8, 6))
# plt.plot(K, inertia, 'bo-', linewidth=2)
# plt.xlabel('Количество кластеров k')
# plt.ylabel('Инерция')
# plt.title('Метод локтя для подбора k')
# plt.show()

In [21]:
optimal_k = 2
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(train_df)

# Получаем метки кластеров
clusters_train = kmeans.labels_

kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(test_df_0)

clusters_test_0 = kmeans.labels_

kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(test_df_1)

clusters_test_1 = kmeans.labels_

In [22]:
train_df_extra_features['cluster'] = clusters_train
test_df_0['cluster'] = clusters_test_0
test_df_1['cluster'] = clusters_test_1

In [23]:
cols_to_log = list(set(good_cols_without_target) - (set(bin_cols)))
new_log_cols = list(map(lambda x: f"{x}_log", cols_to_log))

In [24]:
import numpy as np

In [25]:
train_df_extra_features[new_log_cols] = train_df_extra_features[cols_to_log].apply(lambda x: np.log(x + 1))
test_df_0[new_log_cols] = test_df_0[cols_to_log].apply(lambda x: np.log(x + 1))
test_df_1[new_log_cols] = test_df_1[cols_to_log].apply(lambda x: np.log(x + 1))

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  train_df_extra_features[new_log_cols] = train_df_extra_features[cols_to_log].apply(lambda x: np.log(x + 1))
  train_df_extra_features[new_log_cols] = train_df_extra_features[cols_to_log].apply(lambda x: np.log(x + 1))
  train_df_extra_features[new_log_cols] = train_df_extra_features[cols_to_log].apply(lambda x: np.log(x + 1))
  train_df_extra_features[new_log_cols] = train_df_extra_features[cols_to_log].apply(lambda x: np.log(x + 1))
  train_df_extra_features[new_log_cols] = train_df_extra_features[cols_to_log].apply(lambda x: np.log(x + 1))
  train_df_extra_features[new_log_cols] = train_df_extra_features[cols_to_log].apply(lambda x: np.log(x + 1))
  train_df_extra_features[new_log_cols] = train_df_extra_features[cols_to_log].apply(lambda x: np.log(x + 1))
  train_df_extra_features[new_log_cols] = train_df_extra_features[cols_to_log].apply(lambda x: np.log(x + 1))
  train_df_ext

In [26]:
cols_to_sqrt = []

for col in cols_to_log:
    if train_df_extra_features[col].min() >= 0:
        cols_to_sqrt.append(col)

new_sqrt_cols = list(map(lambda x: x.replace('log', 'sqrt'), cols_to_sqrt))

In [27]:
train_df_extra_features[cols_to_sqrt] = train_df_extra_features[cols_to_sqrt].apply(lambda x: np.sqrt(x))
test_df_0[cols_to_sqrt] = test_df_0[cols_to_sqrt].apply(lambda x: np.sqrt(x))
test_df_1[cols_to_sqrt] = test_df_1[cols_to_sqrt].apply(lambda x: np.sqrt(x))

In [28]:
train_df_extra_features = train_df_extra_features.loc[:, ~train_df_extra_features.columns.duplicated()]
test_df_0 = test_df_0.loc[:, ~test_df_0.columns.duplicated()]
test_df_1 = test_df_1.loc[:, ~test_df_1.columns.duplicated()]

---

чистим оперативку xd

In [29]:
a = 0
b = 0
clusters_test_0 = 0
clusters_test_1 = 0
clusters = 0
clusters_train = 0

new_features = 0

test_csv = 0
train_csv = 0
train_df = 0
test_poly_0 = 0
test_poly_1 = 0
train_poly = 0

X_train = 0

kmeans = 0
pca = 0
poly = 0

X_train_pca = 0
X_test_pca = 0

---

In [55]:
train_df_extra_features.to_csv("extra_train.csv")
test_df_0.to_csv("test_df_0.csv")
test_df_1.to_csv("test_df_1.csv")

In [30]:
from sklearn.model_selection import train_test_split

X_train = train_df_extra_features.drop(columns=[target])
y_train = train_df_extra_features[target]

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=43)

In [32]:
# X_test_0 = test_df.copy(deep=True)
# X_test_0['treatment'] = 0

# X_test_1 = test_df.copy(deep=True)
# X_test_1['treatment'] = 1

# X_test_0 = X_test_0[X_train.columns]
# X_test_1 = X_test_1[X_train.columns]

In [31]:
k = 0

In [36]:
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(
    iterations=500,
    depth=5,
    learning_rate=0.133,
    verbose=False,
    eval_metric = 'AUC',
    use_best_model=True,
    auto_class_weights="SqrtBalanced"
)

catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), logging_level='Verbose')


bestTest = 0.7351421505
bestIteration = 438

Shrink model to first 439 iterations.


<catboost.core.CatBoostClassifier at 0x1f086a6bc40>

---

Cчитаем uplift

In [37]:
pred_0 = catboost_model.predict_proba(test_df_0)[:,1]
pred_1 = catboost_model.predict_proba(test_df_1)[:,1]
super_cat_uplift = pred_1 - pred_0
super_cat_uplift

k += 1
sample_sub_csv['successful_utilization'] = super_cat_uplift
sample_sub_csv.to_csv(f'sub_gen_1_{k}.csv')

---

усредняем всё что видим

In [38]:
i = 21

In [51]:
import os

dir = 'mean_subs/'

subs = os.listdir(dir)
subs

['super_cat_mean_23_subs_uplift.csv', 'super_cat_mean_24_subs_uplift.csv']

In [52]:
def get_target(file_path):
    return pd.read_csv(file_path).drop(columns=['Unnamed: 0'])['successful_utilization']

In [53]:
sub1 = subs[0]

sum = get_target(os.path.join(dir + sub1))

In [54]:
for sub in subs[1:]:
    sum += get_target(os.path.join(dir + sub))

mean = sum / len(subs)

i += 1
sample_sub_csv['successful_utilization'] = mean
sample_sub_csv.to_csv(f'super_cat_mean_{i}_subs_uplift.csv')