In [1]:
import numpy as np
import pandas as pd 
import os 
import json 
from collections import defaultdict

from sklearn.model_selection import train_test_split

# Data Prepairing

In [74]:
TRAIN_DATA_PATH = "../data_MliF/train_data/"
TEST_DATA_PATH = "../data_MliF/test_data/"

TRAIN_TARGET_PATH = "../data_MliF/train_target.csv"

In [50]:
def concat_data(path):
    data_concat = None

    for file in os.listdir(TRAIN_DATA_PATH):
        data = pd.read_parquet(f"{TRAIN_DATA_PATH}/{file}")
        if train_data is None:
            data_concat = data
        else:
            data_concat = pd.concat([data_concat, data], ignore_index=True)
    return data_concat

In [51]:
train_data = concat_data(TRAIN_DATA_PATH)
test_data = concat_data(TEST_DATA_PATH)

In [75]:
train_target = pd.read_csv(TRAIN_TARGET_PATH)

In [52]:
train_data.drop(columns=["id", "rn"], inplace=True)
test_data.drop(columns=["id", "rn"], inplace=True)

In [54]:
data_all = pd.concat([train_data, test_data], ignore_index=True)

In [110]:
col_names = list(data_all.columns) 

In [72]:
unique_val = {col_name: np.sort(data_all[col_name].unique()) for col_name in data_all.columns.values}

In [92]:
train, val = train_test_split(train_target, random_state=20, test_size=0.1)
#train['flag'].sum()/train.size, val['flag'].sum()/val.size

In [115]:
unique_val

{'pre_since_opened': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19], dtype=int64),
 'pre_since_confirmed': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17], dtype=int64),
 'pre_pterm': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17], dtype=int64),
 'pre_fterm': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
       dtype=int64),
 'pre_till_pclose': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
       dtype=int64),
 'pre_till_fclose': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
       dtype=int64),
 'pre_loans_credit_limit': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19], dtype=int64),
 'pre_loans_next_pay_summ': array([0, 1, 2, 3, 4, 5, 6], dtype=int64),
 'pre_loans_outstanding': array([1, 2, 3, 4, 5], dtype=int64),
 'pre_loans_tot

In [80]:
def create_buckets_from_credits(path_to_dataset, bucket_info, save_to_path, frame_with_ids = None,
                                num_parts_to_preprocess_at_once: int = 1, 
                                num_parts_total=50, has_target=False):
    block = 0
    for step in range(0, num_parts_total, num_parts_to_preprocess_at_once):
        credits_frame = read_parquet_dataset_from_local(path_to_dataset, step, num_parts_to_preprocess_at_once, verbose=True)
        credits_frame.loc[:, features] += 1       
        seq = transform_credits_to_sequences(credits_frame)
        
        if frame_with_ids is not None:
            seq = seq.merge(frame_with_ids, on="id")

        block_as_str = str(block)
        if len(block_as_str) == 1:
            block_as_str = "00" + block_as_str
        else:
            block_as_str = "0" + block_as_str
            
        processed_fragment = create_padded_buckets(seq, bucket_info=bucket_info, has_target=has_target,
                                                    save_to_file_path=os.path.join(save_to_path, 
                                                                                   f"processed_chunk_{block_as_str}.pkl"))
        block += 1

In [100]:
train.size, val.size

(5400000, 600000)

In [124]:
keys_ = list(range(1, 59)) 
lens_ = list(range(1, 41)) + [45] * 5 + [50] * 5 + [58] * 8

In [126]:
bucket_info = dict(zip(keys_, lens_))

In [127]:
bucket_info

{1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 5,
 6: 6,
 7: 7,
 8: 8,
 9: 9,
 10: 10,
 11: 11,
 12: 12,
 13: 13,
 14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 22: 22,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 45,
 42: 45,
 43: 45,
 44: 45,
 45: 45,
 46: 50,
 47: 50,
 48: 50,
 49: 50,
 50: 50,
 51: 58,
 52: 58,
 53: 58,
 54: 58,
 55: 58,
 56: 58,
 57: 58,
 58: 58}

In [204]:
uniques = defaultdict(set)

In [217]:
for feat in data_all.columns.values:
    uniques[feat] = uniques[feat].union(train_data[feat].unique())

In [218]:
embedding_projections = {feat: (int(max(uniq) + 1), min(600, int(round(1.6 * (max(uniq) + 1)) ** 0.56))) for
                             feat, uniq in uniques.items()}

In [102]:
def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0, num_parts_to_read: int = 2, 
                                    columns: [str] = None, verbose: bool = False) -> pd.DataFrame:
    """
    Читает ``num_parts_to_read`` партиций и преобразует их к pandas.DataFrame.

    Параметры:
    -----------
    path_to_dataset: str
        Путь до директории с партициями.
    start_from: int, default=0
        Номер партиции, с которой начать чтение.
    num_parts_to_read: int, default=2
        Число партиций, которые требуется прочитать.
    columns: List[str], default=None
        Список колонок, которые нужно прочитать из каждой партиции. Если None, то считываются все колонки.

    Возвращаемое значение:
    ----------------------
    frame: pandas.DataFrame
        Прочитанные партиции, преобразованные к pandas.DataFrame.
    """

    res = []
    start_from = max(0, start_from)
    # dictionory of format {partition number: partition filename}
    dataset_paths = {int(os.path.splitext(filename)[0].split("_")[-1]): os.path.join(path_to_dataset, filename)
                     for filename in os.listdir(path_to_dataset)}
    chunks = [dataset_paths[num] for num in sorted(dataset_paths.keys()) if num>=start_from][:num_parts_to_read]

    print(f"chunks {chunks}")

    for chunk_path in chunks:
        chunk = pd.read_parquet(chunk_path, columns=columns)
        res.append(chunk)
    return pd.concat(res).reset_index(drop=True)

In [13]:
from typing import Dict
import numpy as np
import pandas as pd
import pickle


features = ["pre_since_opened", "pre_since_confirmed", "pre_pterm", "pre_fterm", "pre_till_pclose", "pre_till_fclose",
            "pre_loans_credit_limit", "pre_loans_next_pay_summ", "pre_loans_outstanding", "pre_loans_total_overdue",
            "pre_loans_max_overdue_sum", "pre_loans_credit_cost_rate",
            "pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90",
            "is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90",
            "pre_util", "pre_over2limit", "pre_maxover2limit", "is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit",
            "enc_paym_0", "enc_paym_1", "enc_paym_2", "enc_paym_3", "enc_paym_4", "enc_paym_5", "enc_paym_6", "enc_paym_7", "enc_paym_8",
            "enc_paym_9", "enc_paym_10", "enc_paym_11", "enc_paym_12", "enc_paym_13", "enc_paym_14", "enc_paym_15", "enc_paym_16",
            "enc_paym_17", "enc_paym_18", "enc_paym_19", "enc_paym_20", "enc_paym_21", "enc_paym_22", "enc_paym_23", "enc_paym_24",
            "enc_loans_account_holder_type", "enc_loans_credit_status", "enc_loans_credit_type", "enc_loans_account_cur",
            "pclose_flag", "fclose_flag"]


def pad_sequence(array: np.ndarray, max_len: int) -> np.ndarray:
    """
    Принимает на вход массив массивов ``array`` и производит padding каждого вложенного массива до ``max_len``.

    Параметры:
    -----------
    array: numpy.ndarray
        Входной массив массивов.
    max_len: int
        Длина, до которой нужно сделать padding вложенных массивов.

    Возвращаемое значение:
    ----------------------
    output: numpy.ndarray
        Выходной массив.
    """
    if isinstance(max_len, float):
        print(max_len)
    output = np.zeros((len(features), max_len))
    output[:, :array.shape[1]] = array
    return output


def truncate(x, num_last_credits: int = 0):
    return pd.Series({"sequences": x.values.transpose()[:, -num_last_credits:]})


def transform_credits_to_sequences(credits_frame: pd.DataFrame,
                                   num_last_credits: int = 0) -> pd.DataFrame:
    """
    Принимает pandas.DataFrame с записями кредитных историй клиентов, сортирует кредиты по клиентам
    (внутри клиента сортирует кредиты от старых к новым), берет ``num_last_credits`` кредитов,
    возвращает новый pandas.DataFrame с двумя колонками: id и sequences.
    Каждое значение в столбце sequences - это массив массивов.
    Каждый вложенный массив - значение одного признака во всех кредитах клиента.
    Всего признаков len(features), поэтому будет len(features) массивов.

    Параметры:
    -----------
    credits_frame: pandas.DataFrame
        Датафрейм с записями кредитных историй клиентов.
    num_last_credits: int, default=0
         Количество кредитов клиента, которые будут включены в выходные данные. Если 0, то берутся все кредиты.

    Возвращаемое значение:
    ----------------------
    output: pandas.DataFrame
        Выходной датафрейм с двумя столбцами: "id", "sequences".
    """
    return credits_frame \
        .sort_values(["id", "rn"]) \
        .groupby(["id"])[features] \
        .apply(lambda x: truncate(x, num_last_credits=num_last_credits)) \
        .reset_index()


def create_padded_buckets(frame_of_sequences: pd.DataFrame, bucket_info: Dict[int, int],
                          save_to_file_path: str = None, has_target: bool = True):
    """
    Реализует Sequence Bucketing технику для обучения рекуррентных нейронных сетей.
    Принимает на вход датафрейм ``frame_of_sequences`` с двумя столбцами: "id", "sequences"
    (результат работы функции transform_credits_to_sequences),
    словарь ``bucket_info``, где для последовательности каждой длины указано, до какой максимальной длины нужно делать
    padding, группирует кредиты по бакетам (на основе длины), производит padding нулями и сохраняет результат
    в pickle файл, если требуется.

    Параметры:
    -----------
    frame_of_sequences: pandas.DataFrame
        Входной датафрейм с двумя столбцами "id", "sequences" (результат работы функции transform_credits_to_sequences).
    bucket_info: Dict[int, int]
        Cловарь, где для последовательности каждой длины указано, до какой максимальной длины нужно делать padding.
    save_to_file_path: str, default=None
        Опциональный путь до файла, куда нужно сохранить результат. Если None, то сохранение не требуется.
    has_target: bool, deafult=True
        Флаг, есть ли в frame_of_sequences целевая переменная или нет. Если есть, то она также будет записана в выходной словарь.

    Возвращаемое значение:
    ----------------------
    dict_result: dict
        Выходной словарь со ключами:  "id", "padded_sequences", "target".
    """
    frame_of_sequences["sequence_length"] = frame_of_sequences["sequences"].apply(lambda x: len(x[1]))
    frame_of_sequences["bucket_idx"] = frame_of_sequences["sequence_length"].map(bucket_info)
    padded_seq = []
    targets = []
    ids = []

    for size, bucket in frame_of_sequences.groupby("bucket_idx"):
        padded_sequences = bucket["sequences"].apply(lambda x: pad_sequence(x, size)).values
        padded_seq.append(np.stack(padded_sequences, axis=0))

        if has_target:
            targets.append(bucket["flag"].values)

        ids.append(bucket["id"].values)

    frame_of_sequences.drop(columns=["bucket_idx"], inplace=True)

    dict_result = {
        "id": np.array(ids, dtype=object),
        "padded_sequences": np.array(padded_seq, dtype=object),
        "target": np.array(targets, dtype=np.object_) if targets else []
    }

    if save_to_file_path:
        with open(save_to_file_path, "wb") as f:
            pickle.dump(dict_result, f)
    return dict_result


In [13]:
data_buff = read_parquet_dataset_from_local("../data_MliF/train_data/", 0, 12)

chunks ['../data_MliF/train_data/train_data_0.pq', '../data_MliF/train_data/train_data_1.pq', '../data_MliF/train_data/train_data_2.pq', '../data_MliF/train_data/train_data_3.pq', '../data_MliF/train_data/train_data_4.pq', '../data_MliF/train_data/train_data_5.pq', '../data_MliF/train_data/train_data_6.pq', '../data_MliF/train_data/train_data_7.pq', '../data_MliF/train_data/train_data_8.pq', '../data_MliF/train_data/train_data_9.pq', '../data_MliF/train_data/train_data_10.pq', '../data_MliF/train_data/train_data_11.pq']


In [34]:
data_buff.groupby("id").agg(seq_len=("rn", "max"))["seq_len"].values

array([10, 14,  3, ..., 10,  5, 12], dtype=int64)

In [8]:
test_target = pd.read_csv(f"MIiF_Кредитный скоринг/test_target.csv")
train_target = pd.read_csv(f"MIiF_Кредитный скоринг/train_target.csv")

In [121]:
def truncate(x, num_last_credits: int = 0):
    return pd.Series({"sequences": x.values.transpose()[:, -num_last_credits:]})

In [129]:
sqe = train_data_0.sort_values(["id", "rn"]).groupby(["id"])[col_names].apply(lambda x: truncate(x, num_last_credits=0)).reset_index()

In [133]:
 
dict_result = create_padded_buckets(sqe, bucket_info, has_target=False)

In [139]:
dict_result['id'][0].shape, dict_result['id'][1].shape 

((20672,), (21377,))

In [144]:
dict_result['padded_sequences'][0].shape, dict_result['padded_sequences'][42].shape, dict_result['padded_sequences'][27].shape 

((20672, 59, 1), (1, 59, 58), (344, 59, 28))

In [3]:
import torch.nn as nn
import torch


class CreditsRNN(nn.Module):
    def __init__(self, features, embedding_projections, rnn_units=128, top_classifier_units=32):
        super(CreditsRNN, self).__init__()
        self._credits_cat_embeddings = nn.ModuleList([self._create_embedding_projection(*embedding_projections[feature]) 
                                                          for feature in features])
                        
        self._gru = nn.GRU(input_size=sum([embedding_projections[x][1] for x in features]),
                             hidden_size=rnn_units, batch_first=True, bidirectional=False)
        self._hidden_size = rnn_units
        self._top_classifier = nn.Linear(in_features=rnn_units, out_features=top_classifier_units)
        self._intermediate_activation = nn.ReLU()
        self._head = nn.Linear(in_features=top_classifier_units, out_features=1)
    
    def forward(self, features):
        batch_size = features[0].shape[0]
        embeddings = [embedding(features[i]) for i, embedding in enumerate(self._credits_cat_embeddings)]
        concated_embeddings = torch.cat(embeddings, dim=-1)
        
        _, last_hidden = self._gru(concated_embeddings)
        last_hidden = torch.reshape(last_hidden.permute(1, 2, 0), shape=(batch_size, self._hidden_size))
                                
        classification_hidden = self._top_classifier(last_hidden)
        activation = self._intermediate_activation(classification_hidden)
        raw_output = self._head(activation)
        return raw_output
    
    @classmethod
    def _create_embedding_projection(cls, cardinality, embed_size, add_missing=True, padding_idx=0):
        add_missing = 1 if add_missing else 0
        return nn.Embedding(num_embeddings=cardinality+add_missing, embedding_dim=embed_size, padding_idx=padding_idx)

In [151]:
def compute_embed_dim(n_cat: int) -> int:
    return min(600, round(1.6 * n_cat**0.56))

In [153]:
embedding_projections = {feat: (max(uniq)+1, compute_embed_dim(max(uniq)+1)) for feat, uniq in uniques.items()}

In [164]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

In [165]:
with open("../data_MliF/embedding_projections.json", "w") as fp:
    json.dump(embedding_projections, fp, cls=NpEncoder)

In [15]:
with open("../data_MliF/embedding_projections.json") as f_in:
    embedding_projections_loaded = json.load(f_in)

In [172]:
from tqdm import tqdm

In [173]:
tqdm.notebook

AttributeError: type object 'tqdm' has no attribute 'notebook'

In [253]:
torch.norm(torch.Tensor([2,2,2]),1)

tensor(6.)

In [252]:
2*(3)**0.5

3.4641016151377544

In [42]:
input_tensor = torch.rand((128,16,175))

In [5]:
input_tensor.shape

torch.Size([128, 4, 175])

In [6]:
nn.MaxPool2d(kernel_size=3)(input_tensor).shape

torch.Size([128, 1, 58])

In [43]:
pooling_emb = torch.reshape(nn.AdaptiveMaxPool2d((1, 128))(input_tensor).permute(0, 2, 1), shape=(128, 128))#.shape 

In [44]:
pooling_emb.shape

torch.Size([128, 128])

In [33]:
nn.AdaptiveMaxPool2d((1, 128))(input_tensor).shape

torch.Size([128, 1, 128])

In [8]:
torch.cat((torch.reshape(out_tensor[1][0].permute(1, 2, 0), shape=(128, 128*2)), emb, -1))     

NameError: name 'out_tensor' is not defined

In [16]:
sum([embedding_projections_loaded[x][1] for x in features])

175

In [17]:
output, (h_n, c_n) = nn.LSTM(input_size=sum([embedding_projections_loaded[x][1] for x in features]),
                             hidden_size=128, batch_first=True, bidirectional=True)(input_tensor)

In [18]:
h_n.shape

torch.Size([2, 128, 128])

In [30]:
h_n.permute(1, 0, 2).shape

torch.Size([128, 2, 128])

In [37]:
pooling_lstm = nn.AdaptiveMaxPool2d((1, 128))(h_n.permute(1, 0, 2)).squeeze()

In [41]:
torch.cat([pooling_emb, pooling_lstm], dim=-1).shape

torch.Size([128, 256])

In [223]:
nn.GRU(input_size=sum([embedding_projections_loaded[x][1] for x in features]),
                             hidden_size=128, batch_first=True, bidirectional=False)

GRU(175, 128, batch_first=True)

In [239]:
out_tensor[0].shape, out_tensor[1][0].shape, out_tensor[1][1].shape

(torch.Size([128, 4, 256]),
 torch.Size([2, 128, 128]),
 torch.Size([2, 128, 128]))

In [270]:
torch.reshape(out_tensor[1][0].permute(1, 2, 0), shape=(128, 128*2)).shape 

torch.Size([128, 256])

In [244]:
128 * 2 if 3>4 else 128*1 

128

In [169]:
embedding_projections

{'pre_since_opened': (20, 9),
 'pre_since_confirmed': (18, 8),
 'pre_pterm': (18, 8),
 'pre_fterm': (17, 8),
 'pre_till_pclose': (17, 8),
 'pre_till_fclose': (16, 8),
 'pre_loans_credit_limit': (20, 9),
 'pre_loans_next_pay_summ': (7, 5),
 'pre_loans_outstanding': (6, 4),
 'pre_loans_total_overdue': (2, 2),
 'pre_loans_max_overdue_sum': (4, 3),
 'pre_loans_credit_cost_rate': (14, 7),
 'pre_loans5': (17, 8),
 'pre_loans530': (20, 9),
 'pre_loans3060': (10, 6),
 'pre_loans6090': (5, 4),
 'pre_loans90': (20, 9),
 'is_zero_loans5': (2, 2),
 'is_zero_loans530': (2, 2),
 'is_zero_loans3060': (2, 2),
 'is_zero_loans6090': (2, 2),
 'is_zero_loans90': (2, 2),
 'pre_util': (20, 9),
 'pre_over2limit': (20, 9),
 'pre_maxover2limit': (20, 9),
 'is_zero_util': (2, 2),
 'is_zero_over2limit': (2, 2),
 'is_zero_maxover2limit': (2, 2),
 'enc_paym_0': (4, 3),
 'enc_paym_1': (4, 3),
 'enc_paym_2': (4, 3),
 'enc_paym_3': (4, 3),
 'enc_paym_4': (4, 3),
 'enc_paym_5': (4, 3),
 'enc_paym_6': (4, 3),
 'enc_pay

In [154]:
CreditsRNN(col_names, embedding_projections)

CreditsRNN(
  (_credits_cat_embeddings): ModuleList(
    (0): Embedding(21, 9, padding_idx=0)
    (1-2): 2 x Embedding(19, 8, padding_idx=0)
    (3-4): 2 x Embedding(18, 8, padding_idx=0)
    (5): Embedding(17, 8, padding_idx=0)
    (6): Embedding(21, 9, padding_idx=0)
    (7): Embedding(8, 5, padding_idx=0)
    (8): Embedding(7, 4, padding_idx=0)
    (9): Embedding(3, 2, padding_idx=0)
    (10): Embedding(5, 3, padding_idx=0)
    (11): Embedding(15, 7, padding_idx=0)
    (12): Embedding(18, 8, padding_idx=0)
    (13): Embedding(21, 9, padding_idx=0)
    (14): Embedding(11, 6, padding_idx=0)
    (15): Embedding(6, 4, padding_idx=0)
    (16): Embedding(21, 9, padding_idx=0)
    (17-21): 5 x Embedding(3, 2, padding_idx=0)
    (22-24): 3 x Embedding(21, 9, padding_idx=0)
    (25-27): 3 x Embedding(3, 2, padding_idx=0)
    (28-38): 11 x Embedding(5, 3, padding_idx=0)
    (39): Embedding(6, 4, padding_idx=0)
    (40-47): 8 x Embedding(5, 3, padding_idx=0)
    (48): Embedding(6, 4, padding_i

In [50]:
train_data_0[train_data_0['id'] == 50]['enc_paym_4']

408    0
409    0
410    0
411    3
412    0
413    3
414    0
415    3
416    0
417    3
Name: enc_paym_4, dtype: int64

In [52]:
test_data_0 = pd.read_parquet(f"MIiF_Кредитный скоринг/test_data/test_data_0.pq")

In [98]:
test_data_0

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,3000000,1,11,5,17,14,12,11,3,2,...,3,3,3,4,1,3,4,1,0,0
1,3000000,2,19,16,15,9,12,11,16,3,...,3,3,3,4,1,2,4,1,0,0
2,3000001,1,16,17,8,5,4,9,5,2,...,3,3,3,4,1,3,4,1,0,0
3,3000001,2,16,7,9,0,4,9,1,2,...,3,3,3,4,1,3,4,1,0,0
4,3000001,3,10,0,14,7,11,12,2,2,...,1,1,0,2,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2389768,3249999,2,5,12,16,9,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0
2389769,3249999,3,3,17,17,6,12,0,4,2,...,3,3,3,4,1,3,3,1,0,0
2389770,3249999,4,11,9,4,8,1,11,12,2,...,3,3,3,4,1,2,4,1,1,1
2389771,3249999,5,11,6,12,10,1,11,8,4,...,3,3,3,4,1,2,4,1,0,0


In [149]:
uniques.items()

dict_items([('pre_since_opened', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}), ('pre_since_confirmed', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}), ('pre_pterm', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}), ('pre_fterm', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ('pre_till_pclose', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ('pre_till_fclose', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), ('pre_loans_credit_limit', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}), ('pre_loans_next_pay_summ', {0, 1, 2, 3, 4, 5, 6}), ('pre_loans_outstanding', {1, 2, 3, 4, 5}), ('pre_loans_total_overdue', {0, 1}), ('pre_loans_max_overdue_sum', {0, 1, 2, 3}), ('pre_loans_credit_cost_rate', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}), ('pre_loans5', {0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 16}), ('pre_loans530', {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

In [148]:
{feat: (int(max(uniq)+1), min(600, int(round(1.6 * (max(uniq)+1))**0.56))) for feat, uniq in uniques.items()}

{'pre_since_opened': (20, 6),
 'pre_since_confirmed': (18, 6),
 'pre_pterm': (18, 6),
 'pre_fterm': (17, 6),
 'pre_till_pclose': (17, 6),
 'pre_till_fclose': (16, 6),
 'pre_loans_credit_limit': (20, 6),
 'pre_loans_next_pay_summ': (7, 3),
 'pre_loans_outstanding': (6, 3),
 'pre_loans_total_overdue': (2, 1),
 'pre_loans_max_overdue_sum': (4, 2),
 'pre_loans_credit_cost_rate': (14, 5),
 'pre_loans5': (17, 6),
 'pre_loans530': (20, 6),
 'pre_loans3060': (10, 4),
 'pre_loans6090': (5, 3),
 'pre_loans90': (20, 6),
 'is_zero_loans5': (2, 1),
 'is_zero_loans530': (2, 1),
 'is_zero_loans3060': (2, 1),
 'is_zero_loans6090': (2, 1),
 'is_zero_loans90': (2, 1),
 'pre_util': (20, 6),
 'pre_over2limit': (20, 6),
 'pre_maxover2limit': (20, 6),
 'is_zero_util': (2, 1),
 'is_zero_over2limit': (2, 1),
 'is_zero_maxover2limit': (2, 1),
 'enc_paym_0': (4, 2),
 'enc_paym_1': (4, 2),
 'enc_paym_2': (4, 2),
 'enc_paym_3': (4, 2),
 'enc_paym_4': (4, 2),
 'enc_paym_5': (4, 2),
 'enc_paym_6': (4, 2),
 'enc_pay

In [6]:
train_data_0 = pd.read_parquet(f"../data_MliF/train_data/train_data_0.pq")
train_data_1 = pd.read_parquet(f"../data_MliF/train_data/train_data_1.pq")
train_data_3 = pd.read_parquet(f"../data_MliF/train_data/train_data_3.pq")
train_data_11 = pd.read_parquet(f"../data_MliF/train_data/train_data_11.pq")

In [113]:
train_data_0.loc[:, col_names] + 1

Unnamed: 0,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,pre_loans_outstanding,pre_loans_total_overdue,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,19,10,3,4,17,11,12,4,4,1,...,4,4,4,5,2,4,5,2,1,1
1,19,10,15,15,13,13,1,4,4,1,...,1,1,1,5,2,4,5,2,1,1
2,19,10,5,9,2,12,12,1,6,1,...,1,1,1,5,2,3,4,2,2,2
3,5,2,10,13,17,8,13,3,4,1,...,4,4,4,5,2,4,2,2,1,1
4,6,13,16,3,12,13,11,3,4,1,...,4,4,4,5,2,4,5,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1974719,2,10,15,9,11,12,9,5,3,1,...,4,4,4,5,2,3,4,2,1,2
1974720,8,10,5,9,2,12,20,2,5,1,...,4,4,4,5,2,3,5,2,2,2
1974721,10,1,11,9,11,12,17,3,4,1,...,4,4,4,5,2,3,4,2,1,2
1974722,10,17,11,14,11,5,13,3,4,1,...,4,4,4,5,2,3,4,2,1,1


In [114]:
train_data_0.loc[:, col_names]

Unnamed: 0,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,pre_loans_outstanding,pre_loans_total_overdue,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,18,9,2,3,16,10,11,3,3,0,...,3,3,3,4,1,3,4,1,0,0
1,18,9,14,14,12,12,0,3,3,0,...,0,0,0,4,1,3,4,1,0,0
2,18,9,4,8,1,11,11,0,5,0,...,0,0,0,4,1,2,3,1,1,1
3,4,1,9,12,16,7,12,2,3,0,...,3,3,3,4,1,3,1,1,0,0
4,5,12,15,2,11,12,10,2,3,0,...,3,3,3,4,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1974719,1,9,14,8,10,11,8,4,2,0,...,3,3,3,4,1,2,3,1,0,1
1974720,7,9,4,8,1,11,19,1,4,0,...,3,3,3,4,1,2,4,1,1,1
1974721,9,0,10,8,10,11,16,2,3,0,...,3,3,3,4,1,2,3,1,0,1
1974722,9,16,10,13,10,4,12,2,3,0,...,3,3,3,4,1,2,3,1,0,0


In [4]:
test_labels = pd.read_csv(f"data/test_target.csv")
train_labels = pd.read_csv(f"data/train_target.csv")

In [14]:
train_data_0[train_data_0['id'] == 0] 

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0
5,0,6,5,0,11,8,12,11,4,2,...,3,3,3,4,1,2,3,1,0,1
6,0,7,3,9,1,2,12,14,15,5,...,3,3,3,4,1,3,4,1,0,0
7,0,8,2,9,2,3,12,14,15,5,...,3,3,3,4,1,3,4,1,0,0
8,0,9,1,9,11,13,14,8,2,5,...,3,3,3,4,1,2,4,1,0,0
9,0,10,7,9,2,10,8,8,16,4,...,3,3,3,4,1,2,4,1,0,0


In [16]:
train_data_0.groupby("id").agg(seq_len=("rn", "max"))["seq_len"].values

array([10, 14,  3, ...,  7,  5,  3], dtype=int64)

In [24]:
uniques = defaultdict(set)

In [25]:
round(2.6)

3

In [20]:
train_data_0.columns.values

array(['id', 'rn', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
       'pre_fterm', 'pre_till_pclose', 'pre_till_fclose',
       'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_outstanding', 'pre_loans_total_overdue',
       'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate',
       'pre_loans5', 'pre_loans530', 'pre_loans3060', 'pre_loans6090',
       'pre_loans90', 'is_zero_loans5', 'is_zero_loans530',
       'is_zero_loans3060', 'is_zero_loans6090', 'is_zero_loans90',
       'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'is_zero_util',
       'is_zero_over2limit', 'is_zero_maxover2limit', 'enc_paym_0',
       'enc_paym_1', 'enc_paym_2', 'enc_paym_3', 'enc_paym_4',
       'enc_paym_5', 'enc_paym_6', 'enc_paym_7', 'enc_paym_8',
       'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
       'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
       'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20',
       'enc_pay

In [134]:
sample_submission = pd.read_csv(f"MIiF_Кредитный скоринг/sample_submission.csv")

In [132]:
len(np.unique(sample_submission['id'])) 

500000

In [135]:
sample_submission['score'] = 0 

In [111]:
sample_submission[sample_submission['score'] > 0.5]

Unnamed: 0,id,score


In [136]:
sample_submission.to_csv(f"MIiF_Кредитный скоринг/sample_zeros.csv", index=False)

In [137]:
sample_submission_zeros = pd.read_csv(f"MIiF_Кредитный скоринг/sample_zeros.csv")

In [138]:
len(np.unique(sample_submission_zeros['id'])) 

500000

In [130]:
len(sample_submission), len(sample_submission_zeros)

(500000, 500000)

In [139]:
sample_submission = pd.read_csv(f"MIiF_Кредитный скоринг/sample_submission.csv")

In [140]:
sample_submission[sample_submission['score'] > 0.3] 

Unnamed: 0,id,score
4985,3004985,0.307816
6263,3006263,0.567224
9044,3009044,0.339019
9728,3009728,0.460841
12597,3012597,0.357996
...,...,...
498681,3498681,0.371165
498785,3498785,0.302438
498842,3498842,0.429107
499062,3499062,0.306229


In [141]:
from sklearn.metrics import roc_auc_score

In [149]:
# roc_auc_score([0,1,1,1,1], [(0.2, 0.8), (0.49, 0.51), (0.49, 0.51), (0.49, 0.51), (0.49, 0.51)])

In [150]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
# генерируем датасет на 2 класса
X, y = make_classification(n_samples=1000, n_classes=2, random_state=1)
# разделяем его на 2 выборки
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2)
# обучаем модель
model = LogisticRegression(solver='lbfgs')
model.fit(trainX, trainy)

In [151]:
lr_probs = model.predict_proba(testX)

In [152]:
lr_probs

array([[4.02958497e-03, 9.95970415e-01],
       [2.00643855e-03, 9.97993561e-01],
       [9.94696564e-01, 5.30343592e-03],
       [8.47909962e-01, 1.52090038e-01],
       [1.66614096e-03, 9.98333859e-01],
       [4.99240075e-01, 5.00759925e-01],
       [9.80208762e-01, 1.97912375e-02],
       [9.76507391e-01, 2.34926092e-02],
       [1.79866697e-02, 9.82013330e-01],
       [2.93132539e-01, 7.06867461e-01],
       [9.95836132e-01, 4.16386810e-03],
       [2.53456369e-01, 7.46543631e-01],
       [9.92314475e-01, 7.68552501e-03],
       [3.80237693e-02, 9.61976231e-01],
       [6.47256719e-02, 9.35274328e-01],
       [1.91873737e-01, 8.08126263e-01],
       [8.55142720e-01, 1.44857280e-01],
       [4.34150814e-02, 9.56584919e-01],
       [8.43060465e-01, 1.56939535e-01],
       [5.51009943e-02, 9.44899006e-01],
       [1.32996996e-01, 8.67003004e-01],
       [5.89343759e-01, 4.10656241e-01],
       [9.79492824e-01, 2.05071761e-02],
       [3.32717535e-01, 6.67282465e-01],
       [9.957351

In [83]:
data = None
train_dir_path = "MIiF_Кредитный скоринг/train_data/"
test_dir_path = "MIiF_Кредитный скоринг/test_data/" 

for file in os.listdir(train_dir_path):
    if data is None:
        data = pd.read_parquet(f"{train_dir_path}/{file}")
    else:
        data_buff = pd.read_parquet(f"{train_dir_path}/{file}")
        data = pd.concat([data, data_buff])

In [84]:
data 

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2284251,2499999,11,3,9,4,3,1,12,6,2,...,3,3,3,4,1,3,4,1,1,0
2284252,2499999,12,3,9,1,15,12,6,9,3,...,3,3,3,4,1,3,4,1,0,0
2284253,2499999,13,2,9,4,8,1,11,7,3,...,3,3,3,4,1,2,3,1,1,1
2284254,2499999,14,2,9,4,9,1,15,16,2,...,3,3,3,4,1,3,4,1,1,0


In [79]:
pd.concat([train_data_0, train_data_1])

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2107300,499999,8,6,11,14,7,12,11,15,2,...,0,0,3,4,1,3,2,1,0,0
2107301,499999,9,11,13,17,16,8,8,15,2,...,3,3,3,4,1,3,2,1,0,0
2107302,499999,10,1,2,11,8,8,11,11,2,...,3,3,3,4,1,2,3,1,0,1
2107303,499999,11,12,10,1,8,14,11,8,1,...,3,3,3,4,1,2,4,1,0,1


In [76]:
train_data_11

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,2750000,1,2,2,2,14,12,11,3,6,...,3,3,3,4,1,2,4,1,0,0
1,2750000,2,1,2,14,7,10,8,10,1,...,3,3,3,4,1,2,3,1,0,0
2,2750000,3,9,2,4,8,1,11,14,2,...,3,3,3,4,1,2,3,1,1,1
3,2750000,4,9,2,11,16,14,8,9,3,...,3,3,3,4,1,2,4,1,0,0
4,2750000,5,9,2,6,1,0,4,4,2,...,3,3,3,4,1,2,3,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450625,2999999,8,6,5,14,13,1,15,16,2,...,0,0,0,1,1,3,4,1,0,0
2450626,2999999,9,5,3,2,10,15,14,17,2,...,0,0,0,4,1,3,4,1,0,0
2450627,2999999,10,3,16,11,13,14,8,15,5,...,0,0,3,4,1,2,4,1,0,0
2450628,2999999,11,3,6,4,8,1,11,0,5,...,3,3,3,4,1,2,3,1,1,1


In [63]:
test_data_0

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,3000000,1,11,5,17,14,12,11,3,2,...,3,3,3,4,1,3,4,1,0,0
1,3000000,2,19,16,15,9,12,11,16,3,...,3,3,3,4,1,2,4,1,0,0
2,3000001,1,16,17,8,5,4,9,5,2,...,3,3,3,4,1,3,4,1,0,0
3,3000001,2,16,7,9,0,4,9,1,2,...,3,3,3,4,1,3,4,1,0,0
4,3000001,3,10,0,14,7,11,12,2,2,...,1,1,0,2,1,3,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2389768,3249999,2,5,12,16,9,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0
2389769,3249999,3,3,17,17,6,12,0,4,2,...,3,3,3,4,1,3,3,1,0,0
2389770,3249999,4,11,9,4,8,1,11,12,2,...,3,3,3,4,1,2,4,1,1,1
2389771,3249999,5,11,6,12,10,1,11,8,4,...,3,3,3,4,1,2,4,1,0,0


In [25]:
train_data_0 = pd.read_parquet(f"MIiF_Кредитный скоринг/train_data/train_data_0.pq")

In [61]:
train_data_0.isnull().values.any()

False

In [30]:
train_data_0.columns

Index(['id', 'rn', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
       'pre_fterm', 'pre_till_pclose', 'pre_till_fclose',
       'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_outstanding', 'pre_loans_total_overdue',
       'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_loans5',
       'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90',
       'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060',
       'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit',
       'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit',
       'is_zero_maxover2limit', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2',
       'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
       'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
       'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
       'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20',
       'enc_paym_21', 

In [74]:
np.unique(train_data_0['rn'])  

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51],
      dtype=int64)

In [75]:
np.unique(train_data_11['rn'])  

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58], dtype=int64)

In [29]:
print(train_data_0.apply(lambda col: col.unique()))

id                         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
rn                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
pre_since_opened           [18, 4, 5, 3, 2, 1, 7, 8, 15, 14, 10, 13, 12, ...
pre_since_confirmed        [9, 1, 12, 0, 7, 3, 4, 14, 6, 8, 5, 16, 2, 10,...
pre_pterm                  [2, 14, 4, 9, 15, 11, 1, 12, 13, 7, 8, 0, 16, ...
                                                 ...                        
enc_loans_credit_status                                [3, 2, 5, 4, 1, 0, 6]
enc_loans_credit_type                                     [4, 3, 1, 5, 0, 2]
enc_loans_account_cur                                           [1, 2, 0, 3]
pclose_flag                                                           [0, 1]
fclose_flag                                                           [0, 1]
Length: 61, dtype: object
