In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from joblib import dump, load

import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn import init
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler, Sampler
from transformers import AutoModel, AutoTokenizer, AutoConfig

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

from time import time
from colorama import Fore, Back, Style

# Short aliases for the colorama escape codes used in progress messages.
# Foreground colours:
r_, b_, g_, y_, w_ = Fore.RED, Fore.BLUE, Fore.GREEN, Fore.YELLOW, Fore.WHITE
# Background colours:
bb_, by_ = Back.BLACK, Back.YELLOW
# Reset back to the terminal default:
sr_ = Style.RESET_ALL

## Global config

In [None]:
class Config:
    """Notebook-wide settings read by the dataset (`max_len`) and the inference loop (`device`)."""

    # Runtime
    device = 'cuda'
    seed = 42

    # Tokenisation: every excerpt is truncated / padded to this many tokens.
    max_len = 256

    # Batching
    batch_size = 16
    test_batch = 32

    # Optimisation hyper-parameters
    epochs = 3
    lr = 2e-5
    weight_decay = 0.01

    # Number of regression outputs
    num_labels = 1

## Dataset

In [None]:
class RoBERTaDataset(Dataset):
    """Tokenises the `excerpt` column of a frame on the fly.

    Each item is a dict with `input_ids` and `attention_mask` (both long
    tensors); unless `for_test` is set, a float `label` taken from the
    `target` column is included as well.
    """

    def __init__(self, df, tokenizer, for_test=False):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_len = Config.max_len
        self.for_test = for_test
        self.text = df['excerpt'].values
        # Targets are only read when labels are expected to exist.
        if not for_test:
            self.target = df['target'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Collapse every run of whitespace (including newlines) into one space.
        cleaned = ' '.join(self.text[index].split())
        encoded = self.tokenizer.encode_plus(
            cleaned,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )

        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        if self.for_test:
            return sample

        sample['label'] = torch.tensor(self.target[index], dtype=torch.float)
        return sample

### Model Attn 1

In [None]:
# CLRPModel: 467    BaseOne: 474  Base 2: 475     
class AttnOneConifg:
    """Settings for the roberta-base attention models (`CLRPModel`, `LitModel`)."""

    # Backbone: name plus the directory its config / weights are loaded from.
    model_name = 'roberta-base'
    pretrained_model_path = '/kaggle/input/comlitrobertabasescript/'

    seed = 42
    max_len = 256

    # Batching
    batch_size = 16
    test_batch = 32

    # Optimisation hyper-parameters
    epochs = 3
    lr = 2e-5
    weight_decay = 0.01

In [None]:
import torch
import torch.nn as nn

# Using Model
# Base Model One         Base Model Two
class AttentionHead_Ori(nn.Module):
    """Additive attention pooling along dim 1 (no padding mask is applied).

    A score per position is computed as V(tanh(W(features))), soft-maxed over
    dim 1, and used to form a weighted sum of `features` along that axis.
    """

    def __init__(self, h_size, hidden_dim=512):
        super().__init__()
        self.W = nn.Linear(h_size, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        projected = torch.tanh(self.W(features))
        weights = torch.softmax(self.V(projected), dim=1)
        # Weighted sum over dim 1 collapses that axis.
        return (weights * features).sum(dim=1)

class CLRPModel(nn.Module):
    """Transformer backbone + `AttentionHead_Ori` pooling + linear regressor.

    `forward` returns a pair: the regression output and the pooled context
    vector it was computed from.
    """

    def __init__(self):
        super().__init__()
        model_path = AttnOneConifg.pretrained_model_path
        cfg = AutoConfig.from_pretrained(model_path)
        cfg.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        })
        self.h_size = cfg.hidden_size
        self.transformer = AutoModel.from_pretrained(model_path, config=cfg)
        self.head = AttentionHead_Ori(self.h_size)
        self.linear = nn.Linear(self.h_size, 1)

    def forward(self, input_ids, attention_mask):
        token_states = self.transformer(input_ids, attention_mask).last_hidden_state
        pooled = self.head(token_states)
        return self.linear(pooled), pooled

In [None]:
class LitModel(nn.Module):
    """Transformer backbone with an attention-pooling head and a linear regressor.

    The head widths are taken from the loaded config's `hidden_size` instead
    of a hard-coded 768, so the layer shapes always match the backbone that
    `AttnOneConifg.pretrained_model_path` points at (768 stays the value
    whenever the config reports that size).

    `forward` returns `(prediction, context_vector)`.
    """

    def __init__(self):
        super().__init__()
        config = AutoConfig.from_pretrained(AttnOneConifg.pretrained_model_path)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})
        hidden_size = config.hidden_size

        self.roberta = AutoModel.from_pretrained(AttnOneConifg.pretrained_model_path, config=config)
        # Scores each position, then normalises the scores over dim 1.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self.regressor = nn.Sequential(nn.Linear(hidden_size, 1))

    def forward(self, input_ids, attention_mask):
        roberta_output = self.roberta(input_ids, attention_mask)
        # Last entry of hidden_states (available because output_hidden_states=True).
        last_hidden_states = roberta_output.hidden_states[-1]

        weights = self.attention(last_hidden_states)
        context_vector = torch.sum(weights * last_hidden_states, dim=1)

        return self.regressor(context_vector), context_vector

### Model MeanPooling Large and MeanEmbedding

MeanEmbedding完全可以和Model MeanPooling Large合在一起

- MeanEmbedding V1 + SVM 473

In [None]:
# Mean Model
class MeanLargeConfig:
    """Settings for the roberta-large mean-pooling model (`MeanModel`)."""

    # Backbone: name plus the directory its config / weights are loaded from.
    model_name = 'roberta-large'
    pretrained_model_path = '../input/comlitrobertalargescript'

    seed = 42
    max_len = 256

    # Batching
    batch_size = 16
    test_batch = 32

    # Optimisation hyper-parameters
    epochs = 3
    lr = 2e-5
    weight_decay = 0.01

    head_hidden = 512

    # Author's note: this model didn`t help.
    use_multi_sample_dropout = True

In [None]:
# V1 large :  inference model  同时输出 logits 和 pool_out embedding，可以选择 如何取舍两个输出。
#             MeanEmbedding V1 + SVM 473
#                    logits          472
class MeanPooling(nn.Module):
    """Mask-aware mean over dim 1 of `hidden_state`.

    Positions whose `attention_mask` entry is 0 contribute nothing to the
    sum; the divisor is the per-row mask count, clamped to at least 1e-9.
    `hidden_dim` is accepted but not used.
    """

    def __init__(self, hidden_dim):
        super().__init__()

    def forward(self, hidden_state, attention_mask):
        # Broadcast the mask to the shape of hidden_state.
        mask = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        masked_total = torch.sum(hidden_state * mask, 1)
        # Clamp guards against division by zero for an all-zero mask row.
        denom = torch.clamp(mask.sum(1), min=1e-9)
        return masked_total / denom

    
class MeanModel(nn.Module):
    """Mean-pooled transformer regressor with a multi-sample-dropout head.

    `forward` returns `(logits, pool_out)`: the averaged regression output
    and the mean-pooled embedding it was computed from.
    """

    def __init__(self):
        super().__init__()
        model_path = MeanLargeConfig.pretrained_model_path
        self.config = AutoConfig.from_pretrained(model_path)
        self.config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.1,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(model_path, config=self.config)

        width = self.config.hidden_size
        # layer_norm and low_dropout are created but not used in forward().
        self.layer_norm = nn.LayerNorm(width, eps=1e-7)
        self.pooler = MeanPooling(width)

        self.low_dropout = nn.Dropout(0.1)
        self.dropout = nn.Dropout(p=0.5)
        self.regressor = nn.Linear(width, 1)

    def forward(self, input_ids, attention_mask):
        # First backbone output, mean-pooled with the attention mask.
        token_states = self.roberta(input_ids, attention_mask=attention_mask)[0]
        pool_out = self.pooler(token_states, attention_mask)

        # Five independent dropout draws through the same regressor, averaged
        # (author's note: didn`t help).
        samples = []
        for _ in range(5):
            samples.append(self.regressor(self.dropout(pool_out)))
        logits = torch.stack(samples, dim=0).mean(dim=0)

        return (logits, pool_out)

### Attn v2

In [None]:
class AttConfig1:
    """Settings for the roberta-large attention model (`AttModel`)."""

    model_name = 'roberta-large'
    # NOTE(review): empty here — presumably has to point at the backbone
    # directory before AttModel(config) can load anything; confirm.
    pretrained_model_path = ''

    device = 'cuda'
    seed = 42
    max_len = 256

    # Head
    output_hidden_states = True
    head_hidden = 512
    num_labels = 1

    # Optimisation hyper-parameters
    epochs = 3
    lr = 2e-5
    weight_decay = 0.01
    warmup_steps = 50

In [None]:
class AttentionHead(nn.Module):
    """Additive attention pooling over dim 1 that ignores masked-out positions.

    Args:
        in_features: size of the last dimension of `hidden_state`.
        hidden_dim: width of the intermediate tanh projection.

    forward(hidden_state, attention_mask) returns the attention-weighted sum
    of `hidden_state` along dim 1. `attention_mask` may be bool or numeric
    (non-zero = keep); it is unsqueezed on the last axis to align with the
    per-position scores.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # NOTE(review): forward() returns vectors of size in_features, not
        # hidden_dim; the attribute is left as-is for compatibility.
        self.out_features = hidden_dim

    def forward(self, hidden_state, attention_mask):
        att = torch.tanh(self.W(hidden_state))
        score = self.V(att)

        # The mask must be boolean before negation: on an integer mask `~` is
        # a bitwise NOT (1 -> -2, 0 -> -1), and indexing with that result
        # overwrites whole batch rows instead of the padded positions.
        keep = attention_mask.unsqueeze(-1).bool()
        score = score.masked_fill(~keep, -1e9)

        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * hidden_state
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector
    
    
class AttModel(nn.Module):
    """Transformer backbone + masked `AttentionHead` pooling + linear regressor.

    `config` supplies `pretrained_model_path`, `head_hidden` and `num_labels`.
    `attn_type` is accepted but not used. `forward` returns `(logits, x)`
    where `x` is the pooled vector fed to the regressor.
    """

    def __init__(self, config, attn_type='tradition'):
        super().__init__()
        model_path = config.pretrained_model_path
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.1,
            "layer_norm_eps": 1e-7,
        }
        self.config = AutoConfig.from_pretrained(model_path)
        self.config.update(overrides)
        self.roberta = AutoModel.from_pretrained(model_path, config=self.config)

        width = self.config.hidden_size
        self.head = AttentionHead(width, config.head_hidden)

        # layer_norm and both dropouts are created but not used in forward().
        self.layer_norm = nn.LayerNorm(width, eps=1e-5)
        self.regressor = nn.Linear(width, config.num_labels)

        self.dropout = nn.Dropout(p=0.1)
        self.m_dropout = nn.Dropout(p=0.5)

    def forward(self, input_ids, attention_mask):
        # Output index 2 is indexed with [-1] to take its last entry.
        layer_states = self.roberta(input_ids, attention_mask=attention_mask)[2]
        x = self.head(layer_states[-1], attention_mask)
        return self.regressor(x), x

### Attn 3

In [None]:
class AttConfig2:
    """Settings for the roberta-base attention model (`AttModel2`)."""

    model_name = 'roberta-base'
    # NOTE(review): empty here — presumably has to point at the backbone
    # directory before AttModel2(config) can load anything; confirm.
    pretrained_model_path = ''

    device = 'cuda'
    seed = 42
    max_len = 256

    # Head
    output_hidden_states = True
    head_hidden = 512
    num_labels = 1

    # Optimisation hyper-parameters
    epochs = 3
    lr = 2e-5
    weight_decay = 0.01
    warmup_steps = 50

In [None]:
class AttentionHead(nn.Module):
    """Additive attention pooling over dim 1 that ignores masked-out positions.

    Args:
        in_features: size of the last dimension of `hidden_state`.
        hidden_dim: width of the intermediate tanh projection.

    forward(hidden_state, attention_mask) returns the attention-weighted sum
    of `hidden_state` along dim 1. `attention_mask` may be bool or numeric
    (non-zero = keep); it is unsqueezed on the last axis to align with the
    per-position scores.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # NOTE(review): forward() returns vectors of size in_features, not
        # hidden_dim; the attribute is left as-is for compatibility.
        self.out_features = hidden_dim

    def forward(self, hidden_state, attention_mask):

        att = torch.tanh(self.W(hidden_state))
        score = self.V(att)

        # Convert to bool before negating: `~` on an integer mask is bitwise
        # NOT (1 -> -2, 0 -> -1), which would index batch rows rather than
        # select the padded positions.
        keep = attention_mask.unsqueeze(-1).bool()
        score = score.masked_fill(~keep, -1e9)

        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * hidden_state
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector
    
    
class Mish(nn.Module):
    """Element-wise activation computing x * tanh(softplus(x))."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        gate = torch.tanh(F.softplus(x))
        return x * gate
    
    
class AttModel2(nn.Module):
    """Transformer backbone + masked `AttentionHead` pooling + LayerNorm + MLP regressor.

    `config` supplies `pretrained_model_path` and `head_hidden`. `attn_type`
    is accepted but not used. `forward` returns `(logits, x)` where `x` is
    the layer-normalised pooled vector.
    """

    def __init__(self, config, attn_type='tradition'):
        super().__init__()
        model_path = config.pretrained_model_path
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.1,
            "layer_norm_eps": 1e-7,
        }
        self.config = AutoConfig.from_pretrained(model_path)
        self.config.update(overrides)
        self.roberta = AutoModel.from_pretrained(model_path, config=self.config)

        width = self.config.hidden_size
        self.head = AttentionHead(width, config.head_hidden)

        self.layer_norm = nn.LayerNorm(width, eps=1e-5)
        self.regressor = nn.Sequential(
            nn.Linear(width, 512),
            Mish(),
            nn.Linear(512, 1),
        )

        # Both dropouts are created but not used in forward().
        self.dropout = nn.Dropout(p=0.1)
        self.m_dropout = nn.Dropout(p=0.5)

    def forward(self, input_ids, attention_mask):
        # Output index 2 is indexed with [-1] to take its last entry.
        layer_states = self.roberta(input_ids, attention_mask=attention_mask)[2]
        pooled = self.head(layer_states[-1], attention_mask)

        x = self.layer_norm(pooled)
        return self.regressor(x), x

### Mean v2

In [None]:
# Mean Model
class MeanV2BaseConfig:
    """Settings for the roberta-base variant of `MeanModel_v2`."""

    model_name = 'roberta-base'
    # NOTE(review): empty here — presumably has to point at the backbone
    # directory before the model can be built; confirm.
    pretrained_model_path = ''

    seed = 42
    max_len = 256
    head_hidden = 512

    # Batching
    batch_size = 16
    test_batch = 32

    # Optimisation hyper-parameters
    epochs = 3
    lr = 2e-5
    weight_decay = 0.01

In [None]:
# Mean Large Model
class MeanV2LargeConfig:
    """Settings for the roberta-large variant of `MeanModel_v2`."""

    model_name = 'roberta-large'
    # NOTE(review): empty here — presumably has to point at the backbone
    # directory before MeanModel_v2(config) can load anything; confirm.
    pretrained_model_path = ''

    seed = 42
    max_len = 256
    head_hidden = 512

    # Batching
    batch_size = 16
    test_batch = 32

    # Optimisation hyper-parameters
    epochs = 3
    lr = 2e-5
    weight_decay = 0.01

In [None]:
# V2 base :  inference model

class MeanPooling(nn.Module):
    """Average `hidden_state` over dim 1, counting only positions kept by the mask.

    `hidden_dim` is accepted but not used. The per-row divisor is clamped to
    at least 1e-9 so a fully masked row does not divide by zero.
    """

    def __init__(self, hidden_dim):
        super().__init__()

    def forward(self, hidden_state, attention_mask):
        # Expand the mask to the shape of hidden_state and cast it to float.
        weights = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        numerator = torch.sum(hidden_state * weights, 1)
        denominator = torch.clamp(weights.sum(1), min=1e-9)
        return numerator / denominator


#Mish - "Mish: A Self Regularized Non-Monotonic Neural Activation Function"
#https://arxiv.org/abs/1908.08681v1
class Mish(nn.Module):
    """Mish activation: multiplies the input by tanh(softplus(input))."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        squashed = torch.tanh(F.softplus(x))
        return x * squashed

    
class MeanModel_v2(nn.Module):
    """Mean-pooled transformer regressor with LayerNorm and a Mish MLP head.

    Args:
        config: object exposing `pretrained_model_path`.

    `forward` returns `(logits, pool_out)` where `pool_out` is the
    layer-normalised mean-pooled embedding fed to the regressor.
    """

    def __init__(self, config):
        super().__init__()
        self.config = AutoConfig.from_pretrained(config.pretrained_model_path)
        self.config.update({
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.0,
                "attention_probs_dropout_prob": 0.1,
                "layer_norm_eps": 1e-7
                })
        self.roberta = AutoModel.from_pretrained(config.pretrained_model_path,
                                                 config=self.config)

        self.pooler = MeanPooling(self.config.hidden_size)
        self.layer_norm = nn.LayerNorm(self.config.hidden_size, eps=1e-5)

        # Created but not applied in forward() (the call there is commented out).
        self.dropout = nn.Dropout(0.5)
        self.regressor = nn.Sequential(
                nn.Linear(self.config.hidden_size, 512),
                Mish(),
                nn.Linear(512, 1)
        )

        self.std = 0.02
        # `.apply` visits every sub-module. Passing the nn.Sequential container
        # straight to `_init_weights` matched neither isinstance branch, so the
        # head's Linear layers were never re-initialised.
        self.regressor.apply(self._init_weights)

    def _init_weights(self, module):
        """Kaiming-normal weights / zero bias for Linear; unit weight / zero bias for LayerNorm."""
        if isinstance(module, nn.Linear):
            # module.weight.data.normal_(mean=0.0, std=self.std)
            init.kaiming_normal_(module.weight, mode='fan_in')
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)

        ## MeanPooling
        hidden_states = outputs[0]
        pool_out = self.pooler(hidden_states, attention_mask)
        # pool_out = self.dropout(pool_out)
        pool_out = self.layer_norm(pool_out)
        logits = self.regressor(pool_out)

        return logits, pool_out

### Infer func

In [None]:
def get_test_data(df):
    """Build an unshuffled, label-free DataLoader over `df`.

    The tokenizer is loaded from the serialized object at
    /kaggle/input/tokenizer/roberta_tk.pt on every call. Batches hold 32
    items and no trailing partial batch is dropped.
    """
    tok = torch.load('/kaggle/input/tokenizer/roberta_tk.pt')
    dataset = RoBERTaDataset(df, tok, for_test=True)
    loader_kwargs = dict(
        batch_size=32,
        num_workers=4,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    return DataLoader(dataset, **loader_kwargs)

In [None]:
def reset_memory():
    """Run the garbage collector and, when CUDA is available, release cached GPU memory.

    The CUDA calls are skipped when `torch.cuda.is_available()` is False,
    because `torch.cuda.synchronize()` raises when there is no CUDA device.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Wait for pending kernels before handing cached blocks back.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

In [None]:
def inference(test_dataloader, model_dirs, model=None, n_models=5, ckpt_bias=0, with_embedding=False):
    """Run `n_models` checkpoints over `test_dataloader` and average their predictions.

    Args:
        test_dataloader: iterable of batches with 'input_ids' and 'attention_mask'.
        model_dirs: checkpoint paths; entries 0..n_models-1 are used.
        model: when given, each checkpoint is a state dict loaded into this
            instance. When None, each checkpoint is loaded with `torch.load`
            as a whole model object and freed after use.
        n_models: number of checkpoints to evaluate.
        ckpt_bias: accepted for compatibility; not used.
        with_embedding: also collect the second output of two-output models.

    Returns:
        The element-wise mean prediction over checkpoints, or
        `(mean_predictions, [embedding array per checkpoint])` when
        `with_embedding` is True.
    """
    # Decide the loading strategy once. Relying on `if model:` broke the
    # `model=None` path: after the first torch.load the name was truthy, so
    # later checkpoints were fed to load_state_dict and never freed.
    reuse_model = model is not None
    cuda_ready = torch.cuda.is_available()

    models_preds = []
    models_embedding = []
    for model_num in range(n_models):
        print(f'{by_}{r_}  >>> Inference # {model_num+1}/{n_models}  {sr_}')
        if cuda_ready:
            torch.cuda.synchronize()

        # load
        model_path = model_dirs[model_num]
        print(f" ### Using {model_path}")
        if reuse_model:
            model.load_state_dict(torch.load(model_path, map_location=Config.device))
            net = model
        else:
            net = torch.load(model_path)
        net.to(Config.device)

        # predict
        fold_preds = []
        embeddings = []
        net.eval()
        with torch.no_grad():
            for batch in test_dataloader:
                sent_id = batch['input_ids'].to(Config.device)
                mask = batch['attention_mask'].to(Config.device)
                outputs = net(sent_id, mask)

                # Separate (logits, embedding) outputs by type, not by len():
                # a logits tensor for a 2-sample batch also has len() == 2 and
                # used to be truncated to its first row.
                if isinstance(outputs, (tuple, list)):
                    preds = outputs[0]
                    if with_embedding and len(outputs) == 2:
                        embeddings.extend(outputs[1].detach().cpu().numpy())
                else:
                    preds = outputs
                fold_preds += preds.flatten().cpu().tolist()

        # records
        models_preds.append(fold_preds)
        if with_embedding:
            models_embedding.append(np.array(embeddings))

        # In load_state_dict mode the caller's model must not be deleted here;
        # only a model object loaded inside this loop is released.
        if not reuse_model:
            del net
            gc.collect()
            if cuda_ready:
                torch.cuda.synchronize()
                torch.cuda.empty_cache()

        print(f'! Model Complete. ++++++++++')
    print()

    # output
    models_preds = np.array(models_preds).mean(axis=0)
    if not with_embedding:
        return models_preds
    else:
        return models_preds, models_embedding

In [None]:
def embedding_svr_test(embeddings, num_pred, bert_nums=5, svr_nfolds=10, save_dir="./"):
    """Average the predictions of saved SVR models over a list of embedding matrices.

    Args:
        embeddings: one feature matrix per transformer checkpoint.
        num_pred: number of rows to predict (length of the result).
        bert_nums: divisor for the transformer-checkpoint dimension.
        svr_nfolds: number of SVR files loaded per embedding matrix.
        save_dir: prefix the SVR files are read from. This used to be an
            undefined global, so the function raised NameError when called.

    Returns:
        The summed SVR predictions divided by `bert_nums * svr_nfolds`.

    NOTE(review): files are read as `svr_{index}_{i}.bin` with `i` running over
    folds, while `embedding_svr_train` writes `svr_{emb}_{index}.bin` — confirm
    the naming scheme matches the models actually saved.
    """
    # SVM predict: 5 SVR model
    results = np.zeros(num_pred)
    for index, X_test in enumerate(embeddings):
        print(f'{by_}{r_}  SVR#{index+1} predicting {sr_}')
        for i in range(svr_nfolds):
            svr = load(save_dir + f'svr_{index}_{i}.bin')
            preds = svr.predict(X_test)
            results += preds
    print(f'SVR Complete.')

    return results / bert_nums / svr_nfolds

In [None]:
# !pip install textstat --no-index --find-links=file:///kaggle/input/textstat-local/textstat 

# import textstat

# def add_textstat_features(df):
#     ### You can add/remove any feature below and it will be used in training and test
#     df['coleman_liau_index'] = df['excerpt'].apply(lambda x: textstat.coleman_liau_index(x))
#     df['flesch_reading_ease'] = df['excerpt'].apply(lambda x: textstat.flesch_reading_ease(x))
#     df['smog_index'] = df['excerpt'].apply(lambda x: textstat.smog_index(x))
#     df['dale_chall_readability_score'] = df['excerpt'].apply(lambda x: textstat.dale_chall_readability_score(x))
#     return df

# def difficult_words_ratio(df, input_col='excerpt', output_col='difficult_words_ratio'):
#     print(f"Applying {output_col} to data set.")
#     df[output_col] = df[input_col].apply(lambda x: textstat.difficult_words(x))
#     df[output_col] = df.apply(lambda x: x[output_col] / textstat.lexicon_count(x[input_col]), axis=1)
#     return df 

# def syllable_ratio(df, input_col='excerpt', output_col='syllable_ratio'):
#     print(f"Applying {output_col} to data set.")
#     df[output_col] = df[input_col].apply(lambda x: textstat.syllable_count(x))
#     df[output_col] = df.apply(lambda x: x[output_col] / textstat.lexicon_count(x[input_col]), axis=1)
#     return df

In [None]:
def save_embeddings(embeds, path):
    """Pickle `embeds` into the file at `path`, replacing any existing content."""
    import pickle
    with open(path, mode="wb") as sink:
        pickle.dump(embeds, sink)

def load_embeddings(path):
    """Return the object unpickled from the file at `path`.

    Unpickling can execute arbitrary code, so only use this on files this
    notebook wrote itself.
    """
    import pickle
    with open(path, mode="rb") as source:
        return pickle.load(source)

## Train SVR

In [None]:
# Feature extraction for the SVR stage: each ensemble below is run over the
# same frame, its averaged prediction is stored as a column (m1..m4) and its
# per-checkpoint embeddings are pickled to ./embedN.bin.
# NOTE(review): despite the `test_*` names, the active read is train_data.csv
# (the test.csv read is commented out); the frame is reused as `train_data`
# further down.
# test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
test_df = pd.read_csv("/kaggle/input/cmlit-fold/train_data.csv")
test_df['excerpt'] = test_df['excerpt'].apply(lambda x: x.replace('\n',' '))



# The loader is rebuilt before every ensemble because it is deleted after each run.
test_dataloader = get_test_data(test_df)
#### LitModel:
# Checkpoint files for this ensemble are numbered model_1 .. model_5.
Lit = [f'../input/comlitothers/model_{i + 1}.bin' for i in range(5)]
litmodel = LitModel()  # if use load_state_dict, init model
# ckpt_bias is accepted by inference() but not used there.
pred_litm, embed1 = inference(test_dataloader, Lit, model=litmodel, ckpt_bias=1,with_embedding=True)
test_df["m1"] = pred_litm
save_embeddings(embed1, "./embed1.bin")

del litmodel, test_dataloader
reset_memory()


# #### mean large v2:
# MeanL_3 = [ f'/kaggle/input/newmeanlarge/model_{i}.bin' for i in range(5)]
# config = MeanV2LargeConfig()
# meanmodel3 = MeanModel_v2(config)
# pred_mean_v3, embed2 = inference(test_dataloader, MeanL_3, model=meanmodel3, with_embedding=True)
# # svr_preds = embedding_svr_test(embeddings, len(test_df))
# test_df["m2"] = pred_mean_v3
# save_embeddings(embed2, "./embed2.bin")

# del meanmodel3
# reset_memory()


test_dataloader = get_test_data(test_df)
#### attn large 1:
AttL_1 = [ f'/kaggle/input/largeattnlit/model_{i}.bin' for i in range(5)]
# NOTE(review): AttConfig1.pretrained_model_path is '' in this file —
# presumably it must be set before AttModel(config) can load; confirm.
config = AttConfig1()
attmodel1 = AttModel(config)
pred_attL_v1, embed3 = inference(test_dataloader, AttL_1, model=attmodel1, with_embedding=True)
# svr_preds = embedding_svr_test(embeddings, len(test_df))
test_df["m3"] = pred_attL_v1
save_embeddings(embed3, "./embed3.bin")

del attmodel1, test_dataloader
reset_memory()



test_dataloader = get_test_data(test_df)
#### mean large v2 reinit:
MeanL_4 = [ f'/kaggle/input/meanlargereinit/model_{i}.bin' for i in range(5)]
# NOTE(review): MeanV2LargeConfig.pretrained_model_path is also '' — confirm.
config = MeanV2LargeConfig()
meanmodel4 = MeanModel_v2(config)
pred_mean_v4, embed4 = inference(test_dataloader, MeanL_4, model=meanmodel4, with_embedding=True)
# svr_preds = embedding_svr_test(embeddings, len(test_df))
test_df["m4"] = pred_mean_v4
save_embeddings(embed4, "./embed4.bin")

del meanmodel4, test_dataloader
reset_memory()



test_dataloader = get_test_data(test_df)
#### attn large reinit 2:
# This run fills the m2 / embed2 slot that the commented-out
# "mean large v2" block above also targets.
AttL_2 = [ f'/kaggle/input/attlargereinit/model_{i}.bin' for i in range(5)]
config = AttConfig1()
attmodel2 = AttModel(config)
pred_attL_v2, embed2  = inference(test_dataloader, AttL_2, model=attmodel2, with_embedding=True)
# svr_preds = embedding_svr_test(embeddings, len(test_df))
test_df["m2"] = pred_attL_v2
save_embeddings(embed2, "./embed2.bin")

del attmodel2, test_dataloader
reset_memory()

In [None]:
# embedding
# features = embed3

# 预测值
# text stats features
# test_df = add_textstat_features(test_df)
# test_df = difficult_words_ratio(test_df)
# test_df = syllable_ratio(test_df)

# print(test_df.head())
# test_df.to_csv("./mldata.csv", index=False)

In [None]:
# import os

# # !pip install kaggle

# os.environ["KAGGLE_USERNAME"] = "racleray"
# os.environ["KAGGLE_KEY"] = ""  # credential redacted: never commit API keys; revoke and regenerate the key that was pasted here

# !kaggle datasets metadata racleray/comlitmldata
# # !mv dataset-metadata.json model/

# # 最好不要有文件夹
# !kaggle datasets version -p ./ -m "Updated data base fine"

### ML

In [None]:
# Reload the pickled embedding lists written by the feature-extraction cell
# and print how many entries each list holds (one entry per checkpoint).
embed1 = load_embeddings("./embed1.bin")
print(len(embed1))

embed2 = load_embeddings("./embed2.bin")
print(len(embed2))

embed3 = load_embeddings("./embed3.bin")
print(len(embed3))

embed4 = load_embeddings("./embed4.bin")
print(len(embed4))

SVR

In [None]:
# rmse_score SVR
from joblib import dump, load
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold


# `train_data` is the same object as `test_df` (no copy is made), so the
# 'bins' column added below appears on `test_df` as well.
train_data = test_df

## Reset bins
# Bin count is floor(1 + log2(n)); the target is cut into that many
# equal-width intervals and labelled with the integer bin index.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:, 'bins'] = pd.cut(train_data['target'], bins=num_bins, labels=False)

# Module-level arrays used by the SVR cells below.
target = train_data['target'].to_numpy()
bins = train_data.bins.to_numpy()

In [None]:
# Preview the first rows of the frame, including the newly added 'bins' column.
train_data.head()

In [None]:
def rmse_score(y_true,y_pred):
    """Return the square root of `mean_squared_error(y_true, y_pred)`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# input whole train_data not shuffled
def embedding_svr_train(df, bins, models_embedding, save_dir="./", emb=1, bert_nums=5, svr_nfolds=10, C=8, kernel='rbf'):
    """Cross-validate an SVR on each embedding matrix and save one fitted SVR per matrix.

    Args:
        df: frame whose 'target' column holds the regression labels, in the
            same row order as the embedding matrices.
        bins: per-row stratification labels for StratifiedKFold.
        models_embedding: list of feature matrices (one per transformer checkpoint).
        save_dir: prefix for the saved SVR files (created if missing).
        emb: identifier of the embedding set; first number in the file name.
        bert_nums: accepted for compatibility; not used.
        svr_nfolds: number of stratified folds.
        C, kernel: passed to `SVR`.

    Returns:
        A list with one list of per-fold RMSE scores for each embedding matrix.

    For every matrix the same SVR object is refitted on each fold, and the
    model left after the last fold is written to `svr_{emb}_{index}.bin`.
    """
    # Labels come from the frame that was passed in. They used to be read from
    # a notebook-level `target` global, leaving `df` unused and making the
    # function depend on hidden state.
    target = df['target'].to_numpy()

    mean_scores = []
    records = []

    # get embeddings
    #print(models_embedding[0].shape)
    #print(type(models_embedding[0]))
    print("Embedding got.")


    # SVM training: 5 SVR model
    for index, X in enumerate(models_embedding):
        print(f'{by_}{r_}  SVR#{index+1} training {sr_}')
        scores = []
        model = SVR(C=C, kernel=kernel, gamma='auto')
        # new kfold
        kfold = StratifiedKFold(n_splits=svr_nfolds, shuffle=True, random_state=42)
        for i, (train_idx, valid_idx) in tqdm(enumerate(kfold.split(X, bins))):
#             model = SVR(C=C, kernel=kernel, gamma='auto')
            X_train, y_train = X[train_idx], target[train_idx]
            X_valid, y_valid = X[valid_idx], target[valid_idx]

            model.fit(X_train, y_train)

            prediction = model.predict(X_valid)
            score = rmse_score(prediction, y_valid)
            scores.append(score)
            print(f'\t\t{y_}SVR {index} Fold {i} , rmse score: {score:.4f} {sr_}')

        # Only the model fitted on the final fold is persisted.
        os.makedirs(save_dir, exist_ok=True)
        dump(model, save_dir + f'svr_{emb}_{index}.bin')

        mean_score = np.mean(scores)
        print(f'\t{r_}SVR {index} mean rmse score: {mean_score:.4f} {sr_}')
        mean_scores.append(mean_score)
        records.append(scores)

    print(f'Avg rmse score of 5 SVR: {np.mean(mean_scores):.4f}')

    return records

In [None]:
# SVR on the embed1 features with C=100, 5 folds; writes ./svr_1_<k>.bin and displays the per-fold RMSE lists.
embedding_svr_train(train_data, bins, embed1, emb=1, svr_nfolds=5, C=100) 

In [None]:
# SVR on the embed1 features with C=10, 5 folds; `emb=1` (the default) is spelled out so the output file names are explicit.
embedding_svr_train(train_data, bins, embed1, emb=1, svr_nfolds=5, C=10)

In [None]:
# SVR on the embed1 features with C=200, 5 folds; overwrites ./svr_1_<k>.bin from the earlier embed1 runs.
embedding_svr_train(train_data, bins, embed1, emb=1, svr_nfolds=5, C=200)

In [None]:
# SVR on the embed2 features with C=200, 5 folds; writes ./svr_2_<k>.bin.
embedding_svr_train(train_data, bins, embed2, emb=2, svr_nfolds=5, C=200)

In [None]:
# SVR on the embed3 features with C=200, 5 folds; writes ./svr_3_<k>.bin.
embedding_svr_train(train_data, bins, embed3, emb=3, svr_nfolds=5, C=200)

In [None]:
# SVR on the embed3 features with C=250, 5 folds. `emb=3` is passed explicitly:
# with the default emb=1 the models trained on embed3 were written to
# ./svr_1_<k>.bin, overwriting the SVRs trained on embed1.
embedding_svr_train(train_data, bins, embed3, emb=3, svr_nfolds=5, C=250)

In [None]:
# SVR on the embed4 features with C=200, 5 folds; writes ./svr_4_<k>.bin.
embedding_svr_train(train_data, bins, embed4, emb=4, svr_nfolds=5, C=200)

In [None]:
# import os

# # !pip install kaggle

# os.environ["KAGGLE_USERNAME"] = ""
# os.environ["KAGGLE_KEY"] = ""

# !kaggle datasets metadata racleray/svrmodel
# # !mv dataset-metadata.json model/

# # 最好不要有文件夹
# !kaggle datasets version -p ./ -m "Updated data base fine"

### Light GBM

过拟合

In [None]:
# ml_df = pd.read_csv("../input/comlitmldata/mldata.csv")
# ml_df.head()

In [None]:
# Smaller maxbin: reduces train accuracy but has potential to increase generalization
# Bigger min_data_in_leaf: has potential to reduce overfitting

# params = {
# 'boosting_type': 'gbdt',
# 'objective': 'regression',
# 'metric': 'rmse',
#     # baseline at 100 for min_data_in_leaf
# 'min_data_in_leaf': 100,
#     # baseline at .8
# 'feature_fraction': .8,
#     # baseline at .8
# 'bagging_fraction': 0.8,
# 'bagging_freq': 10,
# 'max_depth': 10,
# 'num_leaves': 32,
# 'learning_rate': 0.05,
#     # baseline at max_bin 256
# "max_bin": 100,
# "n_estimators": 10000,
# }


# params = {
#     'boosting_type': 'gbdt',
#     'metric': 'rmse',
#     'objective': 'regression',
#     'verbose': -1,
#     'learning_rate': 0.05,
#     'max_depth': 10,
#     'feature_pre_filter': False,
#     'lambda_l1': 2.215942517163985,
#     'lambda_l2': 0.0015606472088872934,
#     'num_leaves': 2,
#     'feature_fraction': 0.8999999999999999,
#     'bagging_fraction': 1.0,
#     'bagging_freq': 0,
#     'min_child_samples': 30,
# }


# lgm_data = ml_df.copy()

# cols2remove = ['url_legal'
#                , 'excerpt'
#                , 'id'
#                , 'license'
#                , 'kfold'
#                , 'bins'
#                , 'standard_error'
# #                , 'm2', 'm3', 'm4', 'm1'
#               ]

# lgm_data = lgm_data.drop(columns=cols2remove)


# lgm_data.head()

In [None]:
# import lightgbm as lgm

# ## Reset bins
# num_bins = int(np.floor(1 + np.log2(len(lgm_data))))
# lgm_data.loc[:, 'bins'] = pd.cut(lgm_data['target'], bins=num_bins, labels=False)

# X = lgm_data.loc[:, lgm_data.columns != 'target'].to_numpy()
# target = lgm_data['target'].to_numpy()
# bins = lgm_data.bins.to_numpy()


# pred = np.zeros(ml_df.shape[0])
# rmses = []
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# for i, (train_idx, valid_idx) in tqdm(enumerate(kfold.split(X, bins))):
#     X_train, y_train = X[train_idx], target[train_idx]
#     X_valid, y_valid = X[valid_idx], target[valid_idx]
    
#     lgm_train_set = lgm.Dataset(data=X_train, label=y_train)
#     lgm_valid_set = lgm.Dataset(data=X_valid, label=y_valid, reference=lgm_train_set)
    
#     model = lgb.train(
#         params,
#         lgm_train_set, 
#         num_boost_round=1000,
#         early_stopping_rounds=10,
#         valid_sets=[lgm_train_set, lgm_valid_set], 
#         verbose_eval=-1
#     )

#     y_pred = model.predict(X_valid)
#     rmse = rmse_score(y_pred, y_valid)
#     rmses.append(rmse)
    
#     # tmp_pred = model.predict(X_test)
#     # pred += tmp_pred / 5
    
# print("\n", "Mean Fold RMSE:", np.mean(rmses))

#### Only SVR inference[Test using]

-   id    target
- 0  c0f722661 -0.367501
- 1  f0953f0a5 -0.398820
- 2  0df072751 -0.537474
- 3  04caf4e0c -2.299636
- 4  0e63f8bea -1.962926
- 5  12537fe78 -1.093182
- 6  965e592c0  0.077126

问题在于 需要合适的 embedding， 容易过拟合，需要bagging

In [None]:
# # rmse_score SVR
# from joblib import dump, load
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import StratifiedKFold


# def rmse_score(y_true,y_pred):
#     return np.sqrt(mean_squared_error(y_true,y_pred))


# def embedding_svr_test(df, save_dir, bert_path, bert_nums=5, svr_nfolds=10):
#     # get embeddings
#     models_embedding = []
#     for fold_num in range(bert_nums):
#         print(f'{by_}{r_}  Model#{fold_num+1} inferencing {sr_}')
#         device = Config.device

#         test_dataloader = get_test_data(df)

#         model = MeanModelEmbedding()
#         model.load_state_dict(torch.load(bert_path + f'model_{fold_num}.bin'))
#         model.to(device)
#         model.eval()

#         embeddings = []
#         with torch.no_grad():
#             for i, batch in tqdm(enumerate(test_dataloader)):
#                 sent_id, mask = batch['input_ids'].to(Config.device), batch['attention_mask'].to(Config.device)
#                 outputs = model(sent_id, mask)
#                 outputs = outputs.detach().cpu().numpy()
#                 embeddings.extend(outputs)
#             embeddings = np.array(embeddings)
#         models_embedding.append(embeddings)

#         del model
#         gc.collect()
#         torch.cuda.empty_cache()
        
#     print(f'Embedding got.')

#     # SVM predict: 5 SVR model
#     results = np.zeros((df.shape[0]))
#     for index, X_test in enumerate(models_embedding):
#         print(f'{by_}{r_}  SVR#{index+1} predicting {sr_}')
#         for i in range(svr_nfolds):
#             svr = load(save_dir + f'svr_{index}_{i}.bin')
#             preds = svr.predict(X_test)
#             results += preds
            
#     print(f'Complete.')

#     return results / bert_nums / svr_nfolds