1. データの読み込み

In [3]:
import math
import random
import time
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import transformers as T
from random import shuffle
from pathlib import Path
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import tqdm
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import naive_bayes
from scipy.optimize import minimize, minimize_scalar
import texthero as hero

In [4]:
warnings.filterwarnings("ignore")

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
def seed_torch(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed = 471
seed_torch(seed)

In [7]:
class config:
    DATA_DIR = './dataset/data1'
    OUTPUT_DIR ='./result/result9'
if not os.path.isdir(config.OUTPUT_DIR):
    os.makedirs(config.OUTPUT_DIR)

In [8]:
def opt_fbeta_threshold(y_true, y_pred):
    """fbeta score計算時のthresholdを最適化"""
    def opt_(x): 
        return -fbeta_score(y_true, y_pred >= x, beta=7)
    result = minimize(opt_, x0=np.array([0.1]), method='Powell')
    best_threshold = result['x'].item()
    return best_threshold


def metrics(y_true, y_pred):
    """fbeta(beta=7)の閾値最適化評価関数"""
    bt = opt_fbeta_threshold(y_true, y_pred)
    print(f"bt:{bt}")
    score = fbeta_score(y_true, y_pred >= bt, beta=7)
    return score, bt

In [9]:

#stop words list
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 
			'ours', 'ourselves', 'you', 'your', 'yours', 
			'yourself', 'yourselves', 'he', 'him', 'his', 
			'himself', 'she', 'her', 'hers', 'herself', 
			'it', 'its', 'itself', 'they', 'them', 'their', 
			'theirs', 'themselves', 'what', 'which', 'who', 
			'whom', 'this', 'that', 'these', 'those', 'am', 
			'is', 'are', 'was', 'were', 'be', 'been', 'being', 
			'have', 'has', 'had', 'having', 'do', 'does', 'did',
			'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
			'because', 'as', 'until', 'while', 'of', 'at', 
			'by', 'for', 'with', 'about', 'against', 'between',
			'into', 'through', 'during', 'before', 'after', 
			'above', 'below', 'to', 'from', 'up', 'down', 'in',
			'out', 'on', 'off', 'over', 'under', 'again', 
			'further', 'then', 'once', 'here', 'there', 'when', 
			'where', 'why', 'how', 'all', 'any', 'both', 'each', 
			'few', 'more', 'most', 'other', 'some', 'such', 'no', 
			'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 
			'very', 's', 't', 'can', 'will', 'just', 'don', 
			'should', 'now', '']

#cleaning up text
import re
def get_only_chars(line):

    clean_line = ""

    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    for char in line:
        if char in 'qwertyuiopasdfghjklzxcvbnm ':
            clean_line += char
        else:
            clean_line += ' '

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    if clean_line[0] == ' ':
        clean_line = clean_line[1:]
    return clean_line

########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################

#for the first time you use wordnet
#import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet 

def synonym_replacement(words, n):
	new_words = words.copy()
	random_word_list = list(set([word for word in words if word not in stop_words]))
	random.shuffle(random_word_list)
	num_replaced = 0
	for random_word in random_word_list:
		synonyms = get_synonyms(random_word)
		if len(synonyms) >= 1:
			synonym = random.choice(list(synonyms))
			new_words = [synonym if word == random_word else word for word in new_words]
			#print("replaced", random_word, "with", synonym)
			num_replaced += 1
		if num_replaced >= n: #only replace up to n words
			break

	#this is stupid but we need it, trust me
	sentence = ' '.join(new_words)
	new_words = sentence.split(' ')

	return new_words

def get_synonyms(word):
	synonyms = set()
	for syn in wordnet.synsets(word): 
		for l in syn.lemmas(): 
			synonym = l.name().replace("_", " ").replace("-", " ").lower()
			synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
			synonyms.add(synonym) 
	if word in synonyms:
		synonyms.remove(word)
	return list(synonyms)

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):

	#obviously, if there's only one word, don't delete it
	if len(words) == 1:
		return words

	#randomly delete words with probability p
	new_words = []
	for word in words:
		r = random.uniform(0, 1)
		if r > p:
			new_words.append(word)

	#if you end up deleting all words, just return a random word
	if len(new_words) == 0:
		rand_int = random.randint(0, len(words)-1)
		return [words[rand_int]]

	return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
	new_words = words.copy()
	for _ in range(n):
		new_words = swap_word(new_words)
	return new_words

def swap_word(new_words):
	random_idx_1 = random.randint(0, len(new_words)-1)
	random_idx_2 = random_idx_1
	counter = 0
	while random_idx_2 == random_idx_1:
		random_idx_2 = random.randint(0, len(new_words)-1)
		counter += 1
		if counter > 3:
			return new_words
	new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 
	return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
	new_words = words.copy()
	for _ in range(n):
		add_word(new_words)
	return new_words

def add_word(new_words):
	synonyms = []
	counter = 0
	while len(synonyms) < 1:
		random_word = new_words[random.randint(0, len(new_words)-1)]
		synonyms = get_synonyms(random_word)
		counter += 1
		if counter >= 10:
			return
	random_synonym = synonyms[0]
	random_idx = random.randint(0, len(new_words)-1)
	new_words.insert(random_idx, random_synonym)

########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
	
	sentence = get_only_chars(sentence)
	words = sentence.split(' ')
	words = [word for word in words if word is not '']
	num_words = len(words)
	
	augmented_sentences = []
	num_new_per_technique = int(num_aug/4)+1

	#sr
	if (alpha_sr > 0):
		n_sr = max(1, int(alpha_sr*num_words))
		for _ in range(num_new_per_technique):
			a_words = synonym_replacement(words, n_sr)
			augmented_sentences.append(' '.join(a_words))

	#ri
	if (alpha_ri > 0):
		n_ri = max(1, int(alpha_ri*num_words))
		for _ in range(num_new_per_technique):
			a_words = random_insertion(words, n_ri)
			augmented_sentences.append(' '.join(a_words))

	#rs
	if (alpha_rs > 0):
		n_rs = max(1, int(alpha_rs*num_words))
		for _ in range(num_new_per_technique):
			a_words = random_swap(words, n_rs)
			augmented_sentences.append(' '.join(a_words))

	#rd
	if (p_rd > 0):
		for _ in range(num_new_per_technique):
			a_words = random_deletion(words, p_rd)
			augmented_sentences.append(' '.join(a_words))

	augmented_sentences = [get_only_chars(sentence) for sentence in augmented_sentences]
	shuffle(augmented_sentences)

	#trim so that we have the desired number of augmented sentences
	if num_aug >= 1:
		augmented_sentences = augmented_sentences[:num_aug]
	else:
		keep_prob = num_aug / len(augmented_sentences)
		augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

	#append the original sentence
	augmented_sentences.append(sentence)

	return augmented_sentences
def set_eda(df_):
    df = df_[df_.judgement==1]
    new_lines = []
    eda_df = pd.DataFrame()
    for line, label in tqdm.tqdm(zip(df['text'].tolist(), df['judgement'].tolist()), total=len(df['judgement'].tolist())):
        new_line = eda(line, alpha_sr=0.05, alpha_ri=0.0, alpha_rs=0.0, p_rd=0.1, num_aug=16)
        for l in new_line:
            eda_df = eda_df.append({'text': l, 'judgement': label}, ignore_index=True)
    eda_df['judgement'] = eda_df['judgement'].astype(np.int64)
    new_df = pd.concat([df_, eda_df], axis=0)
    
    return new_df

In [10]:
def cleansing_hero_only_text(input_df, text_col):
    ## get text only 
    custom_pipeline = [
        hero.preprocessing.fillna,
        hero.preprocessing.lowercase,
        hero.preprocessing.remove_digits,
        hero.preprocessing.remove_punctuation,
        hero.preprocessing.remove_diacritics,
        hero.preprocessing.remove_stopwords,
        hero.preprocessing.remove_whitespace,
        hero.preprocessing.stem
    ]
    texts = hero.clean(input_df[text_col], custom_pipeline)
    return texts

def basic_text_features_transforme(input_df, text_columns, cleansing_hero=None, name=""):
    """basic な text 特徴量"""
    def _get_features(dataframe, column):
        _df = pd.DataFrame()
        _df[column + name + '_num_chars'] = dataframe[column].apply(len)
        _df[column + name + '_num_exclamation_marks'] = dataframe[column].apply(lambda x: x.count('!'))
        _df[column + name + '_num_question_marks'] = dataframe[column].apply(lambda x: x.count('?'))
        _df[column + name + '_num_punctuation'] = dataframe[column].apply(lambda x: sum(x.count(w) for w in '.,;:'))
        _df[column + name + '_num_symbols'] = dataframe[column].apply(lambda x: sum(x.count(w) for w in '*&$%'))
        _df[column + name + '_num_words'] = dataframe[column].apply(lambda x: len(x.split()))
        _df[column + name + '_num_unique_words'] = dataframe[column].apply(lambda x: len(set(w for w in x.split())))
        _df[column + name + '_words_vs_unique'] = _df[column + name + '_num_unique_words'] / _df[column + name + '_num_words']
        _df[column + name + '_words_vs_chars'] = _df[column + name + '_num_words'] / _df[column + name + '_num_chars']
        return _df
    
    # main の処理
    output_df_ = pd.DataFrame()
    output_df_[text_columns] = input_df[text_columns].fillna('missing').astype(str)
    output_lst = []
    for c in text_columns:
        if cleansing_hero is not None:
            output_df_[c] = cleansing_hero(output_df_, c)
        output_df = _get_features(output_df_, c)
        output_lst.append(output_df)
    output_df = pd.concat(output_lst, axis=1)
    return output_df

In [27]:
def text_process(mess):
    STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', 'im', 'dont', 'doin', 'ure']
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return ' '.join([word.lower() for word in nopunc.split() if word.lower() not in STOPWORDS])
#データの読み込み
df_train = pd.read_csv(config.DATA_DIR  +"/train.csv", index_col=0).fillna(" ")
df_test = pd.read_csv(config.DATA_DIR + "/test.csv", index_col=0).fillna(" ")
sub = pd.read_csv(config.DATA_DIR + "/sample_submit.csv", index_col=0)
df_train["text"] = df_train["title"] + " " + df_train["abstract"].fillna("")
df_test["text"] = df_test["title"] + " " + df_test["abstract"].fillna("")
df_train.text = df_train.text.apply(text_process)
df_test.text = df_test.text.apply(text_process)

In [32]:
df = pd.concat([df_train, df_test]).reset_index(drop=True)
output_df = basic_text_features_transforme(df, text_columns=["text"], cleansing_hero=cleansing_hero_only_text)

In [60]:
df_ = pd.concat([df, output_df], axis=1)
df_train, df_test = df_.iloc[:len(df_train)], df_.iloc[len(df_train):].drop(columns=['judgement'])
df_train.judgement =df_train.judgement.astype(int)

In [62]:
def get_result(result_df):
    preds = result_df["preds"].values
    labels = result_df["judgement"].values
    score, border = metrics(labels, preds)
    return border, score

In [64]:
sub_prob = []
kf= StratifiedKFold(n_splits=5,random_state=42)
oof_df = pd.DataFrame()
for fold,(train_index, test_index) in enumerate(kf.split(df_train.loc[:, 'text'], df_train.loc[:, 'judgement'])):
    train = df_train.iloc[train_index].reset_index(drop=True)
    val = df_train.iloc[test_index].reset_index(drop=True)
    #edaを適用
    train = set_eda(train)
    X_train, y_train = train.loc[:, 'text'], train.loc[:, 'judgement']
    X_val, y_val = val.loc[:, 'text'], val.loc[:, 'judgement']
    X_test = df_test.loc[:, 'text']
    rus = RandomUnderSampler(random_state=71)
    pipline = Pipeline([('tfidf', TfidfVectorizer()), ('svd', TruncatedSVD(n_components=500, random_state=0)), 
            ('sc',MinMaxScaler())])
    X_train = pipline.fit_transform(X_train)
    X_val = pipline.transform(X_val)
    X_test = pipline.transform(X_test)
    X_train, y_train = rus.fit_resample(X_train,y_train)
    max_score = 0
    columns_num = X_train.shape[1]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    prob = model.predict_proba(X_val)[:,1]
    score, border = metrics(y_val, prob)
    val['preds'] = prob
    if score < max_score:
        break
    else:
        max_score = score
        X_tes_fix = X_test
        model_fix = model
    prediction = model_fix.predict_proba(X_tes_fix)[:,1]
    sub_prob.append(prediction)
    oof_df_ = val
    oof_df = pd.concat([oof_df, oof_df_])
    print('fold{}::{}::{}'.format(fold, border, score))
border, score=get_result(oof_df)
print('CV::{}::{}'.format(border, score))

100%|█████████████████████████████████████████| 506/506 [00:10<00:00, 49.58it/s]


bt:0.10129415499221002
fold0::0.10129415499221002::0.8717579250720462


100%|█████████████████████████████████████████| 506/506 [00:08<00:00, 60.93it/s]


bt:0.10370603044699417
fold1::0.10370603044699417::0.876938686766198


100%|█████████████████████████████████████████| 506/506 [00:08<00:00, 61.35it/s]


bt:0.11342505630843458
fold2::0.11342505630843458::0.8718395815170008


100%|█████████████████████████████████████████| 505/505 [00:08<00:00, 61.96it/s]


bt:0.21930926553414098
fold3::0.21930926553414098::0.8979776637488681


100%|█████████████████████████████████████████| 505/505 [00:08<00:00, 62.13it/s]


bt:0.10986002711831909
fold4::0.10986002711831909::0.8727639930755915
bt:0.10986317562199474
CV::0.10986317562199474::0.8696156298915879


In [65]:
from scipy import stats
test_prob = np.array(sub_prob).mean(axis=0)
predictions = np.where(test_prob < border, 0, 1)
sub = pd.read_csv(config.DATA_DIR + "/sample_submit.csv", header = None)
sub.columns = ["id", "judgement"]
sub["judgement"] = predictions
#保存先のディレクトリ作成
sub.to_csv(config.OUTPUT_DIR + '/submission.csv', header=None, index=None)

In [57]:
import oconcatna.integration.lightgbm as lgb_o
import lightgbm as lgb_o
params = dict(n_estimators=10000,
                        num_leaves=31,
                        objective="binary", 
                        learning_rate=0.01,
                        colsample_bytree=0.3, 
                        class_weight="balanced",
                        importance_type="gain")

sub_prob = []
kf= StratifiedKFold(n_splits=5,random_state=42)
oof_df = pd.DataFrame()
for fold,(train_index, test_index) in enumerate(kf.split(df_train.loc[:, 'text'], df_train.loc[:, 'judgement'])):
    train = df_train.iloc[train_index].reset_index(drop=True)
    val = df_train.iloc[test_index].reset_index(drop=True)
    #edaを適用
    train = set_eda(train)
    X_train, y_train = train.loc[:, 'text'], train.loc[:, 'judgement']
    X_val, y_val = val.loc[:, 'text'], val.loc[:, 'judgement']
    X_test = df_test.loc[:, 'text']
    pipline = Pipeline([('tfidf', TfidfVectorizer()), ('svd', TruncatedSVD(n_components=500, random_state=0))])
    X_train = pipline.fit_transform(X_train)
    X_val = pipline.transform(X_val)
    X_test = pipline.transform(X_test)
    rus = RandomUnderSampler(random_state=71)
    X_train, y_train = rus.fit_resample(X_train,y_train)
    lgb_train = lgb_o.Dataset(X_train, y_train)
    lgb_eval = lgb_o.Dataset(X_val, y_val)
    evals_result = {}
                         
    model = lgb_o.train(params,train_set=lgb_train,
                 # 評価データ
                 valid_sets = lgb_eval,
                evals_result=evals_result, early_stopping_rounds=100)
    prob = model.predict(X_val)
    score, border = metrics(y_val, prob)
    val['preds'] = prob
    prediction = model.predict(X_test)
    sub_prob.append(prediction)
    oof_df_ = val
    oof_df = pd.concat([oof_df, oof_df_])
    print('fold{}::{}::{}'.format(fold, border, score))
border, score=get_result(oof_df)
print('CV::{}::{}'.format(border, score))

100%|█████████████████████████████████████████| 506/506 [00:08<00:00, 61.52it/s]


[LightGBM] [Info] Number of positive: 6072, number of negative: 21210
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 27282, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.222564 -> initscore=-1.250785
[LightGBM] [Info] Start training from score -1.250785
[1]	valid_0's binary_logloss: 0.278987
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.277595
[3]	valid_0's binary_logloss: 0.275717
[4]	valid_0's binary_logloss: 0.274187
[5]	valid_0's binary_logloss: 0.272544
[6]	valid_0's binary_logloss: 0.270319
[7]	valid_0's binary_logloss: 0.268718
[8]	valid_0's binary_logloss: 0.266839
[9]	valid_0's binary_logloss: 0.264631
[10]	valid_0's binary_logloss: 0.262701
[11]	valid_0's binary_logloss: 0.261132
[12]	valid_0's binary_logloss: 0.259241
[13]	valid_0's binary_logloss: 0.257192
[14]	valid_0's binary_logloss: 0.25

100%|█████████████████████████████████████████| 506/506 [00:08<00:00, 63.11it/s]


[LightGBM] [Info] Number of positive: 6072, number of negative: 21210
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 27282, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.222564 -> initscore=-1.250785
[LightGBM] [Info] Start training from score -1.250785
[1]	valid_0's binary_logloss: 0.279266
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.277957
[3]	valid_0's binary_logloss: 0.275891
[4]	valid_0's binary_logloss: 0.274471
[5]	valid_0's binary_logloss: 0.272813
[6]	valid_0's binary_logloss: 0.270559
[7]	valid_0's binary_logloss: 0.269061
[8]	valid_0's binary_logloss: 0.267587
[9]	valid_0's binary_logloss: 0.265436
[10]	valid_0's binary_logloss: 0.26354
[11]	valid_0's binary_logloss: 0.261764
[12]	valid_0's binary_logloss: 0.259934
[13]	valid_0's binary_logloss: 0.257948
[14]	valid_0's binary_logloss: 0.255

100%|█████████████████████████████████████████| 506/506 [00:08<00:00, 63.12it/s]


[LightGBM] [Info] Number of positive: 6072, number of negative: 21210
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 27282, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.222564 -> initscore=-1.250785
[LightGBM] [Info] Start training from score -1.250785
[1]	valid_0's binary_logloss: 0.279158
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.277514
[3]	valid_0's binary_logloss: 0.275615
[4]	valid_0's binary_logloss: 0.274202
[5]	valid_0's binary_logloss: 0.272451
[6]	valid_0's binary_logloss: 0.270175
[7]	valid_0's binary_logloss: 0.268729
[8]	valid_0's binary_logloss: 0.266967
[9]	valid_0's binary_logloss: 0.264728
[10]	valid_0's binary_logloss: 0.262868
[11]	valid_0's binary_logloss: 0.261352
[12]	valid_0's binary_logloss: 0.259644
[13]	valid_0's binary_logloss: 0.257667
[14]	valid_0's binary_logloss: 0.25

100%|█████████████████████████████████████████| 505/505 [00:08<00:00, 62.82it/s]




KeyboardInterrupt: 