# Part1~3 정리 및 모델 개선

앞선 Part1~3 을 통해서 3가지 전처리 방법과 Random Forest를 통해 성능을 테스트하였다. 
1. Part1: bag of words (CounterVectorizer) ```review 개수 X max_features```
2. Part2: Word2Vec을 통해 review별 단어들의 vector를 평균으로하여 사용 ```review 개수 X max_features```
3. Part3: K-means clustering을 통한 review별 단어들의 군집을 count하여 feature 생성 ```review 개수 X cluster 개수```

Public Leaderboard Score

Name | Binary | Proba
-------|-------|-------
CounterVectorizer | 0.84392 | 0.92104
Word2Vec AverageVectors | 0.78028 | 0.85884
Word2Vec AverageVectors stemming | 0.81984 | 0.89700
Bag Of Centroids | 0.80816 | 0.88930
BagOfCentroids_stopwords | 0.81192 | 0.89310



이번 notebook에서는 시퀀스 모델링을 통한 성능개선을 한다.

# Module

In [55]:
# 전처리
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from KaggleWord2VecUtility import KaggleWord2VecUtility
kaggle_utils = KaggleWord2VecUtility()
## Pipeline 1
from sklearn.feature_extraction.text import CountVectorizer
## Pipeline 3
from sklearn.cluster import KMeans

# multiprocessing
from multiprocessing import Pool

# word2vec
## PIpeline 2 & 3 
from gensim.models import word2vec

# graphs
import matplotlib.pyplot as plt
import seaborn as sns

# model
import xgboost as xgb
import lightgbm as lgb

# evaluation
from sklearn import metrics

# warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# Load data

In [2]:
# quoting = 3 은 
train = pd.read_csv('../dataset/labeledTrainData.tsv',delimiter='\t', quoting=3)
unlabeled_train = pd.read_csv('../dataset/unlabeledTrainData.tsv', delimiter='\t', quoting=3)
test = pd.read_csv('../dataset/testData.tsv',delimiter='\t')
print('train shape: ',train.shape)
print('unlabeled_train shape:',unlabeled_train.shape)
print('test shape: ',test.shape)

train shape:  (25000, 3)
unlabeled_train shape: (50000, 2)
test shape:  (25000, 2)


# Pipeline1

In [3]:
def pipeline1(data, workers, train=False):
    x_data = kaggle_utils.getCleanReviews(reviews=data, 
                                          func=kaggle_utils.review_to_join_words,
                                          workers=workers)
    vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None,
                             stop_words = None,   
                             max_features = 5000) 
    x_data = vectorizer.fit_transform(x_data).toarray()
    if train:
        return x_data, data['sentiment']
    else:
        return x_data

# Pipeline2

In [4]:
def pipeline2(data, workers, train=False):
    model = word2vec.Word2Vec.load("../saved_model/300features_40minwords_10context_add_stemming")
    x_data = kaggle_utils.getCleanReviews(reviews=data, 
                                          func=kaggle_utils.review_to_wordlist,
                                          workers=workers)
    x_data = kaggle_utils.getAvgFeatureVecs(x_data, model, 300)
    if train:
        return x_data, data['sentiment']
    else:
        return x_data

# Pipeline3

In [20]:
def pipeline3(train, test, workers, num_words):
    model = word2vec.Word2Vec.load("../saved_model/300features_40minwords_10context_add_stemming")
    word_vectors = model.wv.syn0
    num_clusters = int(word_vectors.shape[0] / num_words)

    # Initalize a k-means object and use it to extract centroids
    kmeans_clustering = KMeans( n_clusters = num_clusters , n_jobs=workers)
    idx = kmeans_clustering.fit_predict( word_vectors )
    
    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number    
    word_centroid_map = dict(zip( model.wv.index2word, idx ))
    
    # clean_reviews to wordlist
    train_clean_reviews = kaggle_utils.getCleanReviews(reviews=train, 
                                                     func=kaggle_utils.review_to_wordlist,
                                                     workers=workers)
    test_clean_reviews = kaggle_utils.getCleanReviews(reviews=test, 
                                                     func=kaggle_utils.review_to_wordlist,
                                                     workers=workers)
    
    # Pre-allocate an array for the training set bags of centroids (for speed)
    x_data = np.zeros((train["review"].size, num_clusters), dtype="float32")
    x_test = np.zeros((test["review"].size, num_clusters), dtype="float32")

    # Transform the training set reviews into bags of centroids
    counter = 0
    for review in train_clean_reviews:
        x_data[counter] = kaggle_utils.create_bag_of_centroids( review, word_centroid_map )
        counter += 1
        
    counter = 0
    for review in test_clean_reviews:
        x_test[counter] = kaggle_utils.create_bag_of_centroids( review, word_centroid_map )
        counter += 1
    
    
    return x_data, train['sentiment'], x_test
    

# Data Preprocessing

In [48]:
x_data1, y_data = pipeline1(train, workers=12, train=True)
x_test1 = pipeline1(test, workers=12)

In [49]:
x_data2, y_data = pipeline2(train, workers=12, train=True)
x_test2 = pipeline2(test, workers=12)

Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 o

In [21]:
x_data3, y_data, x_test3 = pipeline3(train, test, workers=12, num_words=5)

In [29]:
print('x_data1.shape: {}  / x_test1.shape: {}'.format(x_data1.shape, x_test1.shape))
print('x_data2.shape: {}  / x_test2.shape: {}'.format(x_data2.shape, x_test2.shape))
print('x_data3.shape: {}  / x_test3.shape: {}'.format(x_data3.shape, x_test3.shape))

x_data1.shape: (25000, 5000)  / x_test1.shape: (25000, 5000)
x_data2.shape: (25000, 300)  / x_test2.shape: (25000, 300)
x_data3.shape: (25000, 2397)  / x_test3.shape: (25000, 2397)


# Models

## Config

In [8]:
config = {
    "seed":223,
    "k_folds":5,
    "early_stopping_rounds":100
}

## XGboost

In [9]:
xgb_params = {
    "learning_rate": 0.1,
    "n_estimators": 10000,
    "max_depth": 3,
    "min_child_weight": 3,
    "subsample": 0.8,
    "colsample_bytree": 1.0,
    "colsample_bylevel": 1.0,
    "alpha": 0,
    "lambda": 1,
    "objective": "gpu:binary:logistic",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor"
}

In [56]:
def xgb_kfold(x_data, y_data, x_test, folds):
    xgb_auc_list = list()
    xgb_acc_list = list()
    xgb_S_prediction = np.zeros(len(x_data))
    xgb_prediction = np.zeros(len(x_test))

    for fold, (train_idx, valid_idx) in enumerate(folds.split(X=x_data, y=y_data)):
        x_train, y_train = x_data[train_idx,:], y_data[train_idx]
        x_valid, y_valid = x_data[valid_idx,:], y_data[valid_idx] 
        xgb_model = xgb.XGBClassifier(**xgb_params)
        xgb_model.fit(x_train, y_train, 
                  eval_set=[(x_valid, y_valid)],  
                  eval_metric="auc",
                  early_stopping_rounds=config['early_stopping_rounds'], 
                  verbose=False)

        prob = xgb_model.predict_proba(x_valid, ntree_limit=xgb_model.best_iteration)[:,1]
        test_prob = xgb_model.predict_proba(x_test, ntree_limit=xgb_model.best_iteration)[:,1]

        auc = metrics.roc_auc_score(y_true=y_valid, y_score=prob)
        pred = [1 if p > 0.5 else 0 for p in prob]
        acc = metrics.accuracy_score(y_true=y_valid, y_pred=pred)
        xgb_auc_list.append(auc)
        xgb_acc_list.append(acc)

        xgb_S_prediction[valid_idx] = prob
        xgb_prediction += test_prob / folds.n_splits

        print("{} fold's AUC: {}".format(fold+1, auc))
        print("{} fold's ACC: {}".format(fold+1, acc))

    print('='*100)
    print('AUC_list')
    print(xgb_auc_list)

    print('-'*100)
    print('Mean AUC: {}'.format(np.mean(xgb_auc_list)))

    print('='*100)
    print('ACC_list')
    print(xgb_acc_list)

    print('-'*100)
    print('Mean ACC: {}'.format(np.mean(xgb_acc_list)))
    
    return xgb_prediction, xgb_auc_list, xgb_acc_list, xgb_S_prediction

### pipeline1

In [57]:
folds = KFold(n_splits=config['k_folds'], random_state=config['seed'], shuffle=True)
xgb_prediction1, xgb_auc_list1, xgb_acc_list1, xgb_S_prediction1 = xgb_kfold(x_data1, y_data, x_test1, folds)

1 fold's AUC: 0.9519849217123374
1 fold's ACC: 0.8848
2 fold's AUC: 0.9487462390371935
2 fold's ACC: 0.8776
3 fold's AUC: 0.9466345726151382
3 fold's ACC: 0.8764
4 fold's AUC: 0.9509904048356468
4 fold's ACC: 0.881
5 fold's AUC: 0.9471876987571551
5 fold's ACC: 0.8808
AUC_list
[0.9519849217123374, 0.9487462390371935, 0.9466345726151382, 0.9509904048356468, 0.9471876987571551]
----------------------------------------------------------------------------------------------------
Mean AUC: 0.9491087673914942
ACC_list
[0.8848, 0.8776, 0.8764, 0.881, 0.8808]
----------------------------------------------------------------------------------------------------
Mean ACC: 0.88012


### pipeline2

In [60]:
folds = KFold(n_splits=config['k_folds'], random_state=config['seed'], shuffle=True)
xgb_prediction2, xgb_auc_list2, xgb_acc_list2, xgb_S_prediction2 = xgb_kfold(x_data2, y_data, x_test2, folds)

1 fold's AUC: 0.9422203255420101
1 fold's ACC: 0.874
2 fold's AUC: 0.9361186863837142
2 fold's ACC: 0.8578
3 fold's AUC: 0.940035814606292
3 fold's ACC: 0.8692
4 fold's AUC: 0.9424573342470518
4 fold's ACC: 0.8692
5 fold's AUC: 0.9409689148809358
5 fold's ACC: 0.8664
AUC_list
[0.9422203255420101, 0.9361186863837142, 0.940035814606292, 0.9424573342470518, 0.9409689148809358]
----------------------------------------------------------------------------------------------------
Mean AUC: 0.9403602151320009
ACC_list
[0.874, 0.8578, 0.8692, 0.8692, 0.8664]
----------------------------------------------------------------------------------------------------
Mean ACC: 0.86732


### pipeline3

In [61]:
folds = KFold(n_splits=config['k_folds'], random_state=config['seed'], shuffle=True)
xgb_prediction3, xgb_auc_list3, xgb_acc_list3, xgb_S_prediction3 = xgb_kfold(x_data3, y_data, x_test3, folds)

1 fold's AUC: 0.9451600731960682
1 fold's ACC: 0.8794
2 fold's AUC: 0.9410829972472953
2 fold's ACC: 0.8644
3 fold's AUC: 0.9411604610842558
3 fold's ACC: 0.8724
4 fold's AUC: 0.9467065093163608
4 fold's ACC: 0.8758
5 fold's AUC: 0.9401607257861283
5 fold's ACC: 0.8692
AUC_list
[0.9451600731960682, 0.9410829972472953, 0.9411604610842558, 0.9467065093163608, 0.9401607257861283]
----------------------------------------------------------------------------------------------------
Mean AUC: 0.9428541533260217
ACC_list
[0.8794, 0.8644, 0.8724, 0.8758, 0.8692]
----------------------------------------------------------------------------------------------------
Mean ACC: 0.8722399999999999


## LightGBM

In [51]:
lgb_params = {
    'learning_rate': 0.1,
    'n_estimators': 10000,
    'max_depth': 3,
    'num_leaves': 5,
    'subsample': 0.8,
    'colsample_bytree': 1.0,
    'feature_fraction': 1.0,
    'bagging_fraction': 1.0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'min_child_weight': 3,
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'objective': 'regression',
}

In [65]:
def lgb_kfold(x_data, y_data, x_test, folds):
    lgb_auc_list = list()
    lgb_acc_list = list()
    lgb_S_prediction = np.zeros(len(x_data))
    lgb_prediction = np.zeros(len(x_test))

    for fold, (train_idx, valid_idx) in enumerate(folds.split(X=x_data, y=y_data)):
        x_train, y_train = x_data[train_idx,:], y_data[train_idx]
        x_valid, y_valid = x_data[valid_idx,:], y_data[valid_idx] 
        lgb_model = lgb.LGBMClassifier(**lgb_params)
        lgb_model.fit(x_train, y_train,
                      eval_set=[(x_train, y_train), (x_valid, y_valid)],
                      early_stopping_rounds=config['early_stopping_rounds'],
                      verbose=False)

        prob = lgb_model.predict_proba(x_valid, num_iteration=lgb_model.best_iteration_)[:,1]
        test_prob = lgb_model.predict_proba(x_test, num_iteration=lgb_model.best_iteration_)[:,1]

        auc = metrics.roc_auc_score(y_true=y_valid, y_score=prob)
        pred = [1 if p > 0.5 else 0 for p in prob]
        acc = metrics.accuracy_score(y_true=y_valid, y_pred=pred)
        lgb_auc_list.append(auc)
        lgb_acc_list.append(acc)
        lgb_S_prediction[valid_idx] = prob
        lgb_prediction += test_prob / folds.n_splits

        print("{} fold's AUC: {}".format(fold+1, auc))
        print("{} fold's ACC: {}".format(fold+1, acc))

    print('='*100)
    print('AUC_list')
    print(lgb_auc_list)

    print('-'*100)
    print('Mean AUC: {}'.format(np.mean(lgb_auc_list)))

    print('='*100)
    print('ACC_list')
    print(lgb_acc_list)

    print('-'*100)
    print('Mean ACC: {}'.format(np.mean(lgb_acc_list)))
    
    return lgb_prediction, lgb_auc_list, lgb_acc_list, lgb_S_prediction

### pipeline1

In [66]:
folds = KFold(n_splits=config['k_folds'], random_state=config['seed'], shuffle=True)
lgb_prediction1, lgb_auc_list1, lgb_acc_list1, lgb_S_prediction1 = lgb_kfold(x_data1, y_data, x_test1, folds)

1 fold's AUC: 0.9472599158078803
1 fold's ACC: 0.87
2 fold's AUC: 0.946468856027143
2 fold's ACC: 0.8724
3 fold's AUC: 0.9449108826866841
3 fold's ACC: 0.874
4 fold's AUC: 0.9479569255217546
4 fold's ACC: 0.8744
5 fold's AUC: 0.9450604221751984
5 fold's ACC: 0.8748
AUC_list
[0.9472599158078803, 0.946468856027143, 0.9449108826866841, 0.9479569255217546, 0.9450604221751984]
----------------------------------------------------------------------------------------------------
Mean AUC: 0.9463314004437322
ACC_list
[0.87, 0.8724, 0.874, 0.8744, 0.8748]
----------------------------------------------------------------------------------------------------
Mean ACC: 0.8731200000000001


### pipeline2

In [67]:
folds = KFold(n_splits=config['k_folds'], random_state=config['seed'], shuffle=True)
lgb_prediction2, lgb_auc_list2, lgb_acc_list2, lgb_S_prediction2 = lgb_kfold(x_data2, y_data, x_test2, folds)

1 fold's AUC: 0.9378817495626862
1 fold's ACC: 0.8714
2 fold's AUC: 0.9305446194225722
2 fold's ACC: 0.8528
3 fold's AUC: 0.9369804370073171
3 fold's ACC: 0.8606
4 fold's AUC: 0.9386568049921927
4 fold's ACC: 0.867
5 fold's AUC: 0.9353152713497684
5 fold's ACC: 0.8566
AUC_list
[0.9378817495626862, 0.9305446194225722, 0.9369804370073171, 0.9386568049921927, 0.9353152713497684]
----------------------------------------------------------------------------------------------------
Mean AUC: 0.9358757764669073
ACC_list
[0.8714, 0.8528, 0.8606, 0.867, 0.8566]
----------------------------------------------------------------------------------------------------
Mean ACC: 0.86168


### pipeline3

In [68]:
folds = KFold(n_splits=config['k_folds'], random_state=config['seed'], shuffle=True)
lgb_prediction3, lgb_auc_list3, lgb_acc_list3, lgb_S_prediction3 = lgb_kfold(x_data3, y_data, x_test3, folds)

1 fold's AUC: 0.9419857473762769
1 fold's ACC: 0.8666
2 fold's AUC: 0.9400518532744383
2 fold's ACC: 0.8618
3 fold's AUC: 0.9361735523596616
3 fold's ACC: 0.866
4 fold's AUC: 0.9386388847599465
4 fold's ACC: 0.8634
5 fold's AUC: 0.9374663887899964
5 fold's ACC: 0.863
AUC_list
[0.9419857473762769, 0.9400518532744383, 0.9361735523596616, 0.9386388847599465, 0.9374663887899964]
----------------------------------------------------------------------------------------------------
Mean AUC: 0.938863285312064
ACC_list
[0.8666, 0.8618, 0.866, 0.8634, 0.863]
----------------------------------------------------------------------------------------------------
Mean ACC: 0.86416


# Evaluation

In [71]:
def score_df(names, models, score_lst, score_name):
    df = pd.DataFrame({'Name':names, 'Model':models, score_name:np.zeros(len(names))})
    for i in range(len(names)):
        mean_ = np.mean(score_lst[i])
        std_ = np.std(score_lst[i])
        df.loc[i,score_name] = '{0:.4f}({1:.4f})'.format(mean_, std_)
    return df

In [72]:
name = ['Part1','Part2','Part3'] * 2
models = ['XGboost'] * 3 + ['LightGBM'] * 3
auc_score_lst = [xgb_auc_list1, xgb_auc_list2, xgb_auc_list3, lgb_auc_list1, lgb_auc_list2, lgb_auc_list3]
auc_df = score_df(name, models, score_lst, score_name='AUC')
auc_df

Unnamed: 0,Name,Model,AUC
0,Part1,XGboost,0.9491(0.0021)
1,Part2,XGboost,0.9404(0.0023)
2,Part3,XGboost,0.9429(0.0026)
3,Part1,LightGBM,0.9463(0.0012)
4,Part2,LightGBM,0.9359(0.0029)
5,Part3,LightGBM,0.9389(0.0020)


In [80]:
name = ['Part1','Part2','Part3'] * 2
models = ['XGboost'] * 3 + ['LightGBM'] * 3
acc_score_lst = [xgb_acc_list1, xgb_acc_list2, xgb_acc_list3, lgb_acc_list1, lgb_acc_list2, lgb_acc_list3]
acc_df = score_df(name, models, score_lst, score_name='ACC')
acc_df

Unnamed: 0,Name,Model,ACC
0,Part1,XGboost,0.8801(0.0029)
1,Part2,XGboost,0.8673(0.0054)
2,Part3,XGboost,0.8722(0.0052)
3,Part1,LightGBM,0.8731(0.0018)
4,Part2,LightGBM,0.8617(0.0068)
5,Part3,LightGBM,0.8642(0.0018)


# Submission

In [73]:
preds = [xgb_prediction1, xgb_prediction2, xgb_prediction3, lgb_prediction1, lgb_prediction2, lgb_prediction3]
for i in range(len(preds)):
    output = pd.DataFrame(data={"id":test["id"], "sentiment":preds[i]})
    output.to_csv("../submit/{0:s}+{1:s}+{2:.4f}.csv".format(name[i], models[i], np.mean(score_lst[i])), index=False, quoting=3)

In [79]:
preds = [xgb_prediction1, xgb_prediction2, xgb_prediction3, lgb_prediction1, lgb_prediction2, lgb_prediction3]
for i in range(len(preds)):
    output = pd.DataFrame(data={"id":test["id"], "sentiment":[1 if x > 0.5 else 0 for x in preds[i]]})
    output.to_csv("../submit/{0:s}+{1:s}+threshold_0.5_{2:.4f}.csv".format(name[i], models[i], np.mean(acc_score_lst[i])), index=False, quoting=3)

NameError: name 'acc_score_lst' is not defined

In [74]:
xgb_prediction1

array([0.31078544, 0.02359064, 0.22037972, ..., 0.23561364, 0.20817214,
       0.17305508])