## <center>DataLab Cup 1: Text Feature Engineering</center>

#### - To import the libraries

In [1]:
import pandas as pd
import numpy as np
import os

from bs4 import BeautifulSoup
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
import re

from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

import joblib
import threading
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import itertools

from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier


import warnings
warnings.filterwarnings("ignore")

#### 1. To load the datasets

In [2]:
train_data = pd.read_csv('../dataset/train.csv')
test_data  = pd.read_csv('../dataset/test.csv')


#### 2. To extract the features from the dataset

將一些我們想要用到的feature從dataset中提取出來。以下為提取的特徵:

- title
- time(year/month/day/hour/minute/second)
- number of images (num_img)
- number of videos (num_video)
- author name
- topic
- channel
- length of content 

--- 

* beautiful soup
    - conda install -c conda-forge beautifulsoup4
    
<br>

* vadersentiment
    - conda install -c conda-forge vadersentiment

---

In [3]:
# to get the attribute of the 'title', 'year/month/date/day/hour/minute/second/is_weekend', 'num_img', 'num_video', 'author name', 'topic', 'channel', 'content length', 'title_sentiment'

def preprocessor(text):
    soup = BeautifulSoup(text, 'html.parser')

    # 1. to find the 'title' (body > h1)
    title = soup.find('h1').string.strip().lower()

    # 2. to find time(body > div > span > time)
    
    date_string = soup.find('time')
    try:
        date_string = date_string['datetime']
    except:
        date_string = 'wed, 10 oct 2014 15:00:43 +0000'
        
    date_string = date_string.strip().lower()
    datetimes = datetime.strptime(date_string, '%a, %d %b %Y %H:%M:%S %z')
    
    year = datetimes.year
    month = datetimes.month
    date = datetimes.day
    day = pd.Timestamp(str(year)+'-'+str(month)+'-'+str(date)).dayofweek+1
    is_weekend = 1 if (day==6 or day==7) else 0
    hour = datetimes.hour
    minute = datetimes.minute
    second = datetimes.second

    # 3. to find the number of images
    num_img  = len(soup.find_all('img'))
    

    # 4. to find the number of videos
    num_video = len(soup.find_all('iframe'))
    
    # 5. to find the author name
    article_info = soup.find('div', class_='article-info')
    author = article_info.find('span', class_='author_name') or article_info.find('span', class_='byline basic')

    if (author != None):
        if (author.find('a') != None):
            author = author.find('a')
            author_name = author.get_text().lower()
        else :
            author_name = author.get_text().lower()
    else :
        author_name = 'not found'

    # 6. to find the article topic
    footer = soup.find('footer', class_='article-topics')
    topic = footer.get_text().split(': ')[1]
    
    # 7. to find the channel
    channel = soup.find('article')['data-channel'].strip().lower()
    
    
    # 8. to find the content length
    content = soup.body.find('section', class_='article-content').get_text()
    len_content = len(content)

    # print('topic = ', topic, type(topic))
    
    # 9. to find the sentiment of title
    analyzer = SentimentIntensityAnalyzer()
    title_sentiment = analyzer.polarity_scores(topic)
    sentiment_neg = title_sentiment['neg']
    sentiment_neu = title_sentiment['neu']
    sentiment_pos = title_sentiment['pos']
    sentiment_compound = title_sentiment['compound']

    return title, author_name, channel, topic, year, month, date, day, is_weekend, hour, minute, second, num_img, num_video, len_content, sentiment_neg, sentiment_neu, sentiment_pos, sentiment_compound

In [4]:
feature_train_list = []
feature_test_list = []

for content in (train_data['Page content']):
    feature_train_list.append(preprocessor(content))
for content in (test_data['Page content']):
    feature_test_list.append(preprocessor(content))

df_train_x = pd.DataFrame(
        feature_train_list, 
                columns=['title', 'author_name', 'channel', 'topic', 'year', 'month', 'date', 'day', \
                 'is_weekend', 'hour', 'minute', 'second', 'num_img', 'num_video', 'len_content', \
                 'sentiment_neg', 'sentiment_neu', 'sentiment_pos', 'sentiment_compound']).values
df_test_x = pd.DataFrame(
        feature_test_list, 
        columns=['title', 'author_name', 'channel', 'topic', 'year', 'month', 'date', 'day', \
                 'is_weekend', 'hour', 'minute', 'second', 'num_img', 'num_video', 'len_content', \
                 'sentiment_neg', 'sentiment_neu', 'sentiment_pos', 'sentiment_compound']).values

df_train_y = train_data['Popularity'].values
df_train_y[df_train_y == -1] = 0



### 3-2 . Preprocessing - Word Stemming

 WordNetLemmatizer(Lemmatization): Stem the words will better performance, while time-consuming


In [5]:
nltk.download('wordnet')
nltk.download('omw-1.4')

def word_stemming(text):
    if type(text) == np.ndarray:
        text = text[0]
    lm = WordNetLemmatizer()
    words = re.split('\s', text.strip())
    lemmatized_words = [lm.lemmatize(word) for word in words]
    return lemmatized_words

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/student/mr111//mfhsieh22/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/student/mr111//mfhsieh22/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


#### 3-5 Create TF-IDF feature representation ([ref](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html))

In [6]:
xgb_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [3]),
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)
lgb_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12, 15, 16, 17, 18]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [3])
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)
cat_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,1), lowercase=False), [3])
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)
ada_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [3])
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)
rfc_transformer = ColumnTransformer(
    [
        ('drop process', 'drop', [0, 2, 10, 11, 12]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [3])
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)
voting_transformer = ColumnTransformer(
    [
        ('title name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [0]),
        ('author name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [1]),
        ('channel name process', TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [2]),
        ('topic name process' , TfidfVectorizer(tokenizer=word_stemming, token_pattern=None, ngram_range=(1,2), lowercase=False), [3])
    ],
    remainder='passthrough', # do not touch the remaining data
    n_jobs=-1
)

### 4. Model training

- XGBoost

- LightGBM

- CatBoost

- AdaBoost

- Random Forest

- VotingClassifier


In [7]:
# pipeline_xgb = Pipeline([('vect', xgb_transformer), ('clf', XGBClassifier())])
# pipeline_lgb = Pipeline([('vect', lgb_transformer), ('clf',  LGBMClassifier())])
# pipeline_cat = Pipeline([('vect', cat_transformer), ('clf',  CatBoostClassifier())])
# pipeline_ada = Pipeline([('vect', ada_transformer), ('clf',  AdaBoostClassifier())])
# pipeline_rfc = Pipeline([('vect', rfc_transformer), ('clf',  RandomForestClassifier())])
pipeline_xgb = XGBClassifier()
pipeline_lgb = LGBMClassifier()
pipeline_cat = CatBoostClassifier()
pipeline_ada = AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=0))
pipeline_rfc = RandomForestClassifier()

In [8]:

def training(X_train, y_train, clf):

    score = cross_validate(clf, X_train, y_train, scoring='roc_auc', return_train_score=True, return_estimator=True, cv=2)
    print('train score: {:.6f} (+/-{:.6f})'.format(
        np.mean(score['train_score']), np.std(score['train_score'])))
    print('valid score: {:.6f} (+/-{:.6f})'.format(
        np.mean(score['test_score']), np.std(score['test_score'])))

    clf.fit(X_train, y_train)
    
    return clf, np.mean(score['train_score']), np.mean(score['test_score'])

#### - To contruct the grid search function

In [9]:
def grid_search_cv(ct, X_train, y_train, clf, param_grid, verbose_=False):
    
    X_train_ct = ct.fit_transform(X_train)
    
    # to report the grid search information
    if(verbose_):
        gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=2, return_train_score=True, verbose = 3)
    else:
        gs = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5, return_train_score=True)
    
    gs.fit(X_train_ct, y_train)
    
    results, idx = gs.cv_results_, gs.best_index_
    print('train score: {:.6f} (+/-{:.6f})'.format(results['mean_train_score'][idx], results['std_train_score'][idx]))
    print('valid score: {:.6f} (+/-{:.6f})'.format(results['mean_test_score'][idx], results['std_test_score'][idx]))
    print('best params:', gs.best_params_)
    return gs.best_params_, gs.best_estimator_, results['mean_test_score'][idx]

#### - to store the best parameter to a file

In [10]:
def parameter_storage(dict_path, file_name, best_param, best_validation, remaining_dict, best_estimator_list=None):
    if not os.path.exists(dict_path):
        os.makedirs(dict_path)
        
    file_path = os.path.join(dict_path, file_name + ".txt")
    
    with open(file_path, "w") as file:
        file.write(f'The features: {remaining_dict}\n')
        file.write(f'The best parameter: {best_param}\n')
        file.write(f'ngram_range : {ngram_range_}\n')
        if not (best_estimator_list==None):
            file.write('The best estimator_list: ')
            for i in range(len(best_estimator_list)):
                file.write(f' {best_estimator_list[i][0]}')
            file.write('\n')
        file.write(f'The best validation: {best_validation}\n')
        file.write('--------------------------------------------------------\n')

### 4-1. XGBOOST

4-1-1. Grid sizing for XGBoost

In [11]:
param_grid_xgb = {
    'gamma' : [1.1],
    'lambda' : [2.4],
    'n_estimators': [97],
    'max_depth': [7],
    'learning_rate' : [0.14],
}

if (1):
    best_xgb_param, best_xgb, best_xgb_valid = grid_search_cv(xgb_transformer, df_train_x, df_train_y, pipeline_xgb, param_grid_xgb, True)
    

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 1/2] END gamma=1.1, lambda=2.4, learning_rate=0.14, max_depth=7, n_estimators=97;, score=(train=0.821, test=0.584) total time=   8.7s
[CV 2/2] END gamma=1.1, lambda=2.4, learning_rate=0.14, max_depth=7, n_estimators=97;, score=(train=0.825, test=0.585) total time=   8.8s
train score: 0.823161 (+/-0.002146)
valid score: 0.584373 (+/-0.000395)
best params: {'gamma': 1.1, 'lambda': 2.4, 'learning_rate': 0.14, 'max_depth': 7, 'n_estimators': 97}


4-1-2. Training for XGBoost

In [12]:
param_static_xgb = {
    'gamma' : 1.2,
    'lambda' : 2.5,
    'n_estimators': 97,
    'max_depth': 7,
    'learning_rate' : 0.141,
    'n_jobs' : -1,
    'random_state' : 0
}

if (0):
    xgboost = Pipeline([('vect', xgb_transformer), ('clf', best_xgb)])
else :
    xgboost = Pipeline([('vect', xgb_transformer), ('clf', XGBClassifier(**param_static_xgb))])
    
_ = training(df_train_x, df_train_y, xgboost)

train score: 0.783027 (+/-0.003699)
valid score: 0.589135 (+/-0.005340)


### 4-2. LightGBM

4-2-1. Grid sizing for LightGBM

In [13]:
param_grid_lgbm = {
    'learning_rate' : [0.13, 0.0131], 
    'n_estimators' : [230, 231],
    'objective' : ['regression', 'poisson']
}

if (1):
    best_lgbm_param, best_lgbm, best_lgbm_valid = grid_search_cv(lgb_transformer, df_train_x, df_train_y, pipeline_lgb, param_grid_lgbm, True)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 2/2] END learning_rate=0.0131, n_estimators=230, objective=poisson;, score=(train=0.689, test=0.594) total time=   3.1s
[CV 1/2] END learning_rate=0.0131, n_estimators=231, objective=regression;, score=(train=0.732, test=0.592) total time=   3.1s
[CV 1/2] END learning_rate=0.0131, n_estimators=231, objective=poisson;, score=(train=0.690, test=0.592) total time=   3.2s
[CV 2/2] END learning_rate=0.0131, n_estimators=230, objective=regression;, score=(train=0.731, test=0.591) total time=   3.2s
[CV 2/2] END learning_rate=0.13, n_estimators=230, objective=regression;, score=(train=0.927, test=0.567) total time=   3.0s
[CV 1/2] END learning_rate=0.0131, n_estimators=230, objective=poisson;, score=(train=0.690, test=0.592) total time=   3.3s
[CV 2/2] END learning_rate=0.0131, n_estimators=231, objective=poisson;, score=(train=0.689, test=0.594) total time=   3.3s
[CV 2/2] END learning_rate=0.0131, n_estimators=231, objective=re

4-2-2. Training for LightGBM

In [14]:
params_static_lgbm = {
    'random_state': 0, 
    'learning_rate' : 0.013,
    'n_estimators' : 230,
    'n_jobs' : -1,
    'objective' : 'poisson'
}

if (0):
    lgbm = Pipeline([('vect', lgb_transformer), ('clf', best_lgbm)])
else :
    lgbm = Pipeline([('vect', lgb_transformer), ('clf', LGBMClassifier(**params_static_lgbm))])


_ = training(df_train_x, df_train_y, lgbm)

train score: 0.664134 (+/-0.002286)
valid score: 0.597305 (+/-0.008708)


### 4-3. CatBoost

4-3-1. Grid sizing for Catboost

In [15]:
param_grid_cat = {
    'learning_rate' : [0.01], 
    'n_estimators' : [700],
    'depth' : [12]
}

if (1):
    best_cat_params, best_cat_estimator, best_cat_valid= grid_search_cv(cat_transformer, df_train_x, df_train_y, pipeline_cat, param_grid_cat, True)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
0:	learn: 0.6925111	total: 829ms	remaining: 9m 39s
0:	learn: 0.6926031	total: 839ms	remaining: 9m 46s
1:	learn: 0.6920114	total: 1.54s	remaining: 8m 58s
1:	learn: 0.6921981	total: 1.55s	remaining: 9m
2:	learn: 0.6915026	total: 2.28s	remaining: 8m 50s
2:	learn: 0.6909394	total: 2.28s	remaining: 8m 50s
3:	learn: 0.6909922	total: 3.02s	remaining: 8m 45s
3:	learn: 0.6906376	total: 3.03s	remaining: 8m 47s
4:	learn: 0.6902948	total: 3.74s	remaining: 8m 39s
4:	learn: 0.6901053	total: 3.79s	remaining: 8m 47s
5:	learn: 0.6899069	total: 4.45s	remaining: 8m 34s
5:	learn: 0.6898346	total: 4.54s	remaining: 8m 44s
6:	learn: 0.6896002	total: 5.13s	remaining: 8m 28s
6:	learn: 0.6894964	total: 5.3s	remaining: 8m 45s
7:	learn: 0.6889079	total: 5.86s	remaining: 8m 26s
7:	learn: 0.6890035	total: 6.02s	remaining: 8m 40s
8:	learn: 0.6884298	total: 6.56s	remaining: 8m 23s
8:	learn: 0.6885649	total: 6.7s	remaining: 8m 34s
9:	learn: 0.6878762	total: 7.

4-3-2. Training for CatBoost

In [16]:
params_static_cat = {
    'eval_metric' : 'AUC',
    'n_estimators' : 700,
    'depth' : 12,
    'learning_rate' : 0.01,
    'random_state' : 0,
    'verbose' : False
}

if (0):
    catboost = Pipeline([('ct', cat_transformer),('clf', best_cat_estimator)])
else :
    catboost = Pipeline([('ct', cat_transformer),('clf', CatBoostClassifier(**params_static_cat))])

_ = training(df_train_x, df_train_y, catboost)

KeyboardInterrupt: 

### 4-4. AdaBoost

4-4-1. Grid sizing for AdaBoost

In [19]:
params_grid_ada = {
    'estimator__max_depth' : [4],
    'n_estimators': [1500],
    'learning_rate': [0.001],
    'random_state' : [0]
}

if (1):
    best_ada_params, best_ada_estimator, best_ada_valid = grid_search_cv(ada_transformer, df_train_x, df_train_y, pipeline_ada, params_grid_ada, True)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 2/2] END estimator__max_depth=4, learning_rate=0.001, n_estimators=1500, random_state=0;, score=(train=0.646, test=0.588) total time=16.6min
[CV 1/2] END estimator__max_depth=4, learning_rate=0.001, n_estimators=1500, random_state=0;, score=(train=0.662, test=0.584) total time=17.2min
train score: 0.654205 (+/-0.008009)
valid score: 0.586208 (+/-0.002057)
best params: {'estimator__max_depth': 4, 'learning_rate': 0.001, 'n_estimators': 1500, 'random_state': 0}


4-4-2. Training for AdaBoost

In [20]:
param_static_ada = {
    'estimator' : DecisionTreeClassifier(max_depth = 4), 
    'learning_rate' : 0.001, 
    'n_estimators' : 1560
}

if (0):
    adaBoost = Pipeline([('vect', ada_transformer), ('clf', best_ada_estimator)])
else :
    adaBoost = Pipeline([('vect', ada_transformer), ('clf', AdaBoostClassifier(**param_static_ada))])
    
_ = training(df_train_x, df_train_y, adaBoost)

train score: 0.655976 (+/-0.004430)
valid score: 0.581028 (+/-0.002394)


### 4-5. Random Forest Classifier

4-5-1. Grid sizing for RF

In [21]:

params_grid_rf = {
    'n_estimators' : [1150],
    'max_depth' : [90],
    'min_samples_leaf' : [1]
}

if (1):
    best_RF_params, best_RF_estimator, best_RF_valid = grid_search_cv(rfc_transformer, df_train_x, df_train_y, pipeline_rfc, params_grid_rf, True)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 1/2] END max_depth=90, min_samples_leaf=1, n_estimators=1150;, score=(train=0.998, test=0.578) total time= 6.4min
[CV 2/2] END max_depth=90, min_samples_leaf=1, n_estimators=1150;, score=(train=0.999, test=0.578) total time= 6.6min
train score: 0.998176 (+/-0.000360)
valid score: 0.577714 (+/-0.000072)
best params: {'max_depth': 90, 'min_samples_leaf': 1, 'n_estimators': 1150}


4-5-2. Training for RF

In [22]:
params_static_rf = {
    'n_jobs' : -1,
    'random_state' : 0,
    'n_estimators' : 1200,
    'max_depth' : 100,
    'min_samples_leaf': 1
}

if (0):
    RF = Pipeline([('vect', rfc_transformer), ('clf', best_RF_estimator)])
else :
    RF = Pipeline([('vect', rfc_transformer), ('clf', RandomForestClassifier(**params_static_rf))])

_ = training(df_train_x, df_train_y, RF)

train score: 0.999345 (+/-0.000056)
valid score: 0.573417 (+/-0.000696)


### 4-6. VotingClassifier

4-2-1. Grid sizing for Voting classifier

In [11]:
param_static_xgb = {
    'gamma' : 1.2,
    'lambda' : 2.5,
    'n_estimators': 97,
    'max_depth': 7,
    'learning_rate' : 0.141,
    'n_jobs' : -1,
    'random_state' : 0
}

params_static_LGBM = {
    'random_state': 0, 
    'learning_rate' : 0.013,
    'n_estimators' : 230,
    'n_jobs' : -1,
    'objective' : 'poisson'
}

params_static_cat = {
    'eval_metric' : 'AUC',
    'n_estimators' : 700,
    'depth' : 12,
    'learning_rate' : 0.01,
    'random_state' : 0,
    'verbose' : False
}

param_static_Adaboost = {
    'estimator' : DecisionTreeClassifier(max_depth = 4), 
    'learning_rate' : 0.001, 
    'n_estimators' : 1560
}

param_static_RF = {
    'n_jobs' : -1,
    'random_state' : 0,
    'n_estimators' : 1200,
    'max_depth' : 100,
    'min_samples_leaf': 1
}

voting_pipeline_xgb = Pipeline([('vect', xgb_transformer), ('clf', XGBClassifier(**param_static_xgb))])
voting_pipeline_lgb = Pipeline([('vect', lgb_transformer), ('clf',  LGBMClassifier(**params_static_LGBM))])
voting_pipeline_cat = Pipeline([('vect', cat_transformer), ('clf',  CatBoostClassifier(**params_static_cat))])
voting_pipeline_ada = Pipeline([('vect', ada_transformer), ('clf',  AdaBoostClassifier(**param_static_Adaboost))])
voting_pipeline_rfc = Pipeline([('vect', rfc_transformer), ('clf',  RandomForestClassifier(**param_static_RF))])

In [12]:
num_classifier = 2
# voting_estimator_list = [('pip1', pipeline_xgb), ('pip2', pipeline_lgb), ('pip3', pipeline_cat)]
voting_estimator_list = [('pip1', pipeline_xgb), ('pip2', pipeline_lgb)]

- to find the weight combinations

In [13]:
def weight_list_generator(num_classifier_):   
    weight_list = []

    binary_values = (1, 3, 5)
    weight_list = [list(i) for i in list(itertools.product(binary_values, repeat=num_classifier_))]

    print('weight list = ', weight_list)
    print('length of weight list = ', len(weight_list))
    return weight_list

In [14]:
weight_list = weight_list_generator(num_classifier_ = num_classifier)

weight list =  [[1, 1], [1, 3], [1, 5], [3, 1], [3, 3], [3, 5], [5, 1], [5, 3], [5, 5]]
length of weight list =  9


* multi-thread grid search for voting classifier

In [18]:
param_grid_voting_static = {
    'estimators' : voting_estimator_list,
    'voting' : 'soft',
    'flatten_transform' : True, 
    'verbose' : True
}

lock = threading.Lock()


def process_weight(weight):
    # share in multiple threads
    global best_valid_score, best_weight, best_voting

    voting = Pipeline([('vect', voting_transformer), ('clf', VotingClassifier(**prarms_static_voting, weights=weight))])
    print('current weight =', weight)
    clf_voting, _, valid_voting = training(df_train_x, df_train_y, voting)

    # to protect the safety of shared variables
    with lock:
        if valid_voting > best_valid_score:
            best_valid_score = valid_voting
            best_weight = weight
            best_voting = clf_voting
    
    print(f'{weight} Finish!!')
    threadmax.release()
        
if (1):
    best_valid_score = 0
    best_weight = None
    best_voting = None
    mem = []
    threadmax = threading.BoundedSemaphore(32)
    
    for weight in weight_list:
        threadmax.acquire()
        thread = threading.Thread(target=process_weight, args=(weight,))
    
        thread.start()
        mem.append(thread)

    for thread in mem:
        thread.join()
        mem.remove(thread)
    
    print('end once')

    print('best_valid_score = %.6f' % best_valid_score)
    print('best_weight = ', best_weight)

Exception in thread Thread-4 (process_weight):
Traceback (most recent call last):
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
Exception in thread Thread-5 (process_weight):
Traceback (most recent call last):
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
Exception in thread Thread-6 (process_weight):
Traceback (most recent call last):
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
Exception in thread Thread-7 (process_weight):
Traceback (most recent call last):
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/users/student/mr111/mfhsieh22/miniconda3/envs/DL/lib/python3.11/threading.py", line 975, in run
Exception in thread Thread-8 (process_weight):
Traceback (most recent call l

current weight = [1, 1]
current weight = [1, 3]
current weight = [1, 5]
current weight = [3, 1]
current weight = [3, 3]
current weight = [3, 5]
current weight = [5, 1]
current weight = [5, 3]
current weight = [5, 5]
end once
best_valid_score = 0.000000
best_weight =  None


In [15]:
# no n_jobs
prarms_static_voting = {
    'estimators' : voting_estimator_list, 
    'voting' : 'soft',
    'weights' : [1, 2],
    'flatten_transform' : True,
    'verbose' : True
}

if (0):
    voting = Pipeline([('vect', voting_transformer), ('clf', VotingClassifier(**param_grid_voting_static, weights=best_weight))])
    
else :
    voting = Pipeline([('vect', voting_transformer), ('clf', VotingClassifier(**prarms_static_voting))])

_ = training(df_train_x, df_train_y, voting)


[Voting] ..................... (1 of 2) Processing pip1, total=   2.2s
[Voting] ..................... (2 of 2) Processing pip2, total=   1.4s
[Voting] ..................... (1 of 2) Processing pip1, total=   2.1s
[Voting] ..................... (2 of 2) Processing pip2, total=   1.4s
train score: 0.887048 (+/-0.003507)
valid score: 0.573097 (+/-0.001854)
[Voting] ..................... (1 of 2) Processing pip1, total=   4.0s
[Voting] ..................... (2 of 2) Processing pip2, total=   2.7s


### 5. Testing data prediction

In [16]:
best_model = voting

y_score = best_model.predict_proba(df_test_x)[:, 1]

df_pred = pd.DataFrame({'Id': test_data['Id'], 'Popularity': y_score})
df_pred.to_csv('../output/test_pred.csv', index=False)