<a href="https://colab.research.google.com/github/TA-aiacademy/course_3.0/blob/v2-5_nlp/09_v2-5_NLP/Part2/06-2_predict_ans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build prediction models and compare each feature set

In [None]:
import pandas as pd
import xgboost as xgb
import pickle
import numpy as np
import os

from gensim.models import Doc2Vec, doc2vec

In [None]:
# 上傳資料
!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_6.zip
!unzip -q NLP_part2_6.zip

In [None]:
# Load the preprocessed article table from the unpacked data folder.
df = pd.read_csv(filepath_or_buffer='Data/article_preprocessed.csv')

In [None]:
# Keep only articles whose |push - boo| gap is strictly larger than the threshold;
# rows at or below the threshold are dropped.
diff_threshold = 20
df = df.loc[(df['push'] - df['boo']).abs() > diff_threshold].copy()

In [None]:
# Build the binary target: the push/boo difference clipped into [0, 1]
# (negative differences become 0, differences of 1 or more become 1),
# then renumber the rows 0..n-1 so index labels match array positions.
df = df.assign(type=(df['push'] - df['boo']).clip(lower=0, upper=1)).reset_index(drop=True)

In [None]:
# Show how many articles carry each label value, as a class-balance check.
df['type'].value_counts()

In [None]:
# Allocate the zero-filled scaffold shared by every feature set:
# one row per article and 258 float columns. Columns 0 and 1 are filled with
# idx/type afterwards; the remaining 256 are left for features.
n_articles = len(df)
basic_data = np.zeros((n_articles, 258))

In [None]:
# Column 0 <- article idx, column 1 <- binary label, written in one assignment.
basic_data[:, :2] = df[['idx', 'type']].to_numpy()

## bag of words

In [None]:
# Load the pickled bag-of-words result; it unpacks into two items and only the
# second (the count matrix) is used here.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data/article_count', 'rb') as count_file:
    count_payload = pickle.load(count_file)
_unused, count = count_payload

In [None]:
# Rank words by document frequency (number of rows with a non-zero count)
# and keep the column ids of the 256 most frequent ones, highest first.
count_doc_freq = np.array((count > 0).sum(axis=0))[0]
most_count_id = count_doc_freq.argsort()[::-1][:256]

In [None]:
# subset data: keep only the columns of the selected top words.
# NOTE: this rebinds `count` to the reduced matrix, so re-running this cell alone
# would index the already-reduced matrix with the original column ids.
count = count[:, most_count_id]

In [None]:
# Integer copy of the idx/label scaffold (astype already returns a new array).
count_data = basic_data.astype(int)

In [None]:
# Column 0 holds each article's idx, used here as the row position in the
# bag-of-words matrix; copy those rows (densified) into the feature columns.
count_rows = count_data[:, 0]
count_data[:, 2:] = count[count_rows].toarray()

## TF-IDF

In [None]:
# Load the pickled TF-IDF result; it unpacks into two items and only the
# second (the TF-IDF matrix) is used here.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data/article_tfidf', 'rb') as tfidf_file:
    tfidf_payload = pickle.load(tfidf_file)
_unused, tfidf = tfidf_payload

In [None]:
# Rank words by document frequency (number of rows with a non-zero weight)
# and keep the column ids of the 256 most frequent ones, highest first.
tfidf_doc_freq = np.array((tfidf > 0).sum(axis=0))[0]
most_tfidf_id = tfidf_doc_freq.argsort()[::-1][:256]

In [None]:
# subset data: keep only the columns of the selected top words.
# NOTE: this rebinds `tfidf` to the reduced matrix, so re-running this cell alone
# would index the already-reduced matrix with the original column ids.
tfidf = tfidf[:, most_tfidf_id]

In [None]:
# Integer copy of the idx/label scaffold (astype already returns a new array);
# column 0 is later used as an integer row index into the TF-IDF matrix.
tfidf_data = basic_data.astype(int)

In [None]:
# subset tf-idf matrix
# Writing TF-IDF weights into an integer array truncates every fractional
# weight (most become 0), so take the integer row ids first and then switch the
# array to float before copying the weights in.
tfidf_rows = tfidf_data[:, 0].astype(int)
tfidf_data = tfidf_data.astype(float)
tfidf_data[:, 2:] = tfidf[tfidf_rows].toarray()

## average word2vec

In [None]:
# Load the pickled averaged-word2vec article vectors.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data/avg_article_vector', 'rb') as avg_file:
    avg_vector = pickle.load(avg_file)

In [None]:
# Independent float copy of the idx/label scaffold for the averaged word2vec features.
avg_data = np.copy(basic_data)

In [None]:
# select rows of average word2vec
# Iterate over the `idx` column directly instead of DataFrame.iterrows():
# iterrows builds a full Series per row (slow) and can upcast integer ids when a
# row is coerced to a common dtype.
# Assumes each avg_vector entry fits the 256 feature columns - TODO confirm.
for row_pos, article_idx in zip(df.index, df['idx']):
    avg_data[row_pos, 2:] = avg_vector[article_idx]

## doc2vec

In [None]:
# load doc2vec model
# NOTE: the name `model` is rebound to XGBoost classifiers in the training cells
# below, so the doc2vec feature cell must run before any of them.
model = Doc2Vec.load('word2vec_model/doc2vec')

In [None]:
# Independent float copy of the idx/label scaffold for the doc2vec features.
doc2vec_data = np.copy(basic_data)

In [None]:
# select idx of doc2vec
# gensim 4 renamed `docvecs` to `dv` (the old name is a deprecated alias); resolve
# the container once, outside the loop, and fall back to `docvecs` on older gensim.
doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs

# Iterate over the `idx` column directly instead of DataFrame.iterrows(), which
# builds a Series per row and can turn an integer id into a float (e.g. '12.0'
# after str()) when the row is coerced to a common dtype. Document tags are
# looked up by the string form of idx.
for row_pos, article_idx in zip(df.index, df['idx']):
    doc2vec_data[row_pos, 2:] = doc_vectors[str(article_idx)]

# prediction model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# split data to training and testing data (80/20, stratified on the label).
# A fixed random_state makes the split - and therefore every AUC reported below -
# reproducible across kernel restarts.
train, test = train_test_split(
    df, test_size=0.2, stratify=df['type'], random_state=42
)
# Row labels of each subset; the feature arrays are indexed with these positions.
train_idx = np.array(train.index)
test_idx = np.array(test.index)

In [None]:
# Empty mapping that will collect one test score per feature set.
result = dict()

## Train models with XGBoost

In [None]:
# bag of words
# eval_metric is set on the estimator: passing it to fit() is deprecated since
# xgboost 1.6 and rejected by xgboost 2.x.
model = xgb.XGBClassifier(eval_metric='auc')
model.fit(
    count_data[train_idx, 2:], count_data[train_idx, 1],
    eval_set=[(count_data[test_idx, 2:], count_data[test_idx, 1])],
)

# testing auc: AUC on the held-out set after the final boosting round
result['bag_of_words'] = model.evals_result()['validation_0']['auc'][-1]

In [None]:
# tf-idf
# eval_metric is set on the estimator: passing it to fit() is deprecated since
# xgboost 1.6 and rejected by xgboost 2.x.
model = xgb.XGBClassifier(eval_metric='auc')
model.fit(
    tfidf_data[train_idx, 2:], tfidf_data[train_idx, 1],
    eval_set=[(tfidf_data[test_idx, 2:], tfidf_data[test_idx, 1])],
)

# testing auc: AUC on the held-out set after the final boosting round
result['tf-idf'] = model.evals_result()['validation_0']['auc'][-1]

In [None]:
# average word2vec
# eval_metric is set on the estimator: passing it to fit() is deprecated since
# xgboost 1.6 and rejected by xgboost 2.x.
model = xgb.XGBClassifier(eval_metric='auc')
model.fit(
    avg_data[train_idx, 2:], avg_data[train_idx, 1],
    eval_set=[(avg_data[test_idx, 2:], avg_data[test_idx, 1])],
)

# testing auc: AUC on the held-out set after the final boosting round
result['avg_word2vec'] = model.evals_result()['validation_0']['auc'][-1]

In [None]:
# doc2vec
# eval_metric is set on the estimator: passing it to fit() is deprecated since
# xgboost 1.6 and rejected by xgboost 2.x.
model = xgb.XGBClassifier(eval_metric='auc')
model.fit(
    doc2vec_data[train_idx, 2:], doc2vec_data[train_idx, 1],
    eval_set=[(doc2vec_data[test_idx, 2:], doc2vec_data[test_idx, 1])],
)

# testing auc: AUC on the held-out set after the final boosting round
result['doc2vec'] = model.evals_result()['validation_0']['auc'][-1]

## Plot results

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart comparing the final test AUC of each feature set.
# dict views are not sequences, so materialise them as lists before handing them
# to matplotlib, and size the x positions from the data instead of hard-coding 4.
feature_names = list(result.keys())
auc_scores = list(result.values())
bar_positions = np.arange(len(feature_names))

plt.bar(bar_positions, auc_scores)
plt.xticks(bar_positions, feature_names)
plt.ylabel('test AUC')
plt.show()