In [None]:
#@title Installing and upgrading packages
!pip install lightgbm --upgrade
!pip install optuna
!pip install xgboost --upgrade

In [None]:
#@title Importing the libraries and loading the data
# Main libraries
import pandas as pd
pd.set_option("display.precision", 4)
import numpy as np
from collections import Counter

# Text processing libraries
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
# Optuna packages
import optuna.integration.lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation
import optuna
import xgboost as xgb

# Importing the dataset
df = pd.read_csv('reviews.tsv', delimiter='\t',
                 quoting = 3) # 3 is for ignoring "" (double quotes)

In [3]:
#@title NLP object
class NLP(object):
  def __init__(self, data: pd.DataFrame):
    self.df = data
    self.y = self.df.iloc[:, -1].values
  
  def create_stopwords(self) -> list:
    """
    Creates English stopwords from nltk package,
    Excludes negative words for further sentiment analysis
    Returns a list of stopwords
    """
    self.all_stopwords = stopwords.words('english')
    negative_words = ['not', "don't", "aren't", "didn't", "hadn't", "hasn't", 
                  "haven't", "wasn't", "weren't"]
    for word in negative_words:
      self.all_stopwords.remove(word)
    return self.all_stopwords

  def process_review(self, index: int) -> list:
    """
    Takes the index to the DataFrame,
    first removes punctuations,
    and second lowers the letters.
    Returns a list including strings.
    """
    review = re.sub('[^a-zA-Z]', ' ', self.df.iloc[:, 0][index])
    review = review.lower()
    return review.split()

  def remove_special_words(self, reviews: list) -> list:
    """
    Takes the stemmed or lemmatized reviews,
    Identifies the words used once as a special word,
    Returns the reviews without the special words.
    """
    words = [word for s in reviews for word in s.split()]
    counted = Counter(words)
    special_words = [word for word in counted.keys() if counted[word] == 1]
    final_reviews = reviews[:]
    for i, sentence in enumerate(final_reviews):
      new_sentence = []
      for word in sentence.split():
        if word not in special_words:
          new_sentence.append(word)
      final_reviews[i] = ' '.join(new_sentence)
    return final_reviews

  def apply_stemmer(self) -> list:
    """
    Fetches the reviews from the DataFrame,
    (I) process the individual review via process_review(),
    (II) removes the stopwords taken from create_stopwords(),
    (III) removes the conjuctions of verbs (e.g. loved -> love) via Stemmer,
    (IV) removes the special_words via remove_special_words().
    Returns a list for Bag-of-words model
    """
    corpus, stop_words = [], self.create_stopwords()
    for i in range(self.df.shape[0]):
      # Step I
      review = self.process_review(i)
      # Step II and III
      stemmer = SnowballStemmer("english", ignore_stopwords=True)
      review = [stemmer.stem(word) for word in review
              if not word in set(stop_words)]
      corpus.append(' '.join(review))

    # Step IV
    review = self.remove_special_words(corpus)
    return corpus
  
  def apply_lemmatizer(self) -> list:
    """
    Fetches the reviews from the DataFrame,
    (I) process the individual review via process_review(),
    (II) removes the stopwords taken from create_stopwords(),
    (III) removes the conjuctions of verbs (e.g. loved -> love) via Lemmatizer,
    (IV) removes the special_words via remove_special_words().
    Returns a list for Bag-of-words model
    """
    corpus, stop_words = [], self.create_stopwords()
    for i in range(self.df.shape[0]):
      # Step I
      review = self.process_review(i)
      # Step II and III
      lemmatizer = WordNetLemmatizer()
      review = [lemmatizer.lemmatize(word, pos = 'v') for word in review
              if not word in set(stop_words)]
      corpus.append(' '.join(review))
    
    # Step IV
    review = self.remove_special_words(corpus)
    return corpus

  def bow(self) -> dict:
    """
    Fetches the data and applies first stemmer and then lemmatizer
    to build the bag-of-word vector.
    Returns a dict including vectorized data for the stemmer and lemmatizer.
    """
    reviews_stemmer = self.apply_stemmer()
    reviews_lemmatizer = self.apply_lemmatizer()
    dataset = {'Stemmer': reviews_stemmer, 'Lemmatizer' : reviews_lemmatizer}
    for name, r in dataset.items():
      cv = CountVectorizer()
      dataset[name] = cv.fit_transform(r).toarray()
    return dataset

In [4]:
#@title Optuna Lightgbm tuner
def tuner(data, target):
  # Splitting the data
  X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                    test_size = 0.25,
                                                    random_state = 42)
  
  # Setting up the data for the model
  dtrain = lgb.Dataset(X_train, label = y_train)
  dtest = lgb.Dataset(X_test, label = y_test)

  # Parameters for classification
  params = {
      'objective' : 'binary',
      'metric' : 'binary_logloss',
      'verbosity' : -1,
      'boosting_type' : 'gbdt'
  }

  # Training the model
  model = lgb.train(
      params,
      dtrain,
      valid_sets = [dtrain, dtest],
      callbacks = [early_stopping(100), log_evaluation(100)]
  )

  # Results
  y_pred = np.rint(model.predict(X_test, 
                                     num_iteration = model.best_iteration))
  score = f1_score(y_test, y_pred)
  return score

In [5]:
#@title Optuna XGBoost tuner

def objective(trial):
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    model = xgb.train(param, dtrain, evals=[(dvalid, "validation")], callbacks=[pruning_callback])

    y_pred = np.rint(model.predict(dvalid))
    score = f1_score(y_test, y_pred)
    return score

In [None]:
#@title Training the LightGBM
model = NLP(df)
dataset = model.bow()
lgbm_results = {}
for name, X in dataset.items():
  lgbm_results[name] = tuner(X, model.y)

In [None]:
#@title Training the XGBoost
xgb_results = {}
for name, X in dataset.items():
  X_train, X_test, y_train, y_test = train_test_split(X, model.y, 
                                                          test_size=0.25, 
                                                          random_state = 42)
  # Find the best parameters with Optuna
  study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize"
    )
  study.optimize(objective, n_trials=100)
  best_params =  study.best_trial.params

  # Train the XGBClassifier with the best params
  classifier = xgb.XGBClassifier(objective = "binary:logistic", eval_metric = "auc",
                                 use_label_encoder=False ,**best_params)
  classifier.fit(X_train, y_train)
  y_pred = np.rint(classifier.predict(X_test))
  xgb_results[name] = f1_score(y_test, y_pred)

In [9]:
#@title Results
scores = np.concatenate((np.array([list(lgbm_results.values())]), 
                         np.array([list(xgb_results.values())])))
results = pd.DataFrame(scores, columns = list(lgbm_results.keys()),
                       index = ['LightGBM', 'XGBoost'])
results.style

Unnamed: 0,Stemmer,Lemmatizer
LightGBM,0.6762,0.6827
XGBoost,0.7706,0.7897
