<a href="https://colab.research.google.com/github/christianvadillo/InfoVac/blob/main/train_model_lgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install libraries

In [None]:
# Colab setup: install/upgrade the NLP and experiment-tracking dependencies.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve newer
# releases than the ones this notebook was originally executed with.
!pip install spacy --upgrade -qqq # need 2.3 for download es_core_news_lg
!pip install unidecode -qqq
# Install the Weights & Biases (W&B) client library
!pip install wandb -qqq

# Larger Spanish spaCy models, kept for reference; only the small model is downloaded.
# !python -m spacy download es_core_news_lg -qqq
# !python -m spacy download es_core_news_md -qqq
!python -m spacy download es_core_news_sm -qqq

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')


In [None]:
import pandas as pd
import numpy as np 
import re
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib


import spacy
import nltk
import unicodedata

# import es_core_news_lg
import es_core_news_sm

from nltk.corpus import stopwords


# Silence library warnings for the whole session.
# NOTE(review): this blanket filter also hides deprecation warnings.
warnings.filterwarnings('ignore')
# NLTK resources: 'stopwords' feeds the list below; 'punkt' is the tokenizer
# data used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

# Small Spanish spaCy pipeline (lemmatization + its default stop-word set)
nlp = es_core_news_sm.load()

# Build the stop-word list: spaCy's Spanish defaults merged with NLTK's Spanish
# list and a few manual additions.
sw_es = nlp.Defaults.stop_words
stop_words_es = sw_es.union(stopwords.words('spanish'), ['cnn', 'mas', 'si'])
# Keep only the words shorter than 4 characters. The result is sorted so the
# list order is stable instead of depending on set (hash) iteration order.
stop_words_es = sorted(word for word in stop_words_es if len(word) < 4)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
stop_words_es

['tus',
 'el',
 'hoy',
 'mis',
 'ha',
 'le',
 'y',
 'un',
 'mi',
 'así',
 'he',
 'ver',
 'uno',
 'más',
 'dio',
 'con',
 'e',
 'te',
 'ex',
 'las',
 'ir',
 'qué',
 'día',
 'os',
 'mia',
 'cnn',
 'mal',
 'sus',
 'ni',
 'ser',
 'ti',
 'voy',
 'tal',
 'no',
 'hay',
 'tú',
 'van',
 'son',
 'fin',
 'por',
 'ahí',
 'tu',
 'es',
 'eso',
 'vez',
 'en',
 'tan',
 'se',
 'yo',
 'da',
 'sea',
 'ya',
 'que',
 'de',
 'nos',
 'muy',
 'me',
 'sé',
 'mio',
 'lo',
 'has',
 'al',
 'dia',
 'del',
 'mí',
 'asi',
 'ese',
 'fui',
 'ahi',
 'les',
 'mas',
 'mío',
 'esa',
 'sí',
 'soy',
 'una',
 'fue',
 'o',
 'usa',
 'va',
 'era',
 'ésa',
 'mía',
 'aun',
 'ése',
 'sin',
 'dan',
 'su',
 'si',
 'los',
 'aún',
 'la',
 'qeu',
 'él',
 'a',
 'dar',
 'dos',
 'uso',
 'han']

In [None]:
# Reproducibility: fix the random seeds used throughout the notebook
import os
import random

SEED = 88
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
# is only seen by processes launched afterwards, not by the running kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Initialize W&B


In [None]:
import wandb

# Authenticate this session with W&B (runs themselves are created later by wandb.init)
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mchristian_vadillo[0m (use `wandb login --relogin` to force relogin)


True

# Load Data

In [None]:
# Spanish COVID fake-news dataset (file name dated 21 Oct 2020), read straight from GitHub
file = 'https://raw.githubusercontent.com/christianvadillo/InfoVac/main/data/processed/covid_fakenews_es_utf_21_Oct_2020_17_29.csv'

# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a saved index that is read back as a regular column — confirm it is unwanted.
df = pd.read_csv(file)
df.head()

Unnamed: 0,url,titulo,texto,categoria,fecha_aparicion,organizacion
0,https://www.newtral.es/bulos-coronavirus-pcr-k...,no title,"Estos vídeos de Kary Mullis, inventor del PCR,...",no confiable,2020-10-13 00:00:00+00:00,Newtral
1,https://colombiacheck.com/chequeos/no-ecuador-...,"No, Ecuador no encontró la cura para el COVID-...",Ecuador venció al coronavirus,no confiable,2020-10-08 18:36:17+00:00,ColombiaCheck
2,https://www.telemundo.com/noticias/noticias-te...,Las afirmaciones falsas y engañosas del debate...,"""Trump llamó al coronavirus un ""fraude"".",no confiable,2020-10-08 00:00:00+00:00,Telemundo
3,https://factual.afp.com/la-inscripcion-en-caja...,La inscripción en cajas de tapabocas no prueba...,La inscripción en las cajas de mascarillas pru...,no confiable,2020-10-07 18:36:00+00:00,AFP Factual
4,https://chequeado.com/ultimas-noticias/alesso-...,Alesso (CTERA): “En los lugares donde hay poco...,En los lugares donde hay pocos casos de corona...,no confiable,2020-10-07 00:00:00+00:00,Chequeado


# Metrics

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import make_scorer
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from functools import wraps



def plot_boxplot(scores, metric, ax=None, names=None):
    """Draw one box per model for a given cross-validation metric.

    Parameters
    ----------
    scores : sequence of mappings
        One entry per model; each maps metric keys (e.g. ``'test_f1'``) to the
        values to plot for that model.
    metric : str
        Key to plot, in ``'<prefix>_<name>'`` form (e.g. ``'test_roc_auc'``).
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, a new figure is created and shown.
    names : sequence of str, optional
        Tick label for each model. When omitted, a module-level ``names``
        variable is used if one exists; otherwise generic labels are generated.
    """
    data = [model[metric] for model in scores]
    # Drop only the leading prefix so 'test_roc_auc' becomes 'roc_auc', not 'roc'
    name = metric.split('_', 1)[-1]

    if names is None:
        # Backward compatibility: the labels used to come from a global `names`
        names = globals().get('names')
    if names is None:
        names = [f'model {i}' for i in range(1, len(data) + 1)]

    # pyplot has no set_title/set_xlabel/set_ylabel, so always draw through an Axes
    standalone = ax is None
    if standalone:
        _, ax = plt.subplots()

    ax.boxplot(data, labels=names, showmeans=True)
    ax.set_title(f'{name.capitalize()} scores by model')
    ax.set_xlabel('Model')
    ax.set_ylabel(name.capitalize())

    if standalone:
        plt.show()

        
def roc_auc(y_true, probs_pred):
    """Return the area under the ROC curve for the predicted scores.

    Thin wrapper around ``roc_auc_score`` with the (y_true, scores) signature.
    """
    area = roc_auc_score(y_true, probs_pred)
    return area


def pr_auc(y_true, probs_pred):
    """Summarize the precision-recall curve as a single score.

    Delegates to ``average_precision_score`` and returns its result unchanged.
    """
    area = average_precision_score(y_true, probs_pred)
    return area


def evaluate_model(model, x, y, n_splits=10):
    """Cross-validate ``model`` on ``(x, y)`` with shuffled K-fold splits.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to ``cross_validate``.
    x, y : array-like
        Features and target.
    n_splits : int, default=10
        Number of folds. The default keeps the previous hard-coded value.

    Returns
    -------
    The result of ``cross_validate`` for the accuracy, F1, ROC AUC and PR AUC
    scorers defined below.
    """
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'f1': make_scorer(f1_score),
        # The two AUC scorers are fed predicted probabilities, not class labels
        'roc_auc': make_scorer(roc_auc, needs_proba=True),
        'pr_auc': make_scorer(pr_auc, needs_proba=True),
    }

    # Shuffled folds, seeded with the notebook-wide SEED for repeatable splits
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    scores = cross_validate(model, x, y, cv=cv,
                            scoring=scoring,
                            n_jobs=-1,
                            verbose=0)

    return scores


def track_scores(scores):
    """Log cross-validation results to the active W&B run.

    For each tracked metric, every per-fold value is logged under
    ``cv_score_<metric>`` and the mean across folds is logged once under
    ``mean_<metric>``.

    ``scores`` maps each metric key to an array of fold scores (it must
    support ``.mean()``).
    """
    # Metrics to log
    metrics = ['test_accuracy', 'test_f1', 'test_roc_auc', 'test_pr_auc']
    for metric in metrics:
        for score in scores[metric]:
            wandb.log({f'cv_score_{metric}': score})
        # The mean is the same for every fold, so log it once per metric
        # instead of repeating it (and adding a history step) per fold.
        wandb.log({f'mean_{metric}': scores[metric].mean()})
        wandb.termlog(f'mean_{metric} logged')


def display_scores(scores, metric):
    """Print the mean and a +/- 2*std spread for each requested metric.

    ``metric`` is an iterable of metric names; each one is lower-cased and
    looked up in ``scores`` under the ``'test_<name>'`` key.
    """
    print('=' * 60)
    for m in metric:
        values = scores['test_' + m.lower()]
        mean = values.mean()
        spread = values.std() * 2
        print(f"Metric: {m:>10} {'|':>5} Mean: {mean:.3f} (+/- {spread:.2f})")



# Support functions


In [None]:
# Support libraries
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin, BaseEstimator 

In [None]:
def normalize_document(doc: str, rm_sw: bool = True,
                       rm_symb: bool = True,
                       lemmatize: bool = True) -> str:
    """Normalize one document and return the cleaned string.

    The text is always lower-cased and stripped of surrounding whitespace.
    The optional steps run in this order:

    * ``rm_symb``: strip accents (NFKD decomposition, non-ASCII dropped), then
      delete every character that is not an ASCII letter or whitespace.
    * ``rm_sw``: tokenize with NLTK and drop tokens found in ``stop_words_es``.
    * ``lemmatize``: run the spaCy pipeline ``nlp`` and join the token lemmas.

    Assumes ``doc`` is a string.
    """
    text = doc

    if rm_symb:
        # Accents: decompose, then discard whatever is not plain ASCII
        ascii_bytes = unicodedata.normalize(u'NFKD', text).encode('ascii', 'ignore')
        # Numbers and punctuation: keep only letters and whitespace
        text = re.sub(r'[^a-zA-Z\s]', '', ascii_bytes.decode('utf8'))

    text = text.lower().strip()

    if rm_sw:
        # Rebuild the document from the tokens that are not stop words
        text = ' '.join(tok for tok in nltk.word_tokenize(text)
                        if tok not in stop_words_es)

    if lemmatize:
        text = ' '.join(tok.lemma_.strip() for tok in nlp(text))

    return text


class DenseTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that turns a sparse matrix into its dense form.

    Stateless: ``fit`` learns nothing and ``transform`` returns ``X.todense()``.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op fit; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return the dense version of ``X`` (``X`` must expose ``todense()``)."""
        dense = X.todense()
        return dense


class NormalizeTextTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that applies ``normalize_corpus`` to an array of documents.

    The flags are explicit ``__init__`` keywords rather than ``**params``
    because scikit-learn discovers an estimator's parameters from the
    ``__init__`` signature. With ``**params``, ``get_params()`` came back empty
    and ``clone()`` (used by ``cross_validate`` and similar tools) rebuilt the
    step with every flag unset.

    Parameters
    ----------
    rm_sw : bool, default=False
        Remove stop words.
    rm_symb : bool, default=False
        Remove accents, numbers and punctuation.
    lemmatize : bool, default=False
        Replace tokens with their lemmas.

    An omitted flag stays disabled, as it did before.
    """

    def __init__(self, rm_sw=False, rm_symb=False, lemmatize=False):
        # Store the arguments untouched, as scikit-learn's clone() expects
        self.rm_sw = rm_sw
        self.rm_symb = rm_symb
        self.lemmatize = lemmatize

    def fit(self, X, y=None, **fit_params):
        """No-op fit; the transformer has no state to learn."""
        return self

    def transform(self, X, y=None, **transf_params):
        """Return ``X`` with every document normalized using the stored flags."""
        X_norm = normalize_corpus(X,
                                  rm_sw=self.rm_sw,
                                  rm_symb=self.rm_symb,
                                  lemmatize=self.lemmatize)
        return X_norm

# Array version of normalize_document: np.vectorize calls it once per document,
# forwarding the keyword flags (rm_sw, rm_symb, lemmatize) to each call.
normalize_corpus = np.vectorize(normalize_document)

# Splitting the data

In [None]:
from sklearn.utils import shuffle

In [None]:
# Encode the target: 'confiable' -> 1, 'no confiable' -> 0.
# Guarded so the cell can be re-run safely: calling .map() again on labels that
# are already numeric would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df['categoria']):
    df['categoria'] = df['categoria'].map({'confiable': 1, 'no confiable': 0})

In [None]:
# Shuffle the rows reproducibly, renumber the index and save the result to disk
df = shuffle(df, random_state=SEED).reset_index(drop=True)
df.to_csv('covid-es-dataset.csv', index=False)

# Model input (text column) and target (encoded category)
X, y = df['texto'].values, df['categoria'].values

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of the data as the test set
X_train, X_test, y_train, y_test =\
     train_test_split(X, y, test_size=0.2, random_state=SEED)

# Step 2: take 25% of the remaining 80% as validation -> 60/20/20 train/val/test.
# X_train/y_train are overwritten here, so re-running only this statement would
# split the already-reduced training set again.
# NOTE(review): neither split passes stratify=..., so the class balance of each
# subset is left to chance — confirm this is acceptable.
X_train, X_val, y_train, y_val =\
    train_test_split(X_train, y_train, test_size=0.25, random_state=SEED)

print("Train shapes:", X_train.shape, y_train.shape)
print("Validation shapes:", X_val.shape, y_val.shape)
print("Test shapes:", X_test.shape, y_test.shape)

Train shapes: (4488,) (4488,)
Validation shapes: (1497,) (1497,)
Test shapes: (1497,) (1497,)


# Train model


In [None]:
from lightgbm import LGBMClassifier

# Initial configuration for the training phase
config = {
    "rm_sw": False,            # Remove stop words? (False = keep them)
    "rm_symb": True,          # Remove numbers, special characters, accents
    "lemmatize": True,        # Lemmatize the words
    "ngram_range": (1, 1),    # n-gram range for the vectorizer ((1, 1) = unigrams only)
    'features': 'texto',      # Column used as the input feature
    'random_state': SEED,     # For reproducibility
    'kfolds': 10              # Number of folds for evaluation
    }


# Pipeline steps: text normalization -> token counts -> TF-IDF weighting -> LightGBM.
# Note: `steps` is only consumed by the commented-out Pipeline below.
steps = [
         ('norm', NormalizeTextTransformer(rm_sw=config.get('rm_sw'), 
                                           rm_symb=config.get('rm_symb'),
                                           lemmatize=config.get('lemmatize'))),
         
         ('vect', CountVectorizer(analyzer='word',
                                  ngram_range=config.get('ngram_range'))),
         ('tfidf', TfidfTransformer()),
         ('clf', LGBMClassifier(random_state=SEED, n_jobs=-1))
        ]

# model_pipe = Pipeline(steps=steps, verbose=1)
# model_pipe.fit(X=X_train, y=y_train)

{'rm_sw': False, 'rm_symb': True, 'lemmatize': True}


# Get predictions

In [None]:
# y_pred = model_pipe.predict(X_test)
# print(accuracy_score(y_test, y_pred))
# plot_confusion_matrix(model_pipe, X_test, y_test, values_format='.0f')

In [None]:
# model_pipe.predict_proba(X_test)

# W&B tracking

In [None]:
# initialize wandb run
wandb.init(project='test')
# y_pred = model_pipe.predict(X_test)
# wandb.sklearn.plot_confusion_matrix(y_test, y_pred, model_pipe.classes_)

In [None]:
# Configuration for the W&B hyperparameter sweep: Bayesian search that maximizes
# the 'val_accuracy_score' metric logged by the training function.
sweep_config = {
   'method': 'bayes',
   'metric': {
       'name': 'val_accuracy_score',
       'goal': 'maximize', 
       'target': 0.93
        },
    # NOTE(review): Hyperband early termination compares metrics logged while a
    # run is in progress; the training function logs each metric once, at the
    # end — confirm this setting has any effect.
    'early_terminate':{
        'type': 'hyperband',
        's': 2,
        'eta': 3,
        'max_iter': 27
    },
    # Parameter guide: https://neptune.ai/blog/lightgbm-parameters-guide
    # LightGBM early stopping does not work with 'dart' or 'goss' boosting;
    # try those in a separate run without early stopping.
   'parameters': {
        "boosting_type": {
           'distribution': 'categorical',
           'values': ['dart']
       }, 
        "n_estimators": {
           'distribution': 'int_uniform',
           'min': 200,
           'max': 400
       }, 
        "max_depth": {
           'distribution': 'int_uniform',
           'min': 25,
           'max': 35,
       },
       "num_leaves": {
           'distribution': 'int_uniform',
           'min': 10,
           'max': 30
       }, 
       # log_uniform: min/max are natural-log exponents, so the sampled learning
       # rate lies between exp(-2.5) ~ 0.08 and exp(0.5) ~ 1.65.
       "learning_rate": {
           'distribution': 'log_uniform',
           'min': -2.5,
           'max': 0.5
       },
       "reg_lambda": {
           'distribution': 'uniform',
           'min': 1,
           'max': 6
       },
        "reg_alpha": {
           'distribution': 'uniform',
           'min': 3,
           'max': 6
       },
        "colsample_bytree": {
           'distribution': 'uniform',
           'min': 0.1,
           'max': 1.0
        },
        "subsample_for_bin": {
           'distribution': 'int_uniform',
           'min': 200000,
           'max': 800000
        },
        # LightGBM applies `subsample` only when `subsample_freq` is > 0, so the
        # training function must enable it for this search dimension to matter.
        "subsample": {
           'distribution': 'uniform',
           'min': 0.6,
           'max': 1.0
        },
        "min_child_weight": {
           'distribution': 'int_uniform',
           'min': 1,
           'max': 5
        },

   }
}

sweep_id = wandb.sweep(sweep_config, project='test')



Create sweep with ID: lntzcr95
Sweep URL: https://wandb.ai/christian_vadillo/test/sweeps/lntzcr95


In [None]:
# Normalize each split once up front so every sweep run reuses the cleaned text.
# NOTE(review): these calls use normalize_document's defaults (rm_sw=True,
# rm_symb=True, lemmatize=True), not the flags stored in `config`
# (rm_sw=False) — confirm which preprocessing is intended.
X_train_norm = normalize_corpus(X_train)
X_val_norm = normalize_corpus(X_val)
X_test_norm = normalize_corpus(X_test)

In [None]:
# Prepare data for LightGBM
# dtrain = lgb.Dataset(X_train.values, label=y_train.values)
# dvalid = lgb.Dataset(X_test.values, label=y_test.values)
# watchlist = [dtrain, dvalid]


def _train():
    # Configure and train model
    wandb.init(name="LightGBM_sweep")


    clf = LGBMClassifier(boosting_type=wandb.config.boosting_type,
                          n_estimators=wandb.config.n_estimators,                 
                          max_depth=wandb.config.max_depth,
                          learning_rate=wandb.config.learning_rate,
                          num_leaves=wandb.config.num_leaves,
                          min_child_weight=wandb.config.min_child_weight,
                          subsample=wandb.config.subsample,
                          reg_alpha=wandb.config.reg_alpha,
                          reg_lambda=wandb.config.reg_lambda,
                          colsample_bytree=wandb.config.colsample_bytree,
                          subsample_for_bin=wandb.config.subsample_for_bin,
                          n_jobs=-1, random_state=SEED)

    steps = [
             ('vect', CountVectorizer(analyzer='word',
                                      ngram_range=config.get('ngram_range'))),
             ('tfidf', TfidfTransformer()),
             ('clf', clf)
        ]
    pipe = Pipeline(steps=steps)
    pipe.fit(X_train_norm, y_train)

    # lgbm_config = {"num_leaves": wandb.config.num_leaves,
    #                'boosting': wandb.config.boosting,
    #                "max_depth": wandb.config.max_depth, 
    #                "learning_rate": wandb.config.learning_rate,
    #                "bagging_freq": wandb.config.bagging_freq, 
    #                "bagging_fraction": wandb.config.bagging_fraction,
    #                "feature_fraction": wandb.config.feature_fraction,
    #                "metric": ['mse','accuracy'],
    #                "random_state": seed}

    # lgbm_model = lgb.train(lgbm_config, 
    #                        train_set=dtrain, 
    #                        num_boost_round=999,
    #                        valid_sets=watchlist, 
    #                        callbacks=[wandb_callback()],
    #                        verbose_eval=100,
    #                        early_stopping_rounds=10)
    
    # Create predictions for evaluation
    y_val_preds = pipe.predict(X_val_norm)
    y_test_preds = pipe.predict(X_test_norm)
    
    # W&B log metrics
    wandb.log({'val_accuracy_score': accuracy_score(y_val, y_val_preds)})
    wandb.log({'test_accuracy_score': accuracy_score(y_test, y_test_preds)})


In [None]:
# Run the hyperparameter sweep (Bayesian search, as set in sweep_config['method'])
# NOTE(review): no `count` is passed to the agent — presumably it keeps
# launching runs until interrupted; confirm and consider bounding it.
wandb.agent(sweep_id, function=_train)

[34m[1mwandb[0m: Agent Starting Run: 9nvdvsfv with config:
[34m[1mwandb[0m: 	boosting_type: dart
[34m[1mwandb[0m: 	colsample_bytree: 0.5441965326524222
[34m[1mwandb[0m: 	learning_rate: 0.5565163570942929
[34m[1mwandb[0m: 	max_depth: 33
[34m[1mwandb[0m: 	min_child_weight: 2
[34m[1mwandb[0m: 	n_estimators: 311
[34m[1mwandb[0m: 	num_leaves: 15
[34m[1mwandb[0m: 	reg_alpha: 5.7059618388530895
[34m[1mwandb[0m: 	reg_lambda: 1.883514542070742
[34m[1mwandb[0m: 	subsample: 0.9347514977946791
[34m[1mwandb[0m: 	subsample_for_bin: 311964


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
val_accuracy_score,0.91784
_step,1.0
_runtime,13.0
_timestamp,1604703610.0
test_accuracy_score,0.90648


0,1
val_accuracy_score,▁
_step,▁█
_runtime,▁▁
_timestamp,▁▁
test_accuracy_score,▁


[34m[1mwandb[0m: Agent Starting Run: jd8sqyot with config:
[34m[1mwandb[0m: 	boosting_type: dart
[34m[1mwandb[0m: 	colsample_bytree: 0.13300907499877512
[34m[1mwandb[0m: 	learning_rate: 0.3146544241014905
[34m[1mwandb[0m: 	max_depth: 27
[34m[1mwandb[0m: 	min_child_weight: 5
[34m[1mwandb[0m: 	n_estimators: 348
[34m[1mwandb[0m: 	num_leaves: 18
[34m[1mwandb[0m: 	reg_alpha: 3.131946422340626
[34m[1mwandb[0m: 	reg_lambda: 2.4578824606997207
[34m[1mwandb[0m: 	subsample: 0.6998829680702567
[34m[1mwandb[0m: 	subsample_for_bin: 538058


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
val_accuracy_score,0.92251
_step,1.0
_runtime,11.0
_timestamp,1604703625.0
test_accuracy_score,0.9018


0,1
val_accuracy_score,▁
_step,▁█
_runtime,▁▁
_timestamp,▁▁
test_accuracy_score,▁


[34m[1mwandb[0m: Agent Starting Run: 7hrri35t with config:
[34m[1mwandb[0m: 	boosting_type: dart
[34m[1mwandb[0m: 	colsample_bytree: 0.6495472318621289
[34m[1mwandb[0m: 	learning_rate: 0.09626151938740096
[34m[1mwandb[0m: 	max_depth: 29
[34m[1mwandb[0m: 	min_child_weight: 3
[34m[1mwandb[0m: 	n_estimators: 334
[34m[1mwandb[0m: 	num_leaves: 17
[34m[1mwandb[0m: 	reg_alpha: 4.540320489843758
[34m[1mwandb[0m: 	reg_lambda: 4.97721486071872
[34m[1mwandb[0m: 	subsample: 0.6578055175852378
[34m[1mwandb[0m: 	subsample_for_bin: 633325


# Individual run

In [None]:
# # Initial configuration for the training phase 
# config = {
#     "rm_sw": False,
#     "rm_symb": False,
#     "lemmatize": True,
#     "ngram_range": (1, 1),
#     'features': 'title',
#     'random_state': SEED,
#     'kfolds': 10
#     }

# config['model'] = 'KNN'

# steps = [('norm', NormalizeTextTransformer(rm_sw=config.get('rm_sw'), 
#                                            rm_symb=config.get('rm_symb'),
#                                            lemmatize=config.get('lemmatize'))),
#          ('vect', CountVectorizer(ngram_range=config.get('ngram_range'))),
#          ('tfidf', TfidfTransformer()),
#          ('KNN', KNeighborsClassifier())]

# pipe = Pipeline(steps=steps)
# run_experiment(config, pipe, X, y)

In [None]:
# 2x2 grid of box plots, one panel per cross-validation metric.
# NOTE(review): `scores_all` is not defined in any cell shown in this notebook —
# confirm it is created (e.g. from evaluate_model results) before running this.
fig, axs = plt.subplots(2, 2, sharex=False, sharey=False, figsize=(10,8))
axs = axs.ravel()  # To get a flattened array

for i, m in enumerate(['test_accuracy', 'test_f1', 'test_roc_auc', 'test_pr_auc']):
    plot_boxplot(scores_all, m, axs[i])
plt.tight_layout()