#### Libraries

In [None]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt

import sklearn.model_selection
import sklearn.preprocessing as preproc
from sklearn.feature_extraction import text

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix, ConfusionMatrixDisplay, confusion_matrix

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, Dense, Dropout, MaxPooling1D
from tensorflow import keras
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier
from gc import callbacks

import pickle
import warnings

#### Data preprocessing

#### Feature engineering

#### Machine learning 

In [None]:
# Logger
# Experiment tracking with Weights & Biases: the later cells log their
# sklearn plots and Keras training curves through this module.

import wandb
from wandb.keras import WandbCallback

# Authenticate this session with W&B.
wandb.login()
# Open a run in the "Title classifier" project.
wandb.init(project="Title classifier")

In [None]:

# Logistic regression
# Tune the inverse regularisation strength C with 10-fold cross-validation,
# score the refit best model on the held-out test split, and log it to W&B.

param_log = {'solver': ['lbfgs'],
             'C': [1, 5, 10, 50, 100],
             # 'l2' (letter L): the previous value '12' is not a valid penalty,
             # so every candidate in the grid failed to fit.
             'penalty': ['l2']}

# Create model
logr_clf = LogisticRegression(max_iter=10000)

# 10 fold cv hyper-parameters tuning
clf_log = GridSearchCV(logr_clf,
                       param_grid=param_log,
                       cv=10,
                       scoring='accuracy',
                       refit=True)

# Fit
clf_log.fit(X_train, y_train)

print("Best parameters", clf_log.best_params_)
print("Best score", clf_log.best_score_)

# Predict on test (hard labels for the report, probabilities for the W&B plots)
logr_y_pred = clf_log.predict(X_test)
logr_y_probas = clf_log.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test,logr_y_pred)))
print('Classification report\n', classification_report(y_test, logr_y_pred))

# plot_classifier also needs the class probabilities and the label names.
# The refit best estimator is passed instead of the search wrapper so that any
# refitting wandb does for its plots does not re-run the whole grid search.
wandb.sklearn.plot_classifier(clf_log.best_estimator_, X_train, X_test, y_train, y_test, logr_y_pred,
                              y_probas=logr_y_probas,
                              labels=[str(label) for label in clf_log.classes_],
                              model_name='Logistic regression', feature_names=None)

In [None]:

# Svc
# Tune an RBF-kernel support vector classifier (C and gamma) with 10-fold
# cross-validation, score it on the test split, and log it to W&B.

param_svc = {'kernel': ['rbf'],
             'C': [0.1, 1, 5, 10, 50, 100],
             'gamma': [0.01, 0.1, 1, 5, 10, 50, 100]}

# Create model
# probability=True enables predict_proba, which the W&B plots below need.
# It adds an internal cross-validated calibration to every fit, so the search
# is slower; random_state keeps that calibration reproducible.
svc_clf = SVC(probability=True, random_state=41)

# 10 fold cv hyper-parameters tuning
clf_svc = GridSearchCV(svc_clf,
                       param_grid=param_svc,
                       cv=10,
                       scoring='accuracy',
                       refit=True)
# Fit
clf_svc.fit(X_train, y_train)

print("Best parameters", clf_svc.best_params_)
print("Best score", clf_svc.best_score_)

# Predict on test (hard labels for the report, probabilities for the W&B plots)
svc_y_pred = clf_svc.predict(X_test)
svc_y_probas = clf_svc.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test,svc_y_pred)))
print('Classification report\n', classification_report(y_test, svc_y_pred))

# plot_classifier also needs the class probabilities and the label names.
# The refit best estimator is passed instead of the search wrapper so that any
# refitting wandb does for its plots does not re-run the whole grid search.
wandb.sklearn.plot_classifier(clf_svc.best_estimator_, X_train, X_test, y_train, y_test, svc_y_pred,
                              y_probas=svc_y_probas,
                              labels=[str(label) for label in clf_svc.classes_],
                              model_name='Support vector', feature_names=None)

In [None]:

# Random Forest
# Tune forest size, depth, split criterion and per-split feature sampling with
# 10-fold cross-validation, score on the test split, and log to W&B.

param_rf = {
    'n_estimators': [5, 10, 20, 50, 100, 150, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}

# Fixed seed so the bootstrap/feature sampling is reproducible across runs.
rf_clf = RandomForestClassifier(random_state=41)

clf_rf = GridSearchCV(estimator=rf_clf,
                      param_grid=param_rf,
                      scoring='accuracy',
                      cv=10,
                      refit=True)

clf_rf.fit(X_train, y_train)

print("Best parameters", clf_rf.best_params_)
print("Best score", clf_rf.best_score_)

# Hard labels for the report, probabilities for the W&B plots.
rfc_y_pred = clf_rf.predict(X_test)
rfc_y_probas = clf_rf.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test, rfc_y_pred)))
print("Classification report: \n", classification_report(y_test, rfc_y_pred))

# plot_classifier also needs the class probabilities and the label names.
# The refit best estimator is passed instead of the search wrapper so that any
# refitting wandb does for its plots does not re-run the whole grid search.
wandb.sklearn.plot_classifier(clf_rf.best_estimator_, X_train, X_test, y_train, y_test, rfc_y_pred,
                              y_probas=rfc_y_probas,
                              labels=[str(label) for label in clf_rf.classes_],
                              model_name='Random forest', feature_names=None)

In [None]:

# Decision tree
# Tune depth, cost-complexity pruning, split criterion and per-split feature
# sampling with 10-fold cross-validation, score on the test split, log to W&B.

param_dt = {'max_features': ['sqrt', 'log2'],
            'ccp_alpha': [0.5, 0.1, .01, .001],
            'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
            'criterion': ['gini', 'entropy']
            }

# max_features < n_features makes split selection random, so seed the tree
# (same seed as the random-forest cell) to keep the search reproducible.
dt_clf = DecisionTreeClassifier(random_state=41)

clf_dt = GridSearchCV(dt_clf,
                      param_grid=param_dt,
                      scoring='accuracy',
                      cv=10,
                      refit=True)

clf_dt.fit(X_train, y_train)

print("Best parameters", clf_dt.best_params_)
print("Best score", clf_dt.best_score_)

# Hard labels for the report, probabilities for the W&B plots.
dt_y_pred = clf_dt.predict(X_test)
dt_y_probas = clf_dt.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test, dt_y_pred)))
print("Classification report: \n", classification_report(y_test, dt_y_pred))

# plot_classifier also needs the class probabilities and the label names.
# The refit best estimator is passed instead of the search wrapper so that any
# refitting wandb does for its plots does not re-run the whole grid search.
wandb.sklearn.plot_classifier(clf_dt.best_estimator_, X_train, X_test, y_train, y_test, dt_y_pred,
                              y_probas=dt_y_probas,
                              labels=[str(label) for label in clf_dt.classes_],
                              model_name='Decision tree', feature_names=None)

In [None]:
# KNN
# Tune the number of neighbours (1-10) with 10-fold cross-validation, score
# the refit best model on the test split, and log it to W&B.

param_knn = {
    'n_neighbors': list(range(1, 11)),
}

knn_clf = KNeighborsClassifier()

# Search over the KNN estimator and its own grid. The previous version passed
# rf_clf / param_rf here, so this cell silently re-tuned the random forest and
# reported its results under the "KNN" name.
clf_knn = GridSearchCV(estimator=knn_clf,
                       param_grid=param_knn,
                       scoring='accuracy',
                       cv=10,
                       refit=True)

clf_knn.fit(X_train, y_train)

print("Best parameters", clf_knn.best_params_)
print("Best score", clf_knn.best_score_)

# Hard labels for the report, probabilities for the W&B plots.
knn_y_pred = clf_knn.predict(X_test)
knn_y_probas = clf_knn.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test, knn_y_pred)))
print("Classification report: \n", classification_report(y_test, knn_y_pred))

# plot_classifier also needs the class probabilities and the label names.
# The refit best estimator is passed instead of the search wrapper so that any
# refitting wandb does for its plots does not re-run the whole grid search.
wandb.sklearn.plot_classifier(clf_knn.best_estimator_, X_train, X_test, y_train, y_test, knn_y_pred,
                              y_probas=knn_y_probas,
                              labels=[str(label) for label in clf_knn.classes_],
                              model_name='KNN', feature_names=None)

In [None]:
# XGB
# Tune a gradient-boosted tree classifier with 10-fold cross-validation, score
# the refit best model on the test split, and log it to W&B.

# NOTE(review): this grid is 19 * 4 * 5 * 4 * 3 * 3 * 7 = 95,760 candidates,
# i.e. roughly 960k fits at cv=10. Consider shrinking the grid or switching to
# a randomised search before running this cell end-to-end.
param_xgb = {
        'n_estimators': range(20, 400, 20),
        'learning_rate': [1e-1, 1e-2, 5e-3, 5e-4],
        'min_child_weight': [1, 3, 5, 7, 9],
        'gamma': [0.5, 1, 1.5, 2],
        'subsample': [0.5, 0.75, 1.0],
        'colsample_bytree': [0.5, 0.75, 1.0],
        'max_depth': [2, 3, 4, 5, 6, 7, 8]
        }

xgb_clf = XGBClassifier(objective='binary:logistic', seed=41)

clf_xgb = GridSearchCV(estimator=xgb_clf,
                       param_grid=param_xgb,
                       scoring='accuracy',
                       cv=10,
                       refit=True)

clf_xgb.fit(X_train, y_train)

print("Best parameters", clf_xgb.best_params_)
print("Best score", clf_xgb.best_score_)

# Hard labels for the report, probabilities for the W&B plots.
xgb_y_pred = clf_xgb.predict(X_test)
xgb_y_probas = clf_xgb.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test, xgb_y_pred)))
print("Classification report: \n", classification_report(y_test, xgb_y_pred))

# plot_classifier also needs the class probabilities and the label names.
# The refit best estimator is passed instead of the search wrapper so that any
# refitting wandb does for its plots does not re-run the whole grid search.
wandb.sklearn.plot_classifier(clf_xgb.best_estimator_, X_train, X_test, y_train, y_test, xgb_y_pred,
                              y_probas=xgb_y_probas,
                              labels=[str(label) for label in clf_xgb.classes_],
                              model_name='XGB', feature_names=None)

#### Deep learning

##### Tuning

In [None]:
# Sizes shared by every CNN built below: embedding width, input sequence
# length, Embedding input_dim, and number of output units.
embed_len, max_tokens, max_words, n_classes = 512, 300, 100, 2

# Exhaustive grid over the dense-layer width ('neurons'), the number of
# convolution filters ('f') and the batch size ('bs').
sweep_config = dict(
    method='grid',
    parameters=dict(
        neurons=dict(values=[32, 64, 128]),
        f=dict(values=[64, 128, 256]),
        bs=dict(values=[8, 16, 32, 64]),
    ),
)

# Register the sweep with W&B and keep its id for the agent cell.
sweep_id = wandb.sweep(sweep_config)

In [None]:
def train():
    """Build and fit one CNN configuration for a W&B sweep trial.

    Intended to be passed to ``wandb.agent``: each call opens its own run,
    reads the swept hyper-parameters (``f`` conv filters, ``neurons`` dense
    units, ``bs`` batch size) from ``wandb.config`` and trains on the
    notebook-level ``X_train`` / ``y_train`` with a 10% validation split.
    Returns nothing; metrics are reported through ``WandbCallback``.
    """
    # Fallback values for any hyper-parameter the sweep does not supply.
    configs = {
        'neurons': 32,
        'f': 64,
        'bs': 8
    }

    # Start the run for this trial. Without an init inside the sweep function,
    # wandb.config belongs to whatever run was opened earlier in the notebook
    # and does not contain the swept 'f' / 'neurons' / 'bs' values.
    wandb.init(config=configs)

    # Specify the other hyperparameters to the configuration
    config = wandb.config
    config.epochs = 50

    model = keras.Sequential(
        [
            keras.Input(shape=(max_tokens, )),

            layers.Embedding(input_dim=max_words,
                             output_dim=embed_len,
                             input_length=max_tokens),

            # Block 1
            # 1-D layers take a single window length; the previous (3, 3)
            # tuples are 2-D sizes and are rejected by Conv1D / MaxPooling1D.
            layers.Conv1D(config.f, kernel_size=3, activation="relu"),
            layers.MaxPooling1D(pool_size=3),
            layers.Dropout(0.3),

            # Block 2
            layers.Conv1D(config.f, kernel_size=3, activation="relu"),
            layers.MaxPooling1D(pool_size=3),
            layers.Dropout(0.3),

            # FC 1
            layers.Flatten(),
            layers.Dense(config.neurons, activation="relu"),

            # Output
            layers.Dense(n_classes, activation="softmax")
        ]
    )

    # NOTE(review): binary_crossentropy with a 2-unit softmax only matches
    # one-hot targets; if y_train holds integer class ids, use
    # 'sparse_categorical_crossentropy' instead -- confirm the label encoding.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=config.epochs, batch_size=config.bs,
              validation_split=0.1, callbacks=[WandbCallback()])
   

In [None]:
# Start a sweep agent for sweep_id, handing it train as the function to run
# for each trial.
wandb.agent(sweep_id, function=train)

In [None]:
# # Create model

# embed_len = 512
# max_tokens = 300
# max_words = 100
# n_classes = 2

# def create_model(f, neuron):
    
#     model = keras.Sequential(
#         [
#             keras.Input(shape=(max_tokens, )),
            
#             layers.Embedding(input_dim=max_words, 
#                              output_dim=embed_len,  
#                              input_length=max_tokens),
            
#             # Block 1
#             layers.Conv1D(f, kernel_size=(3, 3), activation="relu"),
#             layers.MaxPooling1D(pool_size=(3, 3)),
#             layers.Dropout(0.3),                  
            
#             # Block 2
#             layers.Conv1D(f, kernel_size=(3, 3), activation="relu"),
#             layers.MaxPooling1D(pool_size=(3, 3)),
#             layers.Dropout(0.3),                
            
#             # FC 1
#             layers.Flatten(),                                     
#             layers.Dense(neuron, activation="relu"),             
            
#             # Output 
#             layers.Dense(n_classes, activation="softmax")     
#         ]
#     )
    
#     model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

#     return model

In [None]:
# model = KerasClassifier(build_fn=create_model,
#                         f=[128,256],
#                         epochs=[50],
#                         neuron=[32,64],
#                         batch_size=[32,64],)

# # Model params
# params={'batch_size':[32,64], 
#         'epochs':[50],
#         'neuron':[32,64],
#         'f':[128,256],
#         }

# # Grid search cv
# gs = GridSearchCV(estimator=model, 
#                   param_grid=params, 
#                   cv=10,
#                   refit=True)
# # Fit
# gs = gs.fit(X_train, y_train)

# print('Best acc:', gs.best_score_)
# print('Best params:', gs.best_params_)

# mean = gs.cv_results_['mean_test_score']
# stds = gs.cv_results_['std_test_score']
# params = gs.cv_results_['params']

# for mean, stdev, param in zip(mean, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))


##### Training best model

In [None]:
# Retrain with best parameters
# Same architecture as the sweep model, fixed to 256 conv filters, 64 dense
# units and batch size 32.

model = keras.Sequential(
    [
        keras.Input(shape=(max_tokens, )),

        layers.Embedding(input_dim=max_words,
                         output_dim=embed_len,
                         input_length=max_tokens),

        # Block 1
        # 1-D layers take a single window length; the previous (3, 3) tuples
        # are 2-D sizes and are rejected by Conv1D / MaxPooling1D.
        layers.Conv1D(256, kernel_size=3, activation="relu"),
        layers.MaxPooling1D(pool_size=3),
        layers.Dropout(0.3),

        # Block 2
        layers.Conv1D(256, kernel_size=3, activation="relu"),
        layers.MaxPooling1D(pool_size=3),
        layers.Dropout(0.3),

        # FC 1
        layers.Flatten(),
        layers.Dense(64, activation="relu"),

        # Output
        layers.Dense(n_classes, activation="softmax")
    ]
)

# View summary
model.summary()

# NOTE(review): binary_crossentropy with a 2-unit softmax only matches one-hot
# targets; if y_train holds integer class ids, use
# 'sparse_categorical_crossentropy' instead -- confirm the label encoding.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Reduce lr
# Multiply the learning rate by 0.2 after 5 epochs without val_loss
# improvement, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=1e-6)
# Train
history = model.fit(X_train, y_train, epochs=50, batch_size=32,
                    validation_split=0.1, callbacks=[reduce_lr, WandbCallback()])

In [None]:
# # View plots
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.show()

# # Plot training & validation loss values
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.show()

In [None]:
# View loss
# Score the trained network on the held-out test split.

# y_test_p = np.argmax(model.predict(X_test), axis=-1)
y_test_p = model.predict(X_test)

# Evaluate against the true labels. The previous call passed y_test_p, i.e. it
# scored the model against its own predictions, which does not measure test
# performance.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print("Test loss:", test_loss)
print("Test accuracy:", test_acc)

In [None]:
# # Prediction
# # y_test_p = np.argmax(model.predict(X_test), axis=-1)

# print(f"Classification report:\n"
#       f"{classification_report(y_test, y_test_p)}\n")

##### Export model for deployment

In [None]:
# Destination of the serialised network; despite the name this is a single
# HDF5 file path, not a directory.
model_dir = './best_model.h5'

# Persist the trained model for the deployment step.
model.save(filepath=model_dir)