#### Libraries

In [6]:
import pickle
import re
import warnings
from gc import callbacks

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

import sklearn.preprocessing as preproc
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import text
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from scikeras.wrappers import KerasClassifier
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Input, Embedding, Conv1D, Dense, Dropout, MaxPooling1D
from tensorflow.keras.models import Model

#### Data preprocessing

#### Feature engineering

#### Dummy data to verify


In [7]:
# Read the train and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

_STOPWORD_CACHE = {}


def _english_stopwords():
    '''Return the NLTK English stop-word set, reading the corpus only once.'''
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE['english']


def clean_text(text):
    '''Text preprocessing/cleaning function.

    Lower-cases the text, replaces a fixed set of punctuation/symbol
    characters with spaces, drops English stop words and tokenises the
    remainder with NLTK's WordPunctTokenizer.

    Parameters
    ----------
    text : str or missing value
        Raw text of one row. Non-string values (None, NaN from an empty CSV
        cell, numbers) and blank strings are treated as "no text".

    Returns
    -------
    list of str
        The tokens of the cleaned text; an empty list when there is no text.
    '''
    # pandas yields float NaN for empty cells; calling .lower() on it would raise.
    if not isinstance(text, str) or not text.strip():
        return []

    # Convert to lower case
    text = text.lower()

    # Remove symbols, special characters
    text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)

    # Remove stop words (the set is cached instead of being rebuilt on every call)
    stops = _english_stopwords()
    kept = [w for w in text.split() if w not in stops]

    # Tokenize
    return nltk.WordPunctTokenizer().tokenize(" ".join(kept))

# Tokenise every document with clean_text.
train_df['Cleaned'] = list(map(clean_text, train_df.text))
test_df['Cleaned'] = list(map(clean_text, test_df.text))

# Token lists keep their own names so X_train / X_test are only ever the
# numeric feature matrices (re-running later cells stays unambiguous).
X_train_tokens, y_train = train_df['Cleaned'], train_df['label']
X_test_tokens, y_test = test_df['Cleaned'], test_df['label']

# Word2Vec is fitted on the training tokens only; test documents are embedded
# with the training vocabulary.
w2v_model = gensim.models.Word2Vec(X_train_tokens,
                                   vector_size=256,
                                   window=4,
                                   min_count=1)

words = set(w2v_model.wv.index_to_key)


def embed_documents(token_lists, wv, vocab):
    '''Embed tokenised documents with a trained word-vector model.

    Parameters
    ----------
    token_lists : iterable of list of str
        One token list per document (must support len()).
    wv : gensim KeyedVectors
        Trained word vectors, indexable by token.
    vocab : set of str
        Tokens known to `wv`; tokens outside it are skipped.

    Returns
    -------
    (list of np.ndarray, np.ndarray)
        The per-document word-vector arrays (an empty array for a document
        with no known token), and a (n_documents, vector_size) matrix holding
        each document's mean vector (all zeros for such a document).
    '''
    doc_matrices = []
    doc_means = np.zeros((len(token_lists), wv.vector_size), dtype=float)
    for row, tokens in enumerate(token_lists):
        vectors = [wv[tok] for tok in tokens if tok in vocab]
        # Kept as a plain list of arrays: documents have different lengths, and
        # wrapping ragged arrays in np.array raises on recent NumPy releases.
        doc_matrices.append(np.array(vectors))
        if vectors:
            doc_means[row] = np.mean(vectors, axis=0)
    return doc_matrices, doc_means


X_train_vec, X_train = embed_documents(X_train_tokens, w2v_model.wv, words)
X_test_vec, X_test = embed_documents(X_test_tokens, w2v_model.wv, words)


#### Machine learning 

In [3]:
# Logger
# Weights & Biases (wandb) is the experiment logger used by the cells below;
# WandbCallback is passed to Keras model.fit() in the deep-learning cells.

import wandb
from wandb.keras import WandbCallback

# Authenticate with W&B; the recorded output shows this prompting for an API key.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\j/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mwasabee[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [42]:
# Class names handed to the W&B classifier plots (reused by the later model cells).
labels=['0', '1']

# Logistic regression

param_log = {'solver':['lbfgs'],
              'C':[1, 5, 10, 50, 100]}

# Create model
logr_clf = LogisticRegression(max_iter=10000)

# 10 fold cv hyper-parameters tuning; refit=True retrains the best setting on all of X_train
clf_log = GridSearchCV(logr_clf,
                       param_grid=param_log,
                       cv=10,
                       scoring='accuracy',
                       refit=True)

# Fit
clf_log.fit(X_train, y_train)

print("Best parameters", clf_log.best_params_)
print("Best score", clf_log.best_score_)

# Predict on test
logr_y_pred = clf_log.predict(X_test)
logr_y_proba = clf_log.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test,logr_y_pred)))
print('Classification report\n', classification_report(y_test, logr_y_pred))

# Log the diagnostic plots to W&B. The run is named at creation time, and the
# refit best estimator is passed instead of the GridSearchCV wrapper so that
# attribute-based plots (coef_) inspect the fitted model and any refitting W&B
# does internally does not repeat the whole grid search.
wandb.init(project="Title classifier", name='Logistic regression', reinit=True)
wandb.sklearn.plot_classifier(clf_log.best_estimator_, X_train, X_test, y_train, y_test,
                              logr_y_pred, logr_y_proba, labels,
                              model_name='Logistic regression', feature_names=None)
wandb.finish()

Best parameters {'C': 100, 'solver': 'lbfgs'}
Best score 0.8933666961044059
Accuracy Score: 0.6666666666666666
Classification report
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.62      1.00      0.77         5

    accuracy                           0.67         9
   macro avg       0.81      0.62      0.58         9
weighted avg       0.79      0.67      0.61         9



[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Logistic regression.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


In [43]:

# Support vector classifier (RBF kernel)

param_svc = {'kernel':['rbf'],
              'C':[0.1, 1, 5, 10],
              'gamma': [0.01, 0.1, 1, 5]}

# Create model; probability=True is required for predict_proba below
svc_clf = SVC(probability=True)

# 10 fold cv hyper-parameters tuning; refit=True retrains the best setting on all of X_train
clf_svc = GridSearchCV(svc_clf,
                       param_grid=param_svc,
                       cv=10,
                       scoring='accuracy',
                       refit=True)
# Fit
clf_svc.fit(X_train, y_train)

print("Best parameters", clf_svc.best_params_)
print("Best score", clf_svc.best_score_)

# Predict on test
svc_y_pred = clf_svc.predict(X_test)
svc_y_proba = clf_svc.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test,svc_y_pred)))
print('Classification report\n', classification_report(y_test, svc_y_pred))

# Log the diagnostic plots to W&B. The run is named at creation time and the
# refit best estimator is passed instead of the GridSearchCV wrapper, so any
# refitting W&B does internally does not repeat the whole grid search.
wandb.init(project="Title classifier", name='Support vector', reinit=True)
wandb.sklearn.plot_classifier(clf_svc.best_estimator_, X_train, X_test, y_train, y_test,
                              svc_y_pred, svc_y_proba, labels,
                              model_name='Support vector', feature_names=None)
wandb.finish()

Best parameters {'C': 10, 'gamma': 5, 'kernel': 'rbf'}
Best score 0.9267272370451647
Accuracy Score: 0.7777777777777778
Classification report
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.71      1.00      0.83         5

    accuracy                           0.78         9
   macro avg       0.86      0.75      0.75         9
weighted avg       0.84      0.78      0.76         9



[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Support vector.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


In [None]:

# Random Forest

param_rf = {
    'n_estimators': [5, 10, 20, 50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [1,2,3,4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

# Fixed seed so the forest (and therefore the search result) is reproducible
rf_clf = RandomForestClassifier(random_state=41)

# 10 fold cv hyper-parameters tuning; refit=True retrains the best setting on all of X_train
clf_rf = GridSearchCV(estimator=rf_clf,
                      param_grid=param_rf,
                      scoring='accuracy',
                      cv=10,
                      refit=True)

clf_rf.fit(X_train, y_train)

print("Best parameters", clf_rf.best_params_)
print("Best score", clf_rf.best_score_)

# Predict on test
rfc_y_pred = clf_rf.predict(X_test)
rfc_y_proba = clf_rf.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test, rfc_y_pred)))
print("Classification report: \n", classification_report(y_test, rfc_y_pred))

# Log the diagnostic plots to W&B. The refit best estimator is passed instead
# of the GridSearchCV wrapper so feature_importances_ is visible to W&B and any
# refitting it does internally does not repeat the whole grid search.
wandb.init(project="Title classifier", name='Random forest', reinit=True)
wandb.sklearn.plot_classifier(clf_rf.best_estimator_, X_train, X_test, y_train, y_test,
                              rfc_y_pred, rfc_y_proba, labels,
                              model_name='Random forest', feature_names=None)
wandb.finish()

In [None]:

# Decision tree

param_dt = {'max_features': ['sqrt', 'log2'],
              'ccp_alpha': [0.5, 0.1, .01, .001],
              'max_depth' : [1,2,3,4,5,6,7,8,9],
              'criterion' :['gini', 'entropy']
             }

# max_features < n_features makes split selection random, so the tree is
# seeded (same seed as the random forest cell) to keep the search reproducible.
dt_clf = DecisionTreeClassifier(random_state=41)

# 10 fold cv hyper-parameters tuning; refit=True retrains the best setting on all of X_train
clf_dt = GridSearchCV(dt_clf,
                      param_grid=param_dt,
                      scoring='accuracy',
                      cv=10,
                      refit=True)

clf_dt.fit(X_train, y_train)

print("Best parameters", clf_dt.best_params_)
print("Best score", clf_dt.best_score_)

# Predict on test
dt_y_pred = clf_dt.predict(X_test)
dt_y_proba = clf_dt.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test, dt_y_pred)))
print("Classification report: \n", classification_report(y_test, dt_y_pred))

# Log the diagnostic plots to W&B. The refit best estimator is passed instead
# of the GridSearchCV wrapper so feature_importances_ is visible to W&B and any
# refitting it does internally does not repeat the whole grid search.
wandb.init(project="Title classifier", name='Decision tree', reinit=True)
wandb.sklearn.plot_classifier(clf_dt.best_estimator_, X_train, X_test, y_train, y_test,
                              dt_y_pred, dt_y_proba, labels,
                              model_name='Decision tree', feature_names=None)
wandb.finish()

In [None]:
# KNN

param_knn = {
    'n_neighbors': list(range(1,11)),
}

knn_clf = KNeighborsClassifier()

# 10 fold cv hyper-parameters tuning over the KNN estimator and its own grid
# (the search must not reuse the random-forest estimator / grid).
clf_knn = GridSearchCV(estimator=knn_clf,
                      param_grid=param_knn,
                      scoring='accuracy',
                      cv=10,
                      refit=True)

clf_knn.fit(X_train, y_train)

print("Best parameters", clf_knn.best_params_)
print("Best score", clf_knn.best_score_)

# Predict on test
knn_y_pred = clf_knn.predict(X_test)
knn_y_proba = clf_knn.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test, knn_y_pred)))
print("Classification report: \n", classification_report(y_test, knn_y_pred))

# Log the diagnostic plots to W&B. The refit best estimator is passed instead
# of the GridSearchCV wrapper so any refitting W&B does internally does not
# repeat the whole grid search.
wandb.init(project="Title classifier", name='KNN', reinit=True)
wandb.sklearn.plot_classifier(clf_knn.best_estimator_, X_train, X_test, y_train, y_test,
                              knn_y_pred, knn_y_proba, labels,
                              model_name='KNN', feature_names=None)
wandb.finish()

In [None]:
# XGB

param_xgb = {
        'n_estimators': list(range(20, 400, 20)),
        'learning_rate': [1e-1, 1e-2, 5e-3, 5e-4],
        'min_child_weight': [1, 3, 5, 7, 9],
        'gamma': [0.5, 1, 1.5, 2],
        'subsample': [0.5, 0.75, 1.0],
        'colsample_bytree': [0.5, 0.75, 1.0],
        'max_depth': [2, 3, 4, 5, 6, 7, 8]
        }

# random_state is the scikit-learn-API name for the seed
xgb_clf = XGBClassifier(objective='binary:logistic', random_state=41)

# The full grid is 19 * 4 * 5 * 4 * 3 * 3 * 7 = 95,760 combinations, i.e.
# 957,600 fits with 10-fold CV, which is not feasible to run. A seeded random
# sample of the same space is searched instead; raise n_iter for a wider search.
clf_xgb = RandomizedSearchCV(estimator=xgb_clf,
                             param_distributions=param_xgb,
                             n_iter=50,
                             scoring='accuracy',
                             cv=10,
                             refit=True,
                             random_state=41)

clf_xgb.fit(X_train, y_train)

print("Best parameters", clf_xgb.best_params_)
print("Best score", clf_xgb.best_score_)

# Predict on test
xgb_y_pred = clf_xgb.predict(X_test)
xgb_y_proba = clf_xgb.predict_proba(X_test)

print('Accuracy Score: ' + str(accuracy_score(y_test, xgb_y_pred)))
print("Classification report: \n", classification_report(y_test, xgb_y_pred))

# Log the diagnostic plots to W&B. The refit best estimator is passed instead
# of the search wrapper so feature_importances_ is visible to W&B and any
# refitting it does internally does not repeat the whole search.
wandb.init(project="Title classifier", name='XGB', reinit=True)
wandb.sklearn.plot_classifier(clf_xgb.best_estimator_, X_train, X_test, y_train, y_test,
                              xgb_y_pred, xgb_y_proba, labels,
                              model_name='XGB', feature_names=None)
wandb.finish()

#### Deep learning

##### Tuning

In [None]:
# Model constants shared by the CNN cells
embed_len = 512    # output_dim of the Embedding layer
max_tokens = 300   # length of each input sequence (model input shape)
max_words = 100    # input_dim of the Embedding layer
n_classes = 2      # units of the softmax output layer

# Grid sweep over conv filters ('f'), dense units ('neurons') and batch size ('bs'):
# 3 * 3 * 4 = 36 runs in total.
sweep_config = {
    'name': 'CNN model tuning',
    'method': 'grid',

    # Metric shown as the sweep objective; a grid search still runs every combination.
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},

    'parameters': {
        'neurons': {'values': [32, 64, 128]},
        'f': {'values': [64, 128, 256]},
        'bs': {'values': [8, 16, 32, 64]},
    },
}

# The sweep is registered in the same project as the other runs. No run is
# opened here: each sweep trial has to be its own run, started by the training
# function that the agent calls.
sweep_id = wandb.sweep(sweep_config, project="Title classifier")

In [45]:
def train():
    '''Train one CNN configuration; used as the job function of the W&B sweep agent.

    Starts a W&B run, reads the hyper-parameters `f` (conv filters), `neurons`
    (dense units), `bs` (batch size) and `epochs` from its config, builds and
    compiles the model, and fits it on the notebook-level X_train / y_train
    with a 10% validation split, logging through WandbCallback.

    Returns None.

    NOTE(review): the model expects integer token ids of shape (max_tokens,)
    with values below max_words, while the feature-engineering cell builds
    X_train from mean Word2Vec vectors — confirm the input actually fed here
    is a padded token-id matrix.
    '''
    # Defaults for a stand-alone call; values supplied by the sweep take precedence.
    configs = {
        'neurons': 32,
        'f': 64,
        'bs': 8,
        'epochs': 50,
    }

    # Every trial needs its own run so the sweep parameters are loaded into the
    # config; reinit=True closes a run that is still open in the notebook.
    run = wandb.init(project="Title classifier", config=configs, reinit=True)
    config = run.config

    model = keras.Sequential(
        [
            keras.Input(shape=(max_tokens, )),

            # The sequence length comes from the Input layer above
            layers.Embedding(input_dim=max_words,
                             output_dim=embed_len),

            # Block 1 (1-D layers take a single int for kernel/pool size)
            layers.Conv1D(config.f, kernel_size=3, activation="relu"),
            layers.MaxPooling1D(pool_size=3),
            layers.Dropout(0.3),

            # Block 2
            layers.Conv1D(config.f, kernel_size=3, activation="relu"),
            layers.MaxPooling1D(pool_size=3),
            layers.Dropout(0.3),

            # FC 1
            layers.Flatten(),
            layers.Dense(config.neurons, activation="relu"),

            # Output
            layers.Dense(n_classes, activation="softmax")
        ]
    )

    # Integer class labels with an n_classes-way softmax call for the sparse
    # categorical loss; binary_crossentropy expects a single sigmoid unit.
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=config.epochs, batch_size=config.bs,
              validation_split=0.1, callbacks=[WandbCallback()])
   

In [None]:
# Start a W&B agent for the sweep identified by sweep_id, with train() as its job function.
wandb.agent(sweep_id, function=train)

In [None]:
# Close the active W&B run, if there is one. wandb.finish() does nothing when
# no run is open, whereas wandb.run.finish() raises AttributeError in that case.
wandb.finish()

In [None]:
# # Create model

# embed_len = 512
# max_tokens = 300
# max_words = 100
# n_classes = 2

# def create_model(f, neuron):
    
#     model = keras.Sequential(
#         [
#             keras.Input(shape=(max_tokens, )),
            
#             layers.Embedding(input_dim=max_words, 
#                              output_dim=embed_len,  
#                              input_length=max_tokens),
            
#             # Block 1
#             layers.Conv1D(f, kernel_size=(3, 3), activation="relu"),
#             layers.MaxPooling1D(pool_size=(3, 3)),
#             layers.Dropout(0.3),                  
            
#             # Block 2
#             layers.Conv1D(f, kernel_size=(3, 3), activation="relu"),
#             layers.MaxPooling1D(pool_size=(3, 3)),
#             layers.Dropout(0.3),                
            
#             # FC 1
#             layers.Flatten(),                                     
#             layers.Dense(neuron, activation="relu"),             
            
#             # Output 
#             layers.Dense(n_classes, activation="softmax")     
#         ]
#     )
    
#     model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

#     return model

In [None]:
# model = KerasClassifier(build_fn=create_model,
#                         f=[128,256],
#                         epochs=[50],
#                         neuron=[32,64],
#                         batch_size=[32,64],)

# # Model params
# params={'batch_size':[32,64], 
#         'epochs':[50],
#         'neuron':[32,64],
#         'f':[128,256],
#         }

# # Grid search cv
# gs = GridSearchCV(estimator=model, 
#                   param_grid=params, 
#                   cv=10,
#                   refit=True)
# # Fit
# gs = gs.fit(X_train, y_train)

# print('Best acc:', gs.best_score_)
# print('Best params:', gs.best_params_)

# mean = gs.cv_results_['mean_test_score']
# stds = gs.cv_results_['std_test_score']
# params = gs.cv_results_['params']

# for mean, stdev, param in zip(mean, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))


##### Training best model

In [None]:
# Retrain with best parameters (256 conv filters, 64 dense units, batch size 32)
# NOTE(review): the model expects integer token ids of shape (max_tokens,) with
# values below max_words, while the feature-engineering cell builds X_train
# from mean Word2Vec vectors — confirm the input fed here is a padded
# token-id matrix.

wandb.init(project="Title classifier", name='Best params CNN model training', reinit=True)

model = keras.Sequential(
    [
        keras.Input(shape=(max_tokens, )),

        # The sequence length comes from the Input layer above
        layers.Embedding(input_dim=max_words,
                         output_dim=embed_len),

        # Block 1 (1-D layers take a single int for kernel/pool size)
        layers.Conv1D(256, kernel_size=3, activation="relu"),
        layers.MaxPooling1D(pool_size=3),
        layers.Dropout(0.3),

        # Block 2
        layers.Conv1D(256, kernel_size=3, activation="relu"),
        layers.MaxPooling1D(pool_size=3),
        layers.Dropout(0.3),

        # FC 1
        layers.Flatten(),
        layers.Dense(64, activation="relu"),

        # Output
        layers.Dense(n_classes, activation="softmax")
    ]
)

# View summary
model.summary()

# Integer class labels with an n_classes-way softmax call for the sparse
# categorical loss; binary_crossentropy expects a single sigmoid unit.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Reduce lr: multiply the learning rate by 0.2 after 5 epochs without val_loss improvement
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=1e-6)
# Train
history = model.fit(X_train, y_train, epochs=50, batch_size=32,
                    validation_split=0.1, callbacks=[reduce_lr, WandbCallback()])

wandb.finish()

In [None]:
# # View plots
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Model accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.show()

# # Plot training & validation loss values
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Model loss')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')
# plt.show()

In [None]:
# View loss

# Raw model outputs for the test set (one row of class probabilities per sample);
# take np.argmax(..., axis=-1) to turn them into class labels.
# y_test_p = np.argmax(model.predict(X_test), axis=-1)
y_test_p = model.predict(X_test)

# Score against the true labels; scoring against the model's own predictions
# would not measure anything.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print("Test loss:", test_loss)
print("Test accuracy:", test_acc)

In [None]:
# # Prediction
# # y_test_p = np.argmax(model.predict(X_test), axis=-1)

# print(f"Classification report:\n"
#       f"{classification_report(y_test, y_test_p)}\n")

##### Export model for deployment

In [None]:
# Path of the exported model file (a single .h5 file, despite the variable name).
model_dir = './best_model.h5'
# Persist the trained Keras model to that path.
model.save(model_dir)