In [35]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [36]:
import pandas as pd
from dont_patronize_me import DontPatronizeMe

In [37]:
dpm = DontPatronizeMe('.')
# This method loads the subtask 1 data
dpm.load_task1()
# which we can then access as a dataframe
dataset = dpm.train_task1_df
dataset.head()

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"we 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"in libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""white house press secretary sean spicer said ...",0,0
3,4,@@7811231,disabled,nz,council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" just like we received migrants fleeing el ...",0,0


In [43]:
dataset.label.value_counts()

0    9476
1     993
Name: label, dtype: int64

In [38]:
import numpy as np
import sentence_transformers

import seaborn as sns
from sklearn.preprocessing import MinMaxScaler , RobustScaler , StandardScaler

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import re

In [39]:
train = pd.read_csv("train_subset.csv")
valid = pd.read_csv("validation_subset.csv")

In [41]:
train.label.value_counts()

0    7581
1     794
Name: label, dtype: int64

In [42]:
valid.label.value_counts()

0    1895
1     199
Name: label, dtype: int64

In [6]:
def text_preprocessing(text, lemmatize):
    if not isinstance(text, str):
        text = text.decode('ISO-8859-1')
    
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    text = regrex_pattern.sub(r'',text)

    text = re.sub('[^a-zA-Z]', ' ', text) # Clear the special characters from our dataset
    text = text.lower() 
    text = text.split() 
    text = ' '.join(text)
    
    return text

train['cleaned'] = [text_preprocessing(doc, True) for doc in train.text]
valid['cleaned'] = [text_preprocessing(str(doc), True) for doc in valid.text]

In [7]:
y_train = train.label.to_numpy()
y_val = valid.label.to_numpy()

In [8]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

In [9]:
hub_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], output_shape=[512,16], 
  dtype=tf.string,trainable= True)

INFO:absl:Using C:\Users\Raluca\AppData\Local\Temp\tfhub_modules to cache modules.


In [10]:
from tensorflow.keras.optimizers import Adam

In [33]:
from sklearn.metrics import f1_score

def get_model(dense_1 = 128, dense_2 = 64):
  model = tf.keras.models.Sequential([
  hub_layer,
  tf.keras.layers.Dense(dense_1, activation='relu'),
  tf.keras.layers.Dense(dense_2, activation='relu'),
  tf.keras.layers.Dense(1)
  ])
  return model

def fit_with(verbose, dense_1, dense_2, lr):

    # Create the model using a specified hyperparameters.
    model = get_model(int(dense_1), int(dense_2))

    # Train the model for a specified number of epochs.
    optimizer = Adam(learning_rate = lr)
    model.compile(loss = 'binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    # Train the model with the train dataset.
    model.fit(x = train.cleaned, y = y_train, epochs=7,
              batch_size=64, verbose=verbose)

        # Evaluate the model with the eval dataset.
    score = model.evaluate(x = valid.cleaned, y = y_val, steps=10, verbose=0)
    print('Loss:', score[0])
    print('Accuracy:', score[1])

    # Return the mae.

    return score[1]

In [34]:
from functools import partial

verbose = 1
fit_with_partial = partial(fit_with, verbose)

from bayes_opt import BayesianOptimization

# Bounded region of parameter space
pbounds = {'dense_1': (64, 256), 'dense_2': (16, 128), 'lr': (1e-4, 1e-2)}

optimizer = BayesianOptimization(
    f=fit_with_partial,
    pbounds=pbounds,
    verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=1,
)

optimizer.maximize(init_points=10, n_iter=10)


for i, res in enumerate(optimizer.res):
    print("Iteration {}: \n\t{}".format(i, res))

print(optimizer.max)

|   iter    |  target   |  dense_1  |  dense_2  |    lr     |
-------------------------------------------------------------
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 0.769253134727478
Accuracy: 0.8882521390914917
| [0m 1       [0m | [0m 0.8883  [0m | [0m 144.1   [0m | [0m 96.68   [0m | [0m 0.000101[0m |
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 1.465885877609253
Accuracy: 0.9049665927886963
| [95m 2       [0m | [95m 0.905   [0m | [95m 122.0   [0m | [95m 32.44   [0m | [95m 0.001014[0m |
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 1.465885877609253
Accuracy: 0.9049665927886963
| [0m 3       [0m | [0m 0.905   [0m | [0m 99.76   [0m | [0m 54.7    [0m | [0m 0.004028[0m |
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 1.465885877609253
Accuracy: 0.9049665927886963
| [0m 4       [0m | [0m 0.905   [0m | [0m 167.5   [0m | [0m 62.95 

Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 1.465885877609253
Accuracy: 0.9049665927886963
| [0m 10      [0m | [0m 0.905   [0m | [0m 71.5    [0m | [0m 35.02   [0m | [0m 0.008794[0m |
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 1.465885877609253
Accuracy: 0.9049665927886963
| [0m 11      [0m | [0m 0.905   [0m | [0m 248.8   [0m | [0m 52.28   [0m | [0m 0.009041[0m |
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 1.465885877609253
Accuracy: 0.9049665927886963
| [0m 12      [0m | [0m 0.905   [0m | [0m 190.0   [0m | [0m 16.0    [0m | [0m 0.01    [0m |
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 1.465885877609253
Accuracy: 0.9049665927886963
| [0m 13      [0m | [0m 0.905   [0m | [0m 235.4   [0m | [0m 16.0    [0m | [0m 0.01    [0m |
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 1.465885877609253
Accuracy: 0.9049665927886963
| [0m 14      [

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Loss: 0.6320094466209412
Accuracy: 0.8887296915054321
| [0m 20      [0m | [0m 0.8887  [0m | [0m 124.3   [0m | [0m 55.48   [0m | [0m 0.0001  [0m |
Iteration 0: 
	{'target': 0.8882521390914917, 'params': {'dense_1': 144.0682249028942, 'dense_2': 96.67634326552171, 'lr': 0.00010113231069171439}}
Iteration 1: 
	{'target': 0.9049665927886963, 'params': {'dense_1': 122.04785394531324, 'dense_2': 32.43665977151666, 'lr': 0.0010141520882110983}}
Iteration 2: 
	{'target': 0.9049665927886963, 'params': {'dense_1': 99.76196058451282, 'dense_2': 54.702801428821346, 'lr': 0.004027997994883633}}
Iteration 3: 
	{'target': 0.9049665927886963, 'params': {'dense_1': 167.45281292864453, 'dense_2': 62.94978561316902, 'lr': 0.00688367305392792}}
Iteration 4: 
	{'target': 0.88634192943573, 'params': {'dense_1': 103.25483194845134, 'dense_2': 114.34915287578589, 'lr': 0.00037113717265946903}}
Iteration 5: 
	{'target': 0.904966592