In [1]:
import numpy as np 
import pandas as pd 
import os
# Walk the local `data` directory and echo the path of every file found in it.
for folder, _subdirs, names in os.walk('data'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [2]:
import warnings
# Silence all Python warnings from here on (no category or module filter is given).
warnings.filterwarnings('ignore')
%config Completer.use_jedi = False 

In [3]:
# Load the three ';'-separated splits; the files carry no header row, so column names are supplied.
_read_opts = dict(header=None, sep=';', names=['Input', 'Sentiment'], encoding='utf-8')
df_train = pd.read_csv('./data/train.txt', **_read_opts)
df_test = pd.read_csv('./data/test.txt', **_read_opts)
df_val = pd.read_csv('./data/val.txt', **_read_opts)

In [4]:
# Stack the three splits into a single frame (it is re-split with a stratified split later).
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it each split
# keeps its own 0-based labels and the index contains duplicates.
df_full = pd.concat([df_train, df_test, df_val], axis=0, ignore_index=True)
df_full

#### Text preprocessing



In [5]:
# pip install text_hammer

In [6]:
import text_hammer as th

In [7]:
%%time

# Register `progress_apply` on pandas objects, rendered with the notebook progress bar.
# Imported from the public `tqdm.notebook` module; `tqdm._tqdm_notebook` is a deprecated
# private alias of it.
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

def _clean_text(value):
    """Normalise one raw value: lowercase it, then strip noise with text_hammer."""
    text = str(value).lower()
    text = th.cont_exp(text)  # you're -> you are; i'm -> i am
    text = th.remove_emails(text)
    text = th.remove_html_tags(text)
    text = th.remove_special_chars(text)
    text = th.remove_accented_chars(text)
    return text


def text_preprocessing(df, col_name):
    """Clean the text stored in ``df[col_name]`` and return the frame.

    Every value is converted to ``str`` and lowercased, contractions are expanded, and
    e-mail addresses, HTML tags, special characters and accented characters are removed,
    in that order. Stop-word removal and lemmatisation are deliberately not applied.

    All steps run in a single pass over the column (one progress bar) instead of one
    full pass per step; the per-value result is the same.

    Args:
        df: DataFrame holding the text column. It is modified in place.
        col_name: name of the column to clean.

    Returns:
        The same ``df`` object, with ``col_name`` overwritten by the cleaned text.

    Note:
        Relies on ``progress_apply``, so tqdm's pandas integration must be registered
        before this function is called.
    """
    df[col_name] = df[col_name].progress_apply(_clean_text)
    return df

In [8]:
# Clean the 'Input' column. text_preprocessing assigns into the frame it is given and returns it,
# so df_full itself now holds the cleaned text and df_cleaned refers to that same object.
df_cleaned = text_preprocessing(df_full,'Input')

In [9]:
# Rebind df_cleaned to an independent copy so the column edits below do not touch df_full.
df_cleaned = df_cleaned.copy()

In [10]:
# Whitespace-delimited word count of each cleaned sentence.
df_cleaned['num_words'] = df_cleaned['Input'].map(lambda text: len(text.split()))

In [11]:
# Store the labels as a pandas categorical so each distinct label has an integer code.
df_cleaned['Sentiment'] = df_cleaned['Sentiment'].astype('category')


In [12]:
# Show the Sentiment column after its conversion to categorical dtype.
df_cleaned.Sentiment

In [13]:
# Preview the integer code pandas assigns to each row's category.
df_cleaned.Sentiment.cat.codes

In [14]:
# Label -> integer id lookup for the six emotion classes.
# NOTE(review): not referenced again in the visible notebook; the ids actually used come from
# `.cat.codes`, which presumably yields this same alphabetical numbering — confirm.
encoded_dict  = {'anger':0,'fear':1, 'joy':2, 'love':3, 'sadness':4, 'surprise':5}

In [15]:
# Replace each label with its integer category code.
# Going through astype('category') keeps this cell re-runnable: on a second run the column
# already holds integers, and a bare `.cat` accessor would raise on a non-categorical column.
df_cleaned['Sentiment'] = df_cleaned['Sentiment'].astype('category').cat.codes
df_cleaned.Sentiment

In [16]:
# Longest sentence, in words.
# NOTE(review): presumably the basis for the 70-token limit used when tokenizing — confirm.
df_cleaned.num_words.max()

In [17]:
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed, stratified so both parts keep the label proportions.
data_train, data_test = train_test_split(
    df_cleaned,
    test_size=0.3,
    random_state=42,
    stratify=df_cleaned['Sentiment'],
)

In [18]:
# Row/column count of the training part (70% of the rows, given test_size=0.3).
data_train.shape

In [19]:
# Row/column count of the held-out part (the remaining 30% of the rows).
data_test.shape

In [20]:
from tensorflow.keras.utils import to_categorical

In [21]:
# Preview the one-hot encoded training labels (the width is inferred from the largest label value).
to_categorical(data_train.Sentiment)

In [22]:
from transformers import AutoTokenizer,TFBertModel
# Tokenizer and encoder are loaded from the same checkpoint so their vocabularies agree.
# NOTE(review): the text was lowercased during preprocessing while this is the cased
# checkpoint — confirm that this combination is intended.
_bert_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(_bert_checkpoint)
bert = TFBertModel.from_pretrained(_bert_checkpoint)


In [23]:
# Save local copies of the tokenizer and the BERT model into the 'bert-tokenizer' and
# 'bert-model' directories so they can be loaded again later.
tokenizer.save_pretrained('bert-tokenizer')
bert.save_pretrained('bert-model')

In [24]:
import shutil
# Zip the saved tokenizer directory into bert-tokenizer.zip.
shutil.make_archive(base_name='bert-tokenizer', format='zip', root_dir='bert-tokenizer')

In [25]:
# Zip the saved model directory into bert-model.zip.
shutil.make_archive(base_name='bert-model', format='zip', root_dir='bert-model')

In [26]:
### DistilBERT could be used instead: it is lighter and cheaper, with similar performance.
# NOTE(review): `dbert_model` is only referenced from commented-out lines in the visible
# notebook, so this load appears to be unused — confirm before removing it.

from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel,DistilBertTokenizer,DistilBertConfig
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')


In [28]:
def _encode_texts(texts, max_length=70):
    """Tokenize ``texts`` with the notebook's BERT tokenizer into fixed-width TF tensors.

    Args:
        texts: list of strings to encode.
        max_length: exact token length of every returned row; longer inputs are
            truncated and shorter ones are padded up to it.

    Returns:
        The tokenizer output holding the ``input_ids`` and ``attention_mask`` tensors
        (token type ids are not requested).
    """
    return tokenizer(
        text=texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        # 'max_length' rather than True: True pads only to the longest sequence of the
        # current call, so the width would depend on the data and could differ between
        # the two splits or from the model's 70-wide input layers.
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True)


# Tokenize both splits (takes some time) with the bert-base-cased tokenizer.
x_train = _encode_texts(data_train.Input.tolist())
x_test = _encode_texts(data_test.Input.tolist())

In [29]:
# Inspect the token-id tensor produced for the held-out split.
x_test['input_ids']

In [30]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [31]:
import tensorflow as tf
# List the GPUs visible to TensorFlow; an empty list means none is available.
tf.config.experimental.list_physical_devices('GPU')

In [32]:
max_len = 70  # token length of every encoded example; must equal the tokenizer's padded width
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense

# Input names match the keys of the feature dicts passed to fit/predict.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# Index 0 of the BERT output is the last hidden state (index 1 would be the pooler output).
# DistilBERT alternative: embeddings = dbert_model(input_ids, attention_mask=input_mask)[0]
embeddings = bert(input_ids, attention_mask=input_mask)[0]

# Classification head: max-pool over the token positions, then two small dense layers.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)

# Emit raw logits (no activation). The loss is created with from_logits=True and applies
# softmax itself, so the former sigmoid output was squashed twice before reaching the loss.
# argmax over the logits still yields the predicted class; apply tf.nn.softmax to the
# predictions when class probabilities are needed.
y = Dense(6)(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# Fine-tune BERT together with the head (this requires a very small learning rate).
# The flag is set on the layer object itself rather than on a position in model.layers.
bert.trainable = True

In [33]:
# Adam configured for fine-tuning BERT.
optimizer = Adam(
    learning_rate=5e-05,  # this learning rate is for bert model, taken from huggingface website
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# from_logits=True: the loss applies softmax to the model output itself, so the model's
# last layer is expected to produce unnormalised scores.
loss = CategoricalCrossentropy(from_logits=True)
# Plain categorical accuracy; 'balanced_accuracy' is only the name it is logged under.
# (The original trailing comma silently made this a 1-tuple; compile now gets an explicit list.)
metric = CategoricalAccuracy('balanced_accuracy')

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric])

In [34]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [35]:
# Run tf.function-decorated code eagerly. The deprecated `experimental_run_functions_eagerly`
# alias set the same switch, so only the stable API call is kept.
# NOTE(review): eager execution presumably slows training — confirm it is still required.
tf.config.run_functions_eagerly(True)

#### Model fitting and evaluation

In [36]:
# Train for one epoch, validating on the held-out split.
# The one-hot targets are pinned to 6 columns so they always match the 6-unit output layer;
# without num_classes the width is inferred from the largest label present in each split.
train_history = model.fit(
    x={'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},
    y=to_categorical(data_train.Sentiment, num_classes=6),
    validation_data=(
        {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']},
        to_categorical(data_test.Sentiment, num_classes=6),
    ),
    epochs=1,
    batch_size=36,
)

In [37]:
# Persist only the trained weights; the same architecture must be rebuilt before loading them back.
model.save_weights('sentiment_weights.h5')

In [38]:
# max_len = 70
# import tensorflow as tf
# from tensorflow.keras.layers import Input, Dense

# input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
# input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# # embeddings = dbert_model(input_ids,attention_mask = input_mask)[0]


# embeddings = bert(input_ids,attention_mask = input_mask)[0] #(0 is the last hidden states,1 means pooler_output)
# out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
# out = Dense(128, activation='relu')(out)
# out = tf.keras.layers.Dropout(0.1)(out)
# out = Dense(32,activation = 'relu')(out)

# y = Dense(6,activation = 'sigmoid')(out)
    
# new_model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# new_model.layers[2].trainable = True
# # for training bert our lr must be so small

# new_model.load_weights('sentiment_weights.h5')

In [39]:
# Score the held-out split; the feature dict keys match the model's input layer names.
predicted_raw = model.predict(
    x={
        'input_ids': x_test['input_ids'],
        'attention_mask': x_test['attention_mask'],
    }
)

In [40]:
# Inspect the raw model output for the first held-out example (one score per class).
predicted_raw[0]

In [41]:
# Predicted class id = position of the highest score in each row.
y_predicted = predicted_raw.argmax(axis=1)

In [42]:
# Show the true integer labels of the held-out split, for comparison with y_predicted.
data_test.Sentiment

In [43]:
from sklearn.metrics import classification_report

In [44]:
# Per-class precision, recall and F1 on the held-out split.
report = classification_report(y_true=data_test.Sentiment, y_pred=y_predicted)
print(report)