# Aspect-Based Sentiment Analysis

## Importing data

In [None]:
import pandas as pd

# Load the training set from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [2]:
df.shape

(4000, 3)

In [3]:
df.head(10)

Unnamed: 0,text,aspect,label
0,can you check whether its cancelled completely?,cancelled,1
1,cannot rely on both milk delivery and grocery ...,Milk,0
2,"I get no notification, however the app is real...",notification,0
3,"Love this app, but would love it even more if ...",view,1
4,it does not let me load a clip on the scene,load,0
5,"i love notion as a tool, but the mobile just t...","tool,",2
6,unlimited yearly at $216/year,yearly,1
7,"some times missing to deliver, some times diff...","deliver,",0
8,Would love the ability to “un-delete” deleted ...,tasks,1
9,please add this basic feature,Please,2


## Checking Null Values

In [4]:
df.isna().sum()

text      0
aspect    0
label     0
dtype: int64

In [5]:
# Class balance: number of rows per label value, ordered by label.
# value_counts() reports every label present in the column, so an unexpected
# label cannot be silently missed the way hard-coded `== 0 / 1 / 2` checks can.
df['label'].value_counts().sort_index()

1680
1294
1026


## Preprocessing the data

In [6]:

import re
import string

def remove_URL(text):
    """Return `text` with every `http://`, `https://` or `www.` link removed.

    A link is matched from its prefix up to the next whitespace character and
    is replaced by the empty string; the surrounding whitespace is left as is.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    """Return `text` with every character of `string.punctuation` deleted.

    Only the ASCII punctuation listed in `string.punctuation` is removed;
    other symbols (for example typographic quotes) are kept.
    """
    # Mapping an ordinal to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Removing punctuation and URLs

In [7]:
# Strip links first, then ASCII punctuation, from every review text.
df["text"] = df["text"].map(remove_URL).map(remove_punct)

In [8]:
df.head(10)

Unnamed: 0,text,aspect,label
0,can you check whether its cancelled completely,cancelled,1
1,cannot rely on both milk delivery and grocery ...,Milk,0
2,I get no notification however the app is reall...,notification,0
3,Love this app but would love it even more if G...,view,1
4,it does not let me load a clip on the scene,load,0
5,i love notion as a tool but the mobile just ta...,"tool,",2
6,unlimited yearly at 216year,yearly,1
7,some times missing to deliver some times diffe...,"deliver,",0
8,Would love the ability to “undelete” deleted t...,tasks,1
9,please add this basic feature,Please,2


## Removing Stopwords

In [9]:

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))


def remove_stopwords(s):
    """Lower-case a sentence, normalise it and drop English stopwords.

    The words 'not' and 'can' are kept even though they are stopwords.

    Args:
        s (str): Sentence to clean.

    Returns:
        str: The remaining tokens joined by single spaces, with no leading or
        trailing whitespace.
    """
    s = s.lower()
    # Change "'t" to " not"
    s = re.sub(r"\'t", " not", s)
    # Remove @name (only matched when it is followed by a whitespace character)
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Put spaces around punctuation, then blank out every character that is
    # not a word character, whitespace or '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'. Membership is tested against the
    # module-level `stop` set (built once) instead of calling
    # stopwords.words('english') and scanning the resulting list for every word.
    s = " ".join([word for word in s.split()
                  if word not in stop
                  or word in ('not', 'can')])
    # Collapse runs of whitespace and trim both ends
    s = re.sub(r'\s+', ' ', s).strip()

    return s

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Normalise each review and drop its stopwords.
df["text"] = df["text"].map(remove_stopwords)

In [11]:
df.head(10)

Unnamed: 0,text,aspect,label
0,can check whether cancelled completely,cancelled,1
1,cannot rely milk delivery grocery items,Milk,0
2,get notification however app really fine,notification,0
3,love app would love even gantt charts calendar...,view,1
4,not let load clip scene,load,0
5,love notion tool mobile takes way long load,"tool,",2
6,unlimited yearly 216year,yearly,1
7,times missing deliver times different orders d...,"deliver,",0
8,would love ability undelete deleted tasks app ...,tasks,1
9,please add basic feature,Please,2


In [12]:
# Build the model input as "<cleaned text> <aspect>".
# NOTE(review): this cell is not idempotent; running it twice appends the
# aspect twice, so re-run the notebook from the top instead.
df['text'] = df['text'] + (" " + df['aspect'])

In [None]:
# Model inputs as a plain Python list (one string per row).
X = list(df['text'])
# Preview a few entries only; echoing the whole list floods the notebook output.
X[:5]

In [None]:
# Targets as a plain Python list, in the same row order as the inputs.
y = list(df['label'])
# Preview a few entries only; echoing the whole list floods the notebook output.
y[:5]

## Splitting data for training and validation

In [15]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=20,
)

In [16]:
# Report how many examples ended up in each split.
print(f'Training Data : {len(X_train)}')
print(f'Validation Data : {len(X_val)}')

Training Data : 3600
Validation Data : 400


## Importing Transformers Pretrained Models

In [18]:
from transformers import TFAutoModel
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification

import tensorflow as tf

In [19]:
# Fast tokenizer matching the `distilbert-base-uncased` checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased',
)

In [20]:
# Tokenize both splits with identical settings. Each call pads to the longest
# sequence within its own split, so the two splits may end up with different lengths.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (X_train, X_val)
)

In [21]:
# Pair each split's tokenizer output (converted to a plain dict of features)
# with its labels; slicing yields one (features, label) element per example.
train_dataset, val_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    for encodings, labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
    )
)

In [22]:
# Pretrained DistilBERT with a classification head sized for the three labels (0, 1, 2).
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_projector', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

## Compiling the model

In [23]:
# Adam with a small learning rate for fine-tuning the pretrained weights.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-8)
# Use an explicit Keras loss instead of `model.compute_loss`: that attribute is
# a Transformers-internal hook whose name clashes with Keras' own
# `Model.compute_loss` in newer releases. Labels are integer class ids and the
# model head is expected to return logits, hence the sparse loss with
# `from_logits=True` and the sparse accuracy metric (still reported as 'accuracy').
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
)

## Training the model

In [25]:
# The datasets are batched here with `.batch(3)`, so the effective batch size
# is 3. `batch_size` must not be passed to `fit` for tf.data inputs (the old
# `batch_size=32` was never the real batch size), so it is dropped.
# Keeping the returned History lets later cells inspect the per-epoch metrics.
history = model.fit(
    train_dataset.shuffle(100).batch(3),
    epochs=3,
    validation_data=val_dataset.shuffle(100).batch(3),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1ce4f1fb7f0>

In [26]:
# Export the trained model with Keras' `Model.save` into the `absa_training` directory.
model.save('absa_training')





INFO:tensorflow:Assets written to: absa_training\assets


INFO:tensorflow:Assets written to: absa_training\assets


In [28]:
# Also store the model through the Transformers `save_pretrained` API under `absa`
# (presumably so it can be reloaded later with `from_pretrained("absa")` — confirm).
model.save_pretrained("absa")