In [1]:
import pandas as pd

### Read The data

In [2]:
# header=None: treat the first line of the file as data; columns get integer labels (0, 1, 2, ...)
df = pd.read_csv("Evaluation-dataset.csv",header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Tires where delivered to the garage of my choi...,garage service positive,ease of booking positive,,,,,,,,,,,,
1,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,,,,,,
2,Very easy to use and good value for money.,value for money positive,,,,,,,,,,,,,
3,Really easy and convenient to arrange,ease of booking positive,,,,,,,,,,,,,
4,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,


### Check the number of missing values in target columns

In [5]:
# Report the number of missing values in each label column (columns 1 through 14).
for col in range(1, 15):
    missing_count = df[col].isna().sum()
    print(f"i:{col} \n{missing_count}\n")

i:1 
2003

i:2 
5893

i:3 
8594

i:4 
9687

i:5 
10004

i:6 
10096

i:7 
10119

i:8 
10124

i:9 
10128

i:10 
10131

i:11 
10131

i:12 
10131

i:13 
10131

i:14 
10131



We see that target columns 10–14 each have 10131 missing values, i.e. only a single row holds an actual value in those columns.

In [6]:
df[11].value_counts()

11
call wait time positive    1
Name: count, dtype: int64

In [7]:
df[11][df[11]=='call wait time positive']

384    call wait time positive
Name: 11, dtype: object

In [8]:
df[10][df[10]=='incorrect tyres sent positive']

384    incorrect tyres sent positive
Name: 10, dtype: object

We see that entry 384 is the only entry causing target columns 10 - 14 to have that single value.

In [9]:
df.iloc[384]

0           test review, should be deleted
1             refund not actioned positive
2                refund timescale positive
3     mobile fitter didn't arrive positive
4                  change of time positive
5            discount not applied positive
6                tyre agedot code positive
7                  failed payment positive
8                     late notice positive
9                      facilities positive
10           incorrect tyres sent positive
11                 call wait time positive
12                         refund positive
13                       no stock positive
14                      balancing positive
Name: 384, dtype: object

We see that it has been mentioned that entry 384 is a test review and that it should be deleted

### Hence, we're deleting the target columns 10-14 and the test entry

In [10]:
# Remove the almost-empty label columns 10-14 together with the test review in row 384.
sparse_label_columns = list(range(10, 15))
df_1_copy = df.drop(index=384, columns=sparse_label_columns)
df_1_copy.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Tires where delivered to the garage of my choi...,garage service positive,ease of booking positive,,,,,,,
1,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,
2,Very easy to use and good value for money.,value for money positive,,,,,,,,
3,Really easy and convenient to arrange,ease of booking positive,,,,,,,,
4,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,


In [11]:
# Work on an independent copy so df_1_copy is not affected by the in-place edits made to df_1 below
df_1 = df_1_copy.copy()

### We're resetting the index after dropping the test entry 384

In [12]:
# drop=True discards the old index instead of keeping it as a column; df_1 is renumbered in place
df_1.reset_index(drop=True, inplace=True)
df_1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Tires where delivered to the garage of my choi...,garage service positive,ease of booking positive,,,,,,,
1,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,
2,Very easy to use and good value for money.,value for money positive,,,,,,,,
3,Really easy and convenient to arrange,ease of booking positive,,,,,,,,
4,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,


In [13]:
df_1.shape

(10131, 10)

### Creating column names for the data

In [14]:
# The first column holds the review text; the remaining nine hold its labels.
column_names = ['Review', *(f"Label_{n}" for n in range(1, 10))]

In [15]:
# Replace the integer column labels with the descriptive names from column_names
df_1.columns=column_names
df_1.head()

Unnamed: 0,Review,Label_1,Label_2,Label_3,Label_4,Label_5,Label_6,Label_7,Label_8,Label_9
0,Tires where delivered to the garage of my choi...,garage service positive,ease of booking positive,,,,,,,
1,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,
2,Very easy to use and good value for money.,value for money positive,,,,,,,,
3,Really easy and convenient to arrange,ease of booking positive,,,,,,,,
4,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,


In [16]:
# Labels-only copy of the data (the Review text column is removed)
df_1_new = df_1.drop(columns = ['Review']).copy()
df_1_new.head()

Unnamed: 0,Label_1,Label_2,Label_3,Label_4,Label_5,Label_6,Label_7,Label_8,Label_9
0,garage service positive,ease of booking positive,,,,,,,
1,garage service positive,value for money positive,,,,,,,
2,value for money positive,,,,,,,,
3,ease of booking positive,,,,,,,,
4,location positive,value for money positive,ease of booking positive,,,,,,


### Now, we will combine all the target labels into a single target label

* We are considering each entry after dropping the review so that only the target labels are considered. => drop(columns=['Review']).iloc[i]
* Then, we're dropping the NA values in that entry => dropna()
* Once the NA values are dropped, we're taking only the remaining values and converting them to a list. => values.tolist()


In [20]:
# Build one list of labels per review, skipping the NaN padding cells.
# The Label_* columns are selected by name (rather than "everything except Review") so the
# cell stays idempotent: re-running it after the 'Target' column has been added to df_1
# does not pull the Target lists into the result. The selection is also done once,
# instead of re-dropping 'Review' (a full copy of the frame) on every loop iteration.
label_frame = df_1.filter(regex=r"^Label_\d+$")
labels_list = [
    [label for label in row if pd.notna(label)]
    for row in label_frame.to_numpy()
]

#### We're now adding this single target column to the dataframe

In [21]:
# pd.Series(labels_list) carries a default 0..n-1 index and the assignment aligns on index,
# so this matches rows correctly only because df_1's index was reset earlier
df_1['Target']=pd.Series(labels_list)

In [22]:
# Keep only the review text and its combined label list; .copy() detaches df_2 from df_1
df_2 = df_1[['Review','Target']].copy()
df_2.head()

Unnamed: 0,Review,Target
0,Tires where delivered to the garage of my choi...,"[garage service positive, ease of booking posi..."
1,"Easy Tyre Selection Process, Competitive Prici...","[garage service positive, value for money posi..."
2,Very easy to use and good value for money.,[value for money positive]
3,Really easy and convenient to arrange,[ease of booking positive]
4,It was so easy to select tyre sizes and arrang...,"[location positive, value for money positive, ..."


In [23]:
df_2["Target"].explode().unique()

array(['garage service positive', 'ease of booking positive',
       'value for money positive', 'location positive',
       'length of fitting positive', 'ease of booking negative',
       'change of date negative', 'tyre quality positive',
       'garage service negative', 'wait time negative',
       'delivery punctuality positive', 'mobile fitter positive',
       'advisor/agent service positive', 'advisoragent service positive',
       'wait time positive', 'extra charges positive',
       'length of fitting negative', 'location negative',
       'value for money negative', 'damage negative',
       'mobile fitter negative', 'balancing positive',
       'facilities positive', nan, 'change of time negative',
       'extra charges negative', 'booking confusion negative',
       'late notice negative', 'delivery punctuality negative',
       'discounts positive', 'tyre quality negative',
       'change of date positive', 'advisoragent service negative',
       'call wait time negativ

### Collect all reviews whose Target list is empty

In [24]:
df_2.shape

(10131, 2)

In [25]:
# Gather every row whose Target list is empty, i.e. reviews without any label.
test = [
    df_2.iloc[pos]
    for pos in range(len(df_2['Target']))
    if len(df_2['Target'][pos]) == 0
]

In [26]:
# Each collected row is a Series named after its row label in df_2, so the new frame's
# index holds the df_2 row labels of the unlabeled reviews
test_df = pd.DataFrame(test)
# Remember those row labels before renumbering; they are used to drop the rows from df_2
test_indices = test_df.index.values.tolist()
test_df.reset_index(drop=True, inplace=True)
test_df

Unnamed: 0,Review,Target
0,No complaints. The price was competitive and t...,[]
1,V good,[]
2,Ease of use. Good selection. Local fitting arr...,[]
3,Tyres for Renault Master. Great choice of tyre...,[]
4,At your expense and loss. I buy the best of th...,[]
...,...,...
1998,Very happy with [REDACTED]. I would definitely...,[]
1999,new tyres that's it,[]
2000,Very simple and straightforward process. Would...,[]
2001,Excellent service from point of order to fitti...,[]


### Delete the unlabeled entries (collected in `test_df`) from the df_2 dataframe

In [27]:
df_2.shape

(10131, 2)

In [28]:
10131-2003

8128

In [29]:
# Remove the unlabeled reviews in place, then renumber the index so that positional
# (iloc) and label-based ([i]) lookups in the following cells refer to the same rows
df_2.drop(index=test_indices,inplace=True)
df_2.reset_index(drop=True,inplace=True)
df_2.shape

(8128, 2)

In [30]:
df_2.head()

Unnamed: 0,Review,Target
0,Tires where delivered to the garage of my choi...,"[garage service positive, ease of booking posi..."
1,"Easy Tyre Selection Process, Competitive Prici...","[garage service positive, value for money posi..."
2,Very easy to use and good value for money.,[value for money positive]
3,Really easy and convenient to arrange,[ease of booking positive]
4,It was so easy to select tyre sizes and arrang...,"[location positive, value for money positive, ..."


Checking if any empty labels are present in the df_2

In [31]:
# Sanity check: nothing is printed when every remaining review has at least one label.
empty_positions = [
    pos for pos in range(len(df_2['Target']))
    if len(df_2['Target'][pos]) == 0
]
for pos in empty_positions:
    print (df_2.iloc[pos])

In [32]:
len(df_2['Target'].explode().unique())

98

In [33]:
# Show the label lists of the first six reviews (fewer if the frame is shorter).
for pos in range(min(6, len(df_2['Target']))):
    print(df_2['Target'][pos])

['garage service positive', 'ease of booking positive']
['garage service positive', 'value for money positive']
['value for money positive']
['ease of booking positive']
['location positive', 'value for money positive', 'ease of booking positive']
['length of fitting positive', 'ease of booking positive', 'ease of booking negative']


### Correcting labels that are missing the `/` separator (e.g. `advisoragent` → `advisor/agent`)

In [34]:
def label_keys_correcter_method(column):
    """Restore the "/" that is missing from some label words, in place.

    Every label is split on whitespace and each word that exactly matches a
    known misspelling is replaced ("advisoragent" -> "advisor/agent",
    "agedot" -> "age/dot"); the words are then re-joined with single spaces.
    Labels that contain no misspelled word are left untouched.

    Parameters
    ----------
    column : iterable of list of str
        One list of labels per review, e.g. a pandas Series of lists or a
        plain list of lists. The inner lists are modified in place.

    Returns
    -------
    The same ``column`` object that was passed in.
    """
    label_keys_correcter = {"advisoragent":"advisor/agent",
                            'agedot':'age/dot'}
    # Iterate over the inner lists themselves and write back through them, rather than
    # through column[i]: on a Series, column[i] is a label lookup and would fail (or hit
    # the wrong row) whenever the index is not the default 0..n-1.
    for labels in column:
        for position, label in enumerate(labels):
            if not isinstance(label, str):
                continue  # e.g. NaN placeholders: nothing to correct
            words = label.split()
            # Whole-word comparison: a key that is merely a substring of a longer word
            # must not trigger a replacement (the old substring test did, and then
            # crashed in list.index).
            if not any(word in label_keys_correcter for word in words):
                continue
            # All misspelled words are fixed in one pass, so a label containing both
            # keys ends up fully corrected.
            labels[position] = " ".join(
                label_keys_correcter.get(word, word) for word in words
            )
    return column

In [35]:
# The correction is applied in place to the lists stored in df_2['Target'];
# the value displayed below is the same Series handed back by the function
label_keys_correcter_method(df_2['Target'])

0       [garage service positive, ease of booking posi...
1       [garage service positive, value for money posi...
2                              [value for money positive]
3                              [ease of booking positive]
4       [location positive, value for money positive, ...
                              ...                        
8123     [tyre quality positive, garage service positive]
8124                           [ease of booking positive]
8125    [refund positive, delivery punctuality positiv...
8126                         [length of fitting positive]
8127    [location positive, delivery punctuality posit...
Name: Target, Length: 8128, dtype: object

In [36]:
df_2['Target'].explode().unique()

array(['garage service positive', 'ease of booking positive',
       'value for money positive', 'location positive',
       'length of fitting positive', 'ease of booking negative',
       'change of date negative', 'tyre quality positive',
       'garage service negative', 'wait time negative',
       'delivery punctuality positive', 'mobile fitter positive',
       'advisor/agent service positive', 'wait time positive',
       'extra charges positive', 'length of fitting negative',
       'location negative', 'value for money negative', 'damage negative',
       'mobile fitter negative', 'balancing positive',
       'facilities positive', 'change of time negative',
       'extra charges negative', 'booking confusion negative',
       'late notice negative', 'delivery punctuality negative',
       'discounts positive', 'tyre quality negative',
       'change of date positive', 'advisor/agent service negative',
       'call wait time negative', 'incorrect tyres sent positive',
       

In [37]:
len(df_2['Target'].explode().unique())

95

### Encoding target

In [38]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each review's label list into a 0/1 indicator row with one column per distinct label;
# the fitted binarizer is kept so predictions can be mapped back with inverse_transform
mlb = MultiLabelBinarizer()
encoded_labels=mlb.fit_transform(df_2['Target'])

In [39]:
# Wrap the indicator matrix in a DataFrame whose columns are the label names learned by mlb
labels_=pd.DataFrame(encoded_labels, columns=mlb.classes_)

### Create Tokenizer

We are using the DistilBERT tokenizer that belongs to the `distilbert-base-uncased-finetuned-sst-2-english` checkpoint.

In [40]:
from transformers import AutoTokenizer

# Load the tokenizer of the same checkpoint that the model cell further down is built from
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

#### Tokenize the input features

In [41]:
# Review texts as a plain Python list, the form passed to the tokenizer below
reviews = df_2['Review'].to_list()

['Tires where delivered to the garage of my choice,the garage notified me when they had been delivered. A day and time was arranged with the garage and I went and had them fitted,a Hassel free experience.',
 'Easy Tyre Selection Process, Competitive Pricing and Excellent Fitting Service',
 'Very easy to use and good value for money.',
 'Really easy and convenient to arrange',
 'It was so easy to select tyre sizes and arrange local fitting. The prices were competitive',
 "service was excellent. Only slight downside was not knowing exact time at garage although the garage were so quick so I wasn't delayed",
 'User friendly Website. Competitive Prices. Good communications. Efficient service by ATS Euromaster.',
 'Excellent prices and service',
 "It was very straightforward and the garage was great. Hadn't even known about them before",
 'Use of local garage.',
 'Easy to use, also good price.',
 'Outstanding values for money and a friendly professional service',
 'Great price and easy to u

In [42]:
df_2['Review'].shape

(8128,)

In [164]:
reviews = df_2['Review'].to_list()
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
# Call the tokenizer directly: `batch_encode_plus` is documented as deprecated in favour
# of `__call__`, which accepts the same arguments for a list of texts.
# Every review is padded/truncated to exactly 128 tokens and returned as TensorFlow tensors;
# token_type_ids are not requested, so only input_ids and attention_mask are produced.
tokenized_reviews = tokenizer(reviews,
                              padding='max_length',
                              add_special_tokens=True,
                              max_length=128,
                              truncation=True,
                              return_attention_mask=True,
                              return_token_type_ids=False,
                              return_tensors='tf')


In [168]:
type(tokenized_reviews)

In [44]:
import tensorflow as tf
from transformers import TFAutoModel

# Convert the 0/1 label table to a float32 tensor for training.
# Note: this rebinds `labels_` (a DataFrame until now) to a tensor.
labels_ = tf.convert_to_tensor(labels_, dtype=tf.float32)

### To use GPUs on Colab

In [47]:
print("Number of GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Build a distribution strategy over the logical GPU devices; the model is created and
# trained inside strategy.scope() in the cells below
gpus = tf.config.list_logical_devices('GPU')
strategy = tf.distribute.MirroredStrategy(gpus)

Num GPUs Available:  1


In [118]:
# Direct handles to the two tokenizer outputs.
# NOTE(review): neither name is referenced again in the visible part of the notebook — confirm they are still needed
ids = tokenized_reviews['input_ids']
mask = tokenized_reviews['attention_mask']

### Creating dataset batches for train and validation

In [50]:
# Pair each review's tokenizer outputs (a dict keyed by input name) with its label row
dataset = tf.data.Dataset.from_tensor_slices((dict(tokenized_reviews), labels_))

In [51]:
# Shuffle once with a fixed seed, then group into batches of 32.
# reshuffle_each_iteration=False is essential: the train/validation split is made
# with take()/skip() on this dataset, and with the default (a new permutation on every
# iteration) each epoch and each predict() call would see a different partition, so
# validation examples would leak into training and prediction order would not be stable.
dataset = dataset.shuffle(buffer_size=len(reviews),
                          seed=42,
                          reshuffle_each_iteration=False).batch(32)

In [148]:
dataset

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 128), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 128), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 95), dtype=tf.float32, name=None))>

In [81]:
# The first 75% of the batches are used for training, the remaining batches for validation.
num_batches = len(dataset)
train_size = int(0.75 * num_batches)
train_dataset, val_dataset = dataset.take(train_size), dataset.skip(train_size)

In [121]:
len(dataset)

254

In [120]:
len(reviews)

8128

In [122]:
len(train_dataset)

190

In [119]:
train_size

190

In [123]:
len(val_dataset)

64

In [84]:
for sample in val_dataset.take(1):  # take(1) yields a single element of val_dataset, i.e. one batch of (inputs, labels)
    print(sample)

({'input_ids': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[  101,  6581,  3643, ...,     0,     0,     0],
       [  101,  2307,  3976, ...,     0,     0,     0],
       [  101,  2307, 24656, ...,     0,     0,     0],
       ...,
       [  101,  6581, 24656, ...,     0,     0,     0],
       [  101,  2204,  3325, ...,     0,     0,     0],
       [  101,  5166,  3737, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(32, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}, <tf.Tensor: shape=(32, 95), dtype=float32, numpy=
array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0.,

### Creating the DistilBert Model

In [85]:
class Subtheme_model(tf.keras.Model):
    """Pretrained transformer encoder with a sigmoid multi-label head.

    The first token's final hidden state is passed through dropout and a dense
    sigmoid layer, giving one independent probability per label.

    Parameters
    ----------
    num_labels : int
        Number of output units (one per label).
    model_name : str
        Checkpoint name handed to ``TFAutoModel.from_pretrained``.
    """

    def __init__(self,num_labels,model_name):
        super().__init__()
        self.bert = TFAutoModel.from_pretrained(model_name)
        self.drop = tf.keras.layers.Dropout(0.3)
        self.classifier = tf.keras.layers.Dense(num_labels, activation='sigmoid')

    def call(self, inputs, training=False):
        """Return per-label probabilities for a batch.

        ``inputs`` is a mapping with 'input_ids' and 'attention_mask'.
        ``training`` is supplied by Keras (True in fit, False in
        predict/evaluate) and controls whether dropout is applied; it was
        previously hard-coded to True, which kept dropout active at inference
        time and made predictions random.
        """
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        outputs = self.bert(input_ids, attention_mask=attention_mask, training=training)
        pooled_output = outputs.last_hidden_state[:, 0]  # Use the first token's embeddings as pooled output
        dropped_output = self.drop(pooled_output, training=training)
        return self.classifier(dropped_output)

In [86]:
# The model and optimizer are created inside the strategy scope.
with strategy.scope():
  # Size the output layer from the fitted binarizer instead of a hard-coded 95, so the
  # head always matches the width of the label tensor even if the label set changes.
  model = Subtheme_model(num_labels=len(mlb.classes_), model_name='distilbert-base-uncased-finetuned-sst-2-english')
  # Binary cross-entropy treats every label as an independent yes/no decision,
  # matching the sigmoid outputs of the classifier head.
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.BinaryAccuracy()]
  )

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [87]:
print("Length of train_dataset:", len(train_dataset))
print("Length of val_dataset:", len(val_dataset))

Length of train_dataset: 190
Length of val_dataset: 64


In [117]:
# Fine-tune for three epochs, evaluating on val_dataset after each epoch.
with strategy.scope():
  history = model.fit(train_dataset, validation_data=val_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [93]:
model.summary()

Model: "subtheme_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout_1 (Dropout)         multiple                  0         
                                                                 
 dense_1 (Dense)             multiple                  73055     
                                                                 
Total params: 66435935 (253.43 MB)
Trainable params: 66435935 (253.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [124]:
# Per-label sigmoid scores for every example in val_dataset.
# NOTE(review): val_dataset is iterated again here; the row order matches the labels only
# if the underlying shuffle does not produce a new permutation on each iteration — confirm
predictions = model.predict(val_dataset)



In [125]:
# A label counts as predicted when its score is strictly above the cut-off
threshold = 0.5
predicted_labels = (predictions > threshold).astype(int)

In [126]:
predicted_labels.shape

(2048, 95)

In [127]:
mlb.inverse_transform(predicted_labels)

[('delivery punctuality positive',),
 ('garage service positive', 'length of fitting positive'),
 ('wait time positive',),
 ('tyre quality positive',),
 ('garage service positive', 'value for money positive'),
 ('ease of booking negative',),
 ('value for money positive',),
 ('location positive', 'value for money positive'),
 ('ease of booking positive', 'location positive', 'value for money positive'),
 ('value for money positive',),
 ('advisor/agent service positive', 'value for money positive'),
 ('value for money positive',),
 ('value for money positive',),
 ('delivery punctuality positive',
  'ease of booking positive',
  'garage service positive',
  'value for money positive'),
 ('value for money positive',),
 ('ease of booking positive', 'location positive'),
 ('garage service positive', 'location positive', 'value for money positive'),
 ('incorrect tyres sent negative',),
 ('garage service positive', 'length of fitting positive'),
 ('advisor/agent service positive',),
 ('advisor