In [2]:
from tensorflow import keras
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification

In [3]:
import pandas as pd

test = pd.read_csv('test.csv')

In [4]:
test.shape

(1000, 2)

In [5]:
test.head(10)

Unnamed: 0,text,aspect
0,improve your customer service and product avai...,Customer service
1,"functionality is great, almost as in desktop v...",mobile version
2,but it keeps starting from zoomed in and then ...,zoomed
3,hey marilyn thanks for your answer the soc2 ty...,Security
4,@delanovc @zoom @airtable @notionhq @calendly ...,apple
5,Love the app but missing some features likes k...,tasks
6,"and secondly, have some customer care assistance!",customer care
7,and it says that while this is rendering you c...,rendering
8,this app is not.,app
9,# notion is an equal opportunity organization ...,organization


In [6]:
test['text'] = test['text'] + ' ' + test['aspect']

In [7]:

import re
import string

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
test["text"] = test.text.map(remove_URL) # map(lambda x: remove_URL(x))
test["text"] = test.text.map(remove_punct)

In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))


def remove_stopwords(s):
    s = s.lower()
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    # Isolate and remove punctuations except '?'
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()
    
    return s

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
test["text"] = test.text.map(remove_stopwords)

In [11]:
test_text = list(test['text'])
test_text[:10]

['improve customer service product availability customer service',
 'functionality great almost desktop version mobile version needs lot improvement esp speed mobile version',
 'keeps starting zoomed slowly zooming music plays zoomed',
 'hey marilyn thanks answer soc2 type 1 certification document can shared external parties security assessment copy report could explore security',
 'delanovc zoom airtable notionhq calendly apple x lulu would interesting ever expand apple',
 'love app missing features likes keeping backlog tasks tasks',
 'secondly customer care assistance customer care',
 'says rendering can duplicate project another dimension rendering',
 'app not app',
 'notion equal opportunity organization not allow discrimination based upon age ethnicity ancestry gender national origin disability race size religion sexual orientation socioeconomic background status prohibited applicable law organization']

In [12]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [13]:
test_encodings = tokenizer(test_text,
                                 truncation=True,
                                 padding=True)

In [14]:
print(test_encodings)

{'input_ids': [[101, 5335, 8013, 2326, 4031, 11343, 8013, 2326, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 15380, 2307, 2471, 15363, 2544, 4684, 2544, 3791, 2843, 7620, 9686, 2361, 3177, 4684, 2544, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [16]:
import tensorflow as tf

In [28]:
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings)
))

In [29]:
print(len(test_dataset))

1000


In [27]:
loaded_model = TFDistilBertForSequenceClassification.from_pretrained('absa_training')

Some layers from the model checkpoint at absa_training were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at absa_training and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
test_output = loaded_model.predict(test_dataset)[0]
print(test_output)

[[ 1.1324677  -1.3956447  -0.1848756 ]
 [ 2.8287299  -2.2925181  -1.0599262 ]
 [-0.45373788  2.7230892  -2.500064  ]
 ...
 [ 1.0552424  -2.023434    0.4134927 ]
 [ 1.0552424  -2.023434    0.4134927 ]
 [ 1.0552424  -2.023434    0.4134927 ]]


In [49]:
test_prediction = []
for i in test_output:
    test_prediction.append(tf.nn.softmax(i, axis=0).numpy())

In [47]:
def findclass(text):
    for i in range(0,3):
        if text[i] == max(text):
            return i

In [51]:
predictions = []

for text in test_prediction:
    predictions.append(findclass(text))

In [59]:
predictions

[0,
 0,
 1,
 2,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [60]:
labels_df = pd.DataFrame(predictions, columns = ['label'])  

In [67]:
test['label'] = predictions[:1000]

In [71]:
test.head(10)

Unnamed: 0,text,aspect,label
0,improve customer service product availability ...,Customer service,0
1,functionality great almost desktop version mob...,mobile version,0
2,keeps starting zoomed slowly zooming music pla...,zoomed,1
3,hey marilyn thanks answer soc2 type 1 certific...,Security,2
4,delanovc zoom airtable notionhq calendly apple...,apple,1
5,love app missing features likes keeping backlo...,tasks,2
6,secondly customer care assistance customer care,customer care,1
7,says rendering can duplicate project another d...,rendering,2
8,app not app,app,0
9,notion equal opportunity organization not allo...,organization,0


In [72]:
test.to_csv("test_results.csv")