## Dependencies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

## Loading the data

In [None]:
# Locations of the two CSV splits used by this notebook.
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/dataset/train - train.csv'
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/dataset/test - test.csv'

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# Preview the test split.
test_data

Unnamed: 0,text,aspect
0,improve your customer service and product avai...,Customer service
1,"functionality is great, almost as in desktop v...",mobile version
2,but it keeps starting from zoomed in and then ...,zoomed
3,hey marilyn thanks for your answer the soc2 ty...,Security
4,@delanovc @zoom @airtable @notionhq @calendly ...,apple
...,...,...
995,in a database where i have multiple views enab...,views
996,the mere act of opening the @notionhq tab make...,opening
997,i have shared it to web now and added the link...,web
998,why the members who regularly buy milk has to ...,service charge.


## Augmenting Data to Battle Overfitting

### Package Dependencies

In [None]:
!pip install gensim textblob googletrans

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.2 MB/s 
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.2 MB/s 
[?25hCollecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting hstspreload
  Downloading hstspreload-2021.12.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 15.3 MB/s 
[?25hCollecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 2.9 MB/s 
[?25hCollecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.9 MB/s 
[?25hCollecting hpack<4,>=3.0
  Downloading hpack-3.0.0-py2.py3-n

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
!pip install textaugment

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Collecting textaugment
  Downloading textaugment-1.3.4-py3-none-any.whl (16 kB)
Installing collected packages: textaugment
Successfully installed textaugment-1.3.4


### Augmentation

#### Synonym Replacement

In [None]:
from textaugment import EDA

# Produce one synonym-replaced variant per training sentence, keeping the
# original row order so the variants can be paired with aspect/label later.
augmenter = EDA()
aug_data = [augmenter.synonym_replacement(sentence) for sentence in train_data['text']]

## Processing

### Attaching necessary columns to Augmented Data

In [None]:
# Pair every augmented sentence with the aspect and label of the row it was
# generated from. The augmented text arrives as column 0 and is renamed to
# 'text' so the frame has the same columns as train_data.
train_data2 = (
    pd.concat([pd.DataFrame(aug_data), train_data[['aspect', 'label']]], axis=1)
    .rename(columns={0: 'text'})
)

Joining the original and augmented datasets together

In [None]:
# Stack the augmented rows underneath the original rows.
# NOTE: this rebinds train_data, so re-running the cell appends the augmented rows again.
train_data = pd.concat(objs=[train_data, train_data2], axis='index')

Shuffling DataFrame

In [None]:
# Shuffle all rows, then renumber them 0..n-1. A fixed random_state makes the
# row order (and everything derived from it later in the notebook) the same
# on every run.
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
train_data

Unnamed: 0,text,aspect,label
0,And Russian languages,Russian,1
1,"the account has been cancelled, but i would st...",refund,0
2,"It’s just me, yet I can’t get notice unless I ...",get notifications,0
3,I’d like to only view my highest priorities,priorities,1
4,i have looked there and the text does not have...,text,1
...,...,...,...
7907,the only one app you needed to direct your ide...,app,2
7908,you moldiness abolish delivery charges immedia...,delivery charges,0
7909,"i have blue pencil the account, you will need ...",account,1
7910,"For such a popular project management tool, th...",project,1


In [None]:
# Gather the three sanity-check figures first, then report them.
missing_per_column = train_data.isnull().sum()
class_counts = train_data['label'].value_counts()
duplicate_rows = train_data.duplicated().sum()

print('-----Missing Values-----')
print(missing_per_column, '\n')
print('---------Classes---------')
print(class_counts)
print('--------Duplicates--------')
print(duplicate_rows)

# Remove fully identical rows.
print('Dropping Duplicates...')
train_data = train_data.drop_duplicates()

# Displayed as the cell result: number of duplicate rows left after the drop.
train_data.duplicated().sum()

-----Missing Values-----
text      0
aspect    0
label     0
dtype: int64 

---------Classes---------
0    3360
1    2588
2    2052
Name: label, dtype: int64
--------Duplicates--------
88
Dropping Duplicates...


0

### Word Embeddings

#### Train

In [None]:
# Model input: the feedback text followed by its aspect term. The two columns
# are joined with a space; without it the last word of the text and the first
# word of the aspect fuse into a single token that the tokenizer cannot match.
we_X_train = train_data['text'] + ' ' + train_data['aspect']

# Target: the class label of each row.
y_train = train_data['label']

In [None]:
# Fit the word index on the training text.
# NOTE(review): num_words is set to the number of training rows, not to a
# vocabulary size — confirm this cap is intentional.
tk = Tokenizer(
    num_words=len(y_train),
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    split=' ',
)
tk.fit_on_texts(we_X_train)

# Convert each text to word ids, then cut or pad every sequence at the end so
# all rows have exactly 32 entries.
we_X_train = pad_sequences(
    tk.texts_to_sequences(we_X_train),
    maxlen=32,
    padding='post',
    truncating='post',
)

# Show the first encoded row.
we_X_train[0]

array([ 884, 3049, 2076,   10, 1635,   14, 1530,  105, 3050,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int32)

#### Test

In [None]:
# Feedback text followed by its aspect term, joined with a space so the last
# word of the text and the first word of the aspect stay separate tokens.
X_test = test_data['text'] + ' ' + test_data['aspect']

# Encode with `tk`, the tokenizer fitted in the Train cell above. Fitting a
# second tokenizer on the test text would assign different ids to the same
# words, so test sequences would not line up with the training vocabulary.
X_test = tk.texts_to_sequences(X_test)

# Cut or pad every sequence at the end to exactly 32 entries.
X_test = pad_sequences(X_test, maxlen=32, truncating='post', padding='post')

# Show the first encoded row.
X_test[0]

array([278,  27,  77,  46,   4, 144,  46,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0], dtype=int32)

Pickling the processed train and test sets to disk

In [None]:
# Persist the encoded training inputs, the training labels and the encoded
# test inputs. Each file is opened in a `with` block so the handle is flushed
# and closed as soon as the dump finishes.
pickle_targets = [
    (we_X_train, '/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/pickled files/WE_X_train.pkl'),
    (y_train, '/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/pickled files/y_train.pkl'),
    (X_test, '/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/pickled files/X_test.pkl'),
]

for obj, pickle_path in pickle_targets:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file)