## Dependencies

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

## Loading the data

In [105]:
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/dataset/train - train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/dataset/test - test.csv')
test_data

Unnamed: 0,text,aspect
0,improve your customer service and product avai...,Customer service
1,"functionality is great, almost as in desktop v...",mobile version
2,but it keeps starting from zoomed in and then ...,zoomed
3,hey marilyn thanks for your answer the soc2 ty...,Security
4,@delanovc @zoom @airtable @notionhq @calendly ...,apple
...,...,...
995,in a database where i have multiple views enab...,views
996,the mere act of opening the @notionhq tab make...,opening
997,i have shared it to web now and added the link...,web
998,why the members who regularly buy milk has to ...,service charge.


## Augmenting Data to Battle Overfitting

### Package Dependencies

In [24]:
!pip install gensim textblob googletrans



In [77]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
!pip install textaugment

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Augmentation

#### Synonym Replacement

In [106]:
from textaugment import EDA
augmenter = EDA()
aug_data = []

for text in train_data['text']:
  aug = augmenter.synonym_replacement(text)
  aug_data.append(aug)

## Processing

### Attaching necessary columns to Augmented Data

In [107]:
train_data2 = pd.DataFrame(aug_data)

train_data2 = pd.concat([train_data2, train_data['aspect']], axis=1)
train_data2 = pd.concat([train_data2, train_data['label']], axis=1)
train_data2.rename(columns={0: 'text'}, inplace=True)

Joining both Datasets together

In [108]:
train_data = pd.concat([train_data, train_data2], axis=0)

Shuffling DataFrame

In [109]:
train_data = train_data.sample(frac=1)
train_data.reset_index(drop=True, inplace=True)
train_data

Unnamed: 0,text,aspect,label
0,"better with the routine daily, weekly, monthly...",app,0
1,You can use a template or make it from scratch.,template,1
2,love using @notionhq as a tool to manage all m...,organise,2
3,but for a flying note i will use google keep i...,google keep,0
4,there is no customer support.,customer support.,0
...,...,...,...
7995,delight i did not subscribe,subscribe,0
7996,"hello, how can i cancel disembarrass account?",account?,1
7997,"timely delivery, flexibility",flexibility,2
7998,unlimited videos exports/month unlimited premi...,Premium,2


In [112]:
print('-----Missing Values-----')
print(train_data.isnull().sum(), '\n')
print('---------Classes---------')
print(train_data['label'].value_counts())
print('--------Duplicates--------')
print(train_data.duplicated().sum())
print('Dropping Duplicates...')
train_data.drop_duplicates(inplace=True)
train_data.duplicated().sum()

-----Missing Values-----
text      0
aspect    0
label     0
dtype: int64 

---------Classes---------
0    3360
1    2588
2    2052
Name: label, dtype: int64
--------Duplicate--------
86
Dropping Duplicates...


0

### Word Embeddings

#### Train

In [114]:
we_X_train = train_data['text'] + train_data['aspect']
y_train = train_data.label

In [115]:
tk = Tokenizer(len(y_train))
tk.fit_on_texts(we_X_train)
we_X_train = tk.texts_to_sequences(we_X_train)
we_X_train = pad_sequences(we_X_train, maxlen=32, truncating='post', padding='post')
we_X_train[0]

array([ 213,   22,    1, 2694,  180,  727,  430,   28,  816, 2059,    9,
          1,  226,   28, 3037,    8,   41,  396,    1,    5,  454,    5,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int32)

#### Test

In [116]:
X_test = test_data['text'] + test_data['aspect']
tk = Tokenizer(len(test_data))
tk.fit_on_texts(X_test)
X_test = tk.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=32, truncating='post', padding='post')
X_test[0]

array([278,  27,  77,  46,   4, 144,  46,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0], dtype=int32)

Dumping Sets

In [117]:
pickle.dump(we_X_train, open('/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/pickled files/WE_X_train.pkl', 'wb'))
pickle.dump(y_train, open('/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/pickled files/y_train.pkl', 'wb'))
pickle.dump(X_test, open('/content/drive/MyDrive/Colab Notebooks/Customer Feedback Classification/absa/notebooks/pickled files/X_test.pkl', 'wb'))