Please note that the code in this notebook is extracted from this GitHub https://github.com/kothiyayogesh/medium-article-code/blob/master/How%20I%20dealt%20with%20Imbalanced%20text%20dataset/data_augmentation_using_language_translation.ipynb and modified to fit with the dataset and purpose of the assignment.

In [1]:
!pip install pandas



In [2]:
!pip install nltk



In [3]:
import pandas as pd
import nltk

In [4]:
file_name = 'Preprocessed_Twitter_data.csv'

In [5]:
df = pd.read_csv(file_name)

In [6]:
df.head()

Unnamed: 0,reply,class,source
0,@thatjohn Have they named the pilot?,query,#4U9525: Robin names Andreas Lubitz as the cop...
1,@thatjohn @mschenk,comment,#4U9525: Robin names Andreas Lubitz as the cop...
2,@tinkalee_12 @USATODAY @khjelmgaard #F4Phantom...,comment,RT @khjelmgaard: German media reporting #Andre...
3,@USATODAY @khjelmgaard ers a link to prove @An...,comment,RT @khjelmgaard: German media reporting #Andre...
4,@USATODAY @khjelmgaard Sure #GermanWings CoPil...,comment,RT @khjelmgaard: German media reporting #Andre...


In [7]:
## Function for augmenting data using langauge translation
## Could not found free service for langauge translation, Use paid service like Azure, Google translator etc

from textblob import TextBlob
from textblob.translate import NotTranslated
import random
sr = random.SystemRandom()

language = ["es", "de", "fr", "ar", "te", "hi", "ja", "fa", "sq", "bg", "nl", "gu", "ig", "kk", "mt", "ps"]

def data_augmentation(message, language, aug_range=1):
    augmented_messages = []
    if hasattr(message, "decode"):
        message = message.decode("utf-8")

    for j in range(0,aug_range) :
        new_message = ""
        text = TextBlob(message)
        try:
            text = text.translate(to=sr.choice(language))   ## Converting to random langauge for meaningful variation
            text = text.translate(to="en")
        except NotTranslated:
            pass
        augmented_messages.append(str(text))

    return augmented_messages

In [8]:
## Dictionary for intent count
## Intent is column name
class_count = df['class'].value_counts().to_dict()

In [9]:
## Get max intent count to match other minority classes through data augmentation
import operator
max_class_count = max(class_count.items(), key=operator.itemgetter(1))[1]

In [10]:
## Loop to interate all messages
import numpy as np
import math
import tqdm
newdf = pd.DataFrame()
for reply_class, count in class_count.items() :
    count_diff = max_class_count - count    ## Difference to fill
    multiplication_count = math.ceil((count_diff)/count)  ## Multiplying a minority classes for multiplication_count times
    if (multiplication_count) :
        old_message_df = pd.DataFrame()
        new_message_df = pd.DataFrame()
        for source, message in tqdm.tqdm(zip(df[df["class"] == reply_class]["source"], df[df["class"] == reply_class]["reply"])):
            ## Extracting existing minority class batch
            dummy1 = pd.DataFrame({"source":[source], "reply":[message]})
            dummy1["class"] = reply_class
            old_message_df = old_message_df.append(dummy1)
            
            ## Creating new augmented batch from existing minority class
            new_messages = data_augmentation(message, language, multiplication_count)
            dummy2 = pd.DataFrame({"source":[source]*len(new_messages), "reply":new_messages})
            dummy2["class"] = reply_class
            new_message_df = new_message_df.append(dummy2)
        
        ## Select random data points from augmented data
        new_message_df=new_message_df.take(np.random.permutation(len(new_message_df))[:count_diff])
        
        ## Merge existing and augmented data points
        newdf = newdf.append([old_message_df,new_message_df])

    else :
        newdf = newdf.append(df[df["class"] == reply_class])

792it [34:27,  2.61s/it]
520it [30:10,  3.48s/it]
496it [29:57,  3.62s/it]


In [11]:
## Print count of all new data points
newdf['class'].value_counts()

comment    4445
support    4445
query      4445
deny       4445
Name: class, dtype: int64

In [12]:
newdf.head()

Unnamed: 0,reply,class,source
1,@thatjohn @mschenk,comment,#4U9525: Robin names Andreas Lubitz as the cop...
2,@tinkalee_12 @USATODAY @khjelmgaard #F4Phantom...,comment,RT @khjelmgaard: German media reporting #Andre...
3,@USATODAY @khjelmgaard ers a link to prove @An...,comment,RT @khjelmgaard: German media reporting #Andre...
4,@USATODAY @khjelmgaard Sure #GermanWings CoPil...,comment,RT @khjelmgaard: German media reporting #Andre...
5,@USATODAY @khjelmgaard #AndreasLubitz #GermanW...,comment,RT @khjelmgaard: German media reporting #Andre...


In [15]:
newdf.to_csv("Augmented_Twitter_data.csv", index=False)

Reference:

Kothiya, Y., 2019. medium-article-code/data_augmentation_using_language_translation.ipynb at master · kothiyayogesh/medium-article-code. [online] GitHub. Available at: <https://github.com/kothiyayogesh/medium-article-code/blob/master/How%20I%20dealt%20with%20Imbalanced%20text%20dataset/data_augmentation_using_language_translation.ipynb> [Accessed 15 October 2021].