In [None]:
# Install the augmentation library used below (nlpaug) and transformers.
# NOTE(review): transformers is never imported directly in this notebook; presumably it is
# needed by nlpaug's BERT-based augmenter — confirm. Versions are not pinned.
!pip install nlpaug
!pip install transformers



In [None]:
# Install autocorrect, which provides the Speller used for spell correction. Version is not pinned.
!pip install autocorrect



In [None]:
from nlpaug.augmenter import word

import pandas as pd
import re
import nltk
import numpy as np
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources this notebook relies on; this must run before
# stopwords.words(...) is called below.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Shared helper objects: English spell corrector and WordNet lemmatizer.
spell = Speller(lang='en')
lemmatizer = WordNetLemmatizer()

# 'but' and negative contractions that are taken OUT of the stop-word set.
removal_positive = [
    'but',
    "aren't", "couldn't", "didn't", "doesn't", "don't",
    "hadn't", "hasn't", "haven't", "isn't",
    "mightn't", "mustn't", "needn't", "shan't", "shouldn't",
    "wasn't", "won't", "wouldn't",
]

# English stop words minus the entries listed above.
stop_words = set(stopwords.words('english')).difference(removal_positive)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime.
drive.mount("/content/drive")

# Root folder on Drive; every dataset/cache path in this notebook is built from it.
drive_url = "/content/drive/MyDrive/LSTM-GRU-data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sub-folder names (each with a trailing slash) that are concatenated onto drive_url.
dataset_target = "Dataset/"  # CSV datasets read and written by this notebook
img_target = "Img/"
model_checkpoint_target = "Model_checkpoint/"
preprocessed_target = "Preprocessed/"  # cached .npy arrays (augmented / cleaned text)
pretrained_target = "Pretrained/"

In [None]:
def reduce_lengthening(text: str) -> str:
    """
    Collapse runs of a repeated character down to two occurrences.

    Any character that appears three or more times in a row is shortened to exactly
    two copies; runs of one or two characters are left untouched.

    Parameters:
        text (str): Text that may contain elongated words.

    Returns:
        str: The text with every run of 3+ identical characters cut to 2.

    Example:
        >>> reduce_lengthening("sooo good!")
        'soo good!'
        >>> reduce_lengthening("coooool")
        'cool'
    """
    # Group 1 captures a single character; the back-reference with {2,} demands at
    # least two more copies of it immediately afterwards (three or more in total).
    return re.sub(r"(.)\1{2,}", r"\1\1", text)


def text_preprocess(doc: str) -> str:
    """
    Clean a raw document and return it as a single space-joined string of tokens.

    Steps, in order: lowercase; strip @mentions and #hashtags; strip links
    (``http...`` and ``www. ...``); strip digits; replace apostrophes with a space;
    tokenize with ``word_tokenize``; collapse lengthened characters; spell-correct;
    lemmatize; drop tokens of two characters or fewer.

    Notes:
        * Stop words are NOT removed by this function.
        * Because apostrophes are replaced by a space, contractions are split and
          their short tail is dropped by the length filter ("don't" -> "don").

    Parameters:
        doc (str): The input document to be preprocessed.

    Returns:
        str: The preprocessed text.

    Example:
        >>> text_preprocess("I don't feel good today")
        'don feel good today'
    """
    #Lowercasing all the letters
    temp = doc.lower()
    #Removing mentions and hashtags
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"#[A-Za-z0-9_]+", "", temp)
    #Removing links
    temp = re.sub(r"http\S+", "", temp)
    # The dot is escaped so only a literal "www." prefix is treated as a link; an
    # unescaped dot matches any character (even a space), which deleted ordinary
    # text such as "awww cute".
    temp = re.sub(r"www\.\S+", "", temp)
    #Removing numbers
    temp = re.sub(r"[0-9]", "", temp)
    #Replacing ' with a space
    temp = re.sub("'", " ", temp)

    #Tokenization
    tokens = word_tokenize(temp)
    #Fixing word lengthening
    tokens = [reduce_lengthening(w) for w in tokens]
    #Spell correction
    tokens = [spell(w) for w in tokens]
    #Lemmatization
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    #Removing short tokens (this also drops most stand-alone punctuation)
    tokens = [w for w in tokens if len(w) > 2]

    return " ".join(tokens)


# Quick sanity check of the cleaning pipeline on a sentence containing a contraction.
examples_test_cleaning_text = "I don't feel good today"

text_preprocess(examples_test_cleaning_text)

'don feel good today'

### Target: Make the number of positive samples the same as the number of negative samples:
#### Expect: Positive: `210000` rows
####         Negative: `210000` rows

In [None]:
# Load the cleaned emotion dataset from Drive and show the row count per label.
data = pd.read_csv(f"{drive_url}{dataset_target}cleaned_text_emotion.csv")
data['label'].value_counts()


0    218838
1    165154
Name: label, dtype: int64

In [None]:
# Split the dataset by label: 0 -> negative_data, 1 -> positive_data.
negative_data = data[data['label'].eq(0)]
positive_data = data[data['label'].eq(1)]

# Draw 33% of the positive rows; the fixed random_state makes the same rows
# come out on every run.
positive_data_sample = positive_data.sample(random_state=17, frac=0.33)
positive_data_sample['label'].value_counts()

1    54501
Name: label, dtype: int64

In [None]:
# Row count of the full positive subset.
positive_data['label'].value_counts()

1    165154
Name: label, dtype: int64

In [None]:
# Row count of the full negative subset.
negative_data['label'].value_counts()

0    218838
Name: label, dtype: int64

In [None]:
# Contextual word-embedding augmenter built on bert-base-uncased; with
# action="insert" it inserts new words into the input sentence.
augmentor = word.ContextualWordEmbsAug(
    action="insert",
    model_path="bert-base-uncased",
)

In [None]:
def augment(sent: str) -> str:
    """
    Run one sentence through the module-level ``augmentor`` and return the result as a string.

    Parameters:
        sent (str): The sentence to augment.

    Returns:
        str: The augmenter's output joined into a single string with no separator.
    """
    pieces = augmentor.augment(sent)
    # "".join leaves a plain str unchanged and concatenates a list of strings.
    # NOTE(review): presumably augment() yields a single sentence here — confirm,
    # since several results would be glued together without a separator.
    return "".join(pieces)

# Try the augmenter on a single sentence.
sample_text = "I don't feel good today"
augment(sample_text)

"i don'cold t even feel good or today"

In [None]:
# augmented = [augment(sent) for sent in positive_data_sample['text']]

In [None]:
# np.save(drive_url+preprocessed_target+"augmented.npy",augmented)

In [None]:
# Load the cached augmented sentences instead of regenerating them.
augmented = np.load(f"{drive_url}{preprocessed_target}augmented.npy")

In [None]:
# cleaned_augmented = [text_preprocess(sent) for sent in augmented]

In [None]:
# np.save(drive_url+preprocessed_target+"cleaned_augmented.npy",cleaned_augmented)

In [None]:
# Load the cached, already-preprocessed augmented sentences.
cleaned_augmented = np.load(f"{drive_url}{preprocessed_target}cleaned_augmented.npy")

In [None]:
# Replace the sampled rows' text with the cleaned augmented sentences.
# The cached array is assigned by position, so it must contain exactly one entry
# per sampled row; a cache built from a different sample would not line up.
if len(cleaned_augmented) != len(positive_data_sample):
    raise ValueError(
        f"cleaned_augmented has {len(cleaned_augmented)} entries but "
        f"positive_data_sample has {len(positive_data_sample)} rows; "
        "the cached .npy file does not match the current sample"
    )

# .assign returns a new frame instead of writing a column into a frame derived
# from `positive_data`, which can trigger pandas' SettingWithCopyWarning.
positive_data_sample = positive_data_sample.assign(text=cleaned_augmented)

In [None]:
# Stack the original positive rows with the augmented sample rows and renumber the index.
positive_augmented_data = pd.concat(
    [positive_data, positive_data_sample],
    ignore_index=True,
)
positive_augmented_data['label'].value_counts()

1    219655
Name: label, dtype: int64

In [None]:
# Combine the negative rows with the enlarged positive set and check the class balance.
augmented_data = pd.concat(
    [negative_data, positive_augmented_data],
    ignore_index=True,
)
augmented_data['label'].value_counts()

1    219655
0    218838
Name: label, dtype: int64

In [77]:
# Write the balanced dataset back to Drive.
# NOTE(review): to_csv writes the row index as an extra leading column by default;
# pass index=False if downstream readers do not expect that column.
augmented_data.to_csv(f"{drive_url}{dataset_target}processed_text_emotion.csv")