In [23]:
%pip install emoji
%pip install spellchecker
%pip install nltk
%pip install contractions
%pip install gingerit

Collecting gingerit
  Downloading gingerit-0.0.0.1.tar.gz (966 bytes)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gingerit
  Building wheel for gingerit (setup.py) ... [?25l[?25hdone
  Created wheel for gingerit: filename=gingerit-0.0.0.1-py3-none-any.whl size=1305 sha256=f41829e631b14a955898b5342f9358229d9a37881d2501e0fedf30e509fb5797
  Stored in directory: /root/.cache/pip/wheels/94/4d/e8/4e9e60cc5892b405032e3d0f044da1f757240e945b4fd5c100
Successfully built gingerit
Installing collected packages: gingerit
Successfully installed gingerit-0.0.0.1


In [42]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import contractions
import re

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
corpus = [
    'Hey, can u plz tell me where’s my order??',

    'i didn’t receive my parcel yet!!!!',

    'Whr’s my ordr 😡😡',

    'delivery late af... i want refund now'
]

# Lowercasing: using list comprehension to lowercase the data by using the built in python function lower()
corpus = [text.lower() for text in corpus]
print(corpus)

# Expanding Contractions: this is to transform contracted words like can't, don't etc. Such words will be epanded to cannot, do not respectively.
corpus = [contractions.fix(text) for text in corpus]
print("Removed Contractions: ",corpus)

# Expanding abbreviations and removing emojis: created a map from the given data to expand slangs
abbreviation_map = {
    'u': 'you',
    'plz': 'please',
    'whr': 'where',
    'af': 'as fuck' # found the meaning from google.
}
def expand_abbreviations(text):
    words = re.findall(r'\b\w+\b', text)  # extract clean words which also removes emojis.
    expanded_words = [abbreviation_map.get(word, word) for word in words]
    return " ".join(expanded_words)

corpus = [expand_abbreviations(text) for text in corpus]
print("Expanded Abbreviations: ",corpus)

# Removing Punctuation
corpus = [text.translate(str.maketrans('', '', string.punctuation)) for text in corpus]
print("Removed Punctuations: ",corpus)

# Correcting Spells: using the textblob library missspelled words are fixed.
from textblob import TextBlob
corpus = [str(TextBlob(text).correct()) for text in corpus]
print("Spells Corrected: ",corpus)

# Tokenization
tokenized_corpus = [word_tokenize(text) for text in corpus]
print(tokenized_corpus)

# Removing StopWords: downloaded english stopwords from the nltk.corpus library are used to remove stopwords.
stop_words = set(stopwords.words('english'))
filtered_corpus = [[word for word in doc if word not in stop_words] for doc in tokenized_corpus]
print(filtered_corpus)


# Lemmatization: The WordNetLemmatizer module from nltk.stem is used to transform words into their base form. haven't used stemming as lemmatization is more accurate then stemming.
lemmatizer = WordNetLemmatizer()
lemmatized_corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in filtered_corpus]
print(lemmatized_corpus)

filtered_corpus = [' '.join(doc) for doc in filtered_corpus]
print("Last: ",filtered_corpus)

['hey, can u plz tell me where’s my order??', 'i didn’t receive my parcel yet!!!!', 'whr’s my ordr 😡😡', 'delivery late af... i want refund now']
Removed Contractions:  ['hey, can you plz tell me where is my order??', 'i did not receive my parcel yet!!!!', 'whr’s my ordr 😡😡', 'delivery late af... i want refund now']
Expanded Abbreviations:  ['hey can you please tell me where is my order', 'i did not receive my parcel yet', 'where s my ordr', 'delivery late as fuck i want refund now']
Removed Punctuations:  ['hey can you please tell me where is my order', 'i did not receive my parcel yet', 'where s my ordr', 'delivery late as fuck i want refund now']
Spells Corrected:  ['hey can you please tell me where is my order', 'i did not receive my parcel yet', 'where s my order', 'delivery late as fuck i want refund now']
[['hey', 'can', 'you', 'please', 'tell', 'me', 'where', 'is', 'my', 'order'], ['i', 'did', 'not', 'receive', 'my', 'parcel', 'yet'], ['where', 's', 'my', 'order'], ['delivery', 

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [54]:
# To preserve emotional intensity in the dataset we can keep the emojis and translate them into text so that the model can categorize different emotion.
# We can do this before expanding the abbreviations so that they are translated properly.
import emoji
corpus = [
    'Hey, can u plz tell me where’s my order?',

    'i didn’t receive my parcel yet!!!!',

    'Whr’s my ordr 😡😡',

    'delivery late af... i want refund now'
]
emoji_corpus = [emoji.demojize(doc, delimiters=(" <", "> ")) for doc in corpus]
print(emoji_corpus)

# We can also add tags to repetative punctuations like !!! -> <exclaim> and so on and keep the delimeters <> intact in the dataset
def tag_emotional_punctuation(text):
    text = re.sub(r'!{2,}', ' <exclaim> ', text)       # 2+ exclamation marks
    text = re.sub(r'\?{2,}', ' <question> ', text)     # 2+ question marks
    text = re.sub(r'\.{2,}', ' <pause> ', text)        # 2+ periods
    return text
corpus = [tag_emotional_punctuation(text) for text in emoji_corpus]

# emoji and punctuation tag delimeters (<>) are kept while removing punctuations.
preserve_punct = "<>"
remove_punct = ''.join([p for p in string.punctuation if p not in preserve_punct])
corpus = [text.translate(str.maketrans('', '', remove_punct)) for text in corpus]

print(corpus)

['Hey, can u plz tell me where’s my order?', 'i didn’t receive my parcel yet!!!!', 'Whr’s my ordr  <enraged_face>  <enraged_face> ', 'delivery late af... i want refund now']
['Hey can u plz tell me where’s my order', 'i didn’t receive my parcel yet <exclaim> ', 'Whr’s my ordr  <enragedface>  <enragedface> ', 'delivery late af <pause>  i want refund now']
