# **1a_Cleaning/Preprocessing Slang and Chatword dataset**

# 1.) First approach - Cleaning/Preprocessing Slang/Chatword dataset

## Load Libraries

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

## Read datasets

*   [Chat / Internet Slang](https://www.kaggle.com/datasets/gowrishankarp/chat-slang-abbreviations-acronyms/code)
*   [Twitter Slang](https://www.kaggle.com/datasets/gogylogy/twitterslang/data)

In [None]:
# Data location -- Google Colab variant (uncomment to use):
# from google.colab import drive
# drive.mount('/content/drive/')
# path_data = "/content/drive/My Drive/NLP_PROJECT/data/Slang/"

# Data location -- local machine variant:
path_data = "D:/Google Drive/NLP_PROJECT/data/Slang/"

# Column names applied to every slang table loaded in this notebook.
slang_column_names = ["slang", "translation"]

In [None]:
# Load both Kaggle slang lists as raw text and stack them into one table.
#
# keep_default_na=False stops pandas from parsing literal entries such as
# "NA", "n/a", "null" or "nan" as missing values, and dtype=str keeps every
# cell a string. Truly empty cells arrive as "" and are removed later by the
# empty-string step of check_and_clean_slang_dataset().
# (Casting with .astype(str) after a default read would instead turn every
# missing value into the literal string "nan", which no later check detects.)
df_internet_slang = pd.read_csv(
    path_data + 'slang.csv',
    usecols=[1, 2],
    names=slang_column_names,
    skiprows=1,
    dtype=str,
    keep_default_na=False,
)
df_twitter_slang = pd.read_csv(
    path_data + 'twitterSlang.csv',
    names=slang_column_names,
    skiprows=1,
    dtype=str,
    keep_default_na=False,
)
df_slang = pd.concat([df_internet_slang, df_twitter_slang])

## Data Cleaning

The function **check_and_clean_slang_dataset(df)** and the further data cleaning & preprocessing functions below take our slang list as input and perform the following data cleaning tasks on it:

- Dealing with Missing Values
- Trimming Whitespaces
- Dealing with Empty Strings
- Converting text (slang, translation) to Lowercase
- Removing Duplicates
- Analyzing Translations (slang, translation) to define unique slang words










In [None]:
def check_and_clean_slang_dataset(df):
    """Clean a slang/translation table and print a diagnostic report.

    Steps, in order: drop rows with missing values, strip surrounding
    whitespace, drop rows with empty strings, lowercase both columns, drop
    exact (slang, translation) duplicates, then report slang words that have
    several translations, translations shared by several slang words, the
    longest slang words and the one-letter slang words.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with string columns ``slang`` and ``translation``.

    Returns
    -------
    tuple of three DataFrames
        ``(cleaned, slang_mutiple_translations, translation_mutiple_slang)``:
        the cleaned table sorted by ``slang`` with a fresh 0..n-1 index, the
        rows whose slang word has more than one translation, and the rows
        whose translation belongs to more than one slang word.
    """
    spacing_string = "-------------------------------------------------------------------------"
    pair_columns = ['slang', 'translation']

    # Work on a private copy: the column updates below would otherwise write
    # into the caller's frame (or trigger SettingWithCopy warnings on slices).
    df = df.copy()

    # DELETING MISSING VALUES
    print("Checking for missing values:")
    missing_counts = df.isnull().sum()
    missing_slang = missing_counts["slang"] != 0
    missing_translation = missing_counts["translation"] != 0
    if missing_slang or missing_translation:
        print("\tMissing slang: ", missing_slang)
        print("\tMissing translation: ", missing_translation)
        print("\tRemoving missing values...")
        df = df.dropna()
    else:
        print("\tNo missing values found.")
    print(spacing_string)

    # DELETING WHITESPACES AT THE BEGINNING AND END OF THE STRINGS
    print("Trimming whitespaces from both columns...")
    df = df.assign(
        slang=df["slang"].str.strip(),
        translation=df["translation"].str.strip(),
    )
    print(spacing_string)

    # DELETING EMPTY STRINGS
    print("Checking for empty strings:")
    empty_slang = df["slang"] == ""
    empty_translation = df["translation"] == ""
    missing_slang = bool(empty_slang.any())
    missing_translation = bool(empty_translation.any())
    if missing_slang or missing_translation:
        print("\tMissing slang: ", missing_slang)
        print("\tMissing translation: ", missing_translation)
        print("\tRemoving empty strings...")
        df = df[~(empty_slang | empty_translation)]
    else:
        print("\tNo empty strings found.")
    print(spacing_string)

    # MAKING ALL STRINGS LOWERCASE
    print("Making all strings lowercase...")
    df = df.assign(
        slang=df["slang"].str.lower(),
        translation=df["translation"].str.lower(),
    )
    print(spacing_string)

    # DELETING DUPLICATES
    print("Checking for duplicates:")
    duplicate_mask = df.duplicated(subset=pair_columns, keep=False)
    num_duplicates = duplicate_mask.sum()
    print("- number of duplicates before deletion: ", num_duplicates)
    if num_duplicates != 0:
        print("- example of duplicates: ")
        display(df[duplicate_mask].sort_values(by=['slang']).head(4))
        df = df.drop_duplicates(subset=pair_columns, keep='first')
        print("- number of duplicates after deletion: ", df.duplicated(subset=pair_columns, keep=False).sum())
    print(spacing_string)

    # CHECKING WHICH SLANG WORDS HAVE MULTIPLE TRANSLATIONS
    slang_mutiple_translations = df[df.duplicated(subset=['slang'], keep=False)].sort_values(by=['slang'])
    print("Number of unique slang words that have multiple translations: ", slang_mutiple_translations.drop_duplicates(subset=['slang']).shape[0])
    if len(slang_mutiple_translations) > 0:
        print("- examples:")
        display(slang_mutiple_translations)
        # what slang words have the highest amount of different translations?
        slang_counts = slang_mutiple_translations['slang'].value_counts()
        # sort_index() makes the order of tied entries deterministic
        most_frequent = slang_counts[slang_counts == slang_counts.max()].sort_index().index.values
        print("Most frequent slang words: ", most_frequent)
        display(df[df['slang'].isin(most_frequent)].sort_values(by=['slang']))
    # the separator closes the section whether or not anything was found
    print(spacing_string)

    # CHECKING WHICH TRANSLATIONS HAVE MULTIPLE CORRESPONDING SLANG WORDS
    translation_mutiple_slang = df[df.duplicated(subset=['translation'], keep=False)].sort_values(by=['translation'])
    print("Number of unique translations that have multiple corresponding slang words: ", translation_mutiple_slang.drop_duplicates(subset=['translation']).shape[0])
    if len(translation_mutiple_slang) > 0:
        print("- examples:")
        display(translation_mutiple_slang)
        # what translations have the most different corresponding slang words?
        translation_counts = translation_mutiple_slang['translation'].value_counts()
        most_frequent = translation_counts[translation_counts == translation_counts.max()].sort_index().index.values
        print("Most frequent translations: ", most_frequent)
        display(df[df['translation'].isin(most_frequent)].sort_values(by=['translation']))
    print(spacing_string)

    if df.empty:
        # max() of an empty column is NaN, which cannot be converted to int
        print("No rows left after cleaning; skipping the slang length checks.")
    else:
        slang_lengths = df['slang'].str.len()

        # CHECKING THE LONGEST SLANG WORD
        max_length = int(slang_lengths.max())
        print("Max length of slang word: ", max_length)
        print("Longest slang words with max length:")
        display(df[slang_lengths == max_length])
        print(spacing_string)

        # CHECKING FOR SLANG WORDS WITH LENGTH 1
        one_letter_slang = df[slang_lengths == 1]
        print("Number of slang words of length 1:", one_letter_slang.shape[0])
        display(one_letter_slang.head(10))

    # SORT BY SLANG
    df = df.sort_values(by=['slang']).reset_index(drop=True)

    # return dfs
    return df, slang_mutiple_translations, translation_mutiple_slang

## Further analysis of our two slang datasets

In [None]:
# CHECKING THE DATAFRAME
def check_df(df):
    """Print the number of rows of df, then show its head() and describe() tables."""
    print("Length of the dataset: ", len(df))
    sections = (
        ("Data frame head:", df.head),
        ("Data frame information:", df.describe),
    )
    # each table is built only after its caption has been printed
    for caption, build_table in sections:
        print(caption)
        display(build_table())

# Inspect the merged slang table before any cleaning is applied.
check_df(df_slang)

Length of the dataset:  3461
Data frame head:


Unnamed: 0,slang,translation
0,2day,today
1,2m2h,too much too handle
2,2moro,tomorrow
3,2nite,tonight
4,4eae,for ever and ever


Data frame information:


Unnamed: 0,slang,translation
count,3461,3461
unique,3268,3220
top,lol,i do not know
freq,4,5


In [None]:
# Clean the merged table; keep the cleaned frame plus the two diagnostic frames
# (slang words with several translations, translations with several slang words).
df_slang, df_slang_slang_mutiple_translations, df_slang_translation_mutiple_slang = check_and_clean_slang_dataset(df_slang)

Checking for missing values:
	No missing values found.
-------------------------------------------------------------------------
Trimming whitespaces from both columns...
-------------------------------------------------------------------------
Checking for empty strings:
	No empty strings found.
-------------------------------------------------------------------------
Making all strings lowercase...
-------------------------------------------------------------------------
Checking for duplicates:
- number of duplicates before deletion:  284
- example of duplicates: 


Unnamed: 0,slang,translation
6,aar,at any rate
283,aar,at any rate
7,aayf,as always your friend
287,aayf,as always your friend


- number of duplicates after deletion:  0
-------------------------------------------------------------------------
Number of unique slang words that have multiple translations:  54
- examples:


Unnamed: 0,slang,translation
5,aaf,as a friend
279,aaf,always and forever
12,adr,address
311,adr,all due respect
17,ama,ask me anything
...,...,...
263,wth,what the heck
3279,ygm,you get me you have got mail
271,ygm,you have got mail
97,yt,youtube


Most frequent slang words:  ['nm']


Unnamed: 0,slang,translation
180,nm,nothing much
181,nm,not much
1982,nm,never mind nothing much no message
89,nm,never mind


-------------------------------------------------------------------------
Number of unique translations that have multiple corresponding slang words:  83
- examples:


Unnamed: 0,slang,translation
0,ab,about
1,abt,about
281,aamof,as a matter of fact
376,amof,as a matter of fact
318,afaia,as far as i am aware
...,...,...
275,yw,you are welcome
3258,yaw,you are welcome
2958,uw,you are welcome
3287,yhbw,you have been warned


Most frequent translations:  ['because']


Unnamed: 0,slang,translation
32,b/c,because
12,coz,because
47,bc,because
49,cuz,because


-------------------------------------------------------------------------
Max length of slang word:  13
Longest slang words with max length:


Unnamed: 0,slang,translation
1150,iitywimiwhtky,if i tell you what it means i will have to kill you
1151,iitywtmwybmad,if i tell you what this means will you buy me a drink
2342,pmymhmmfswgad,pardon me you must have mistaken me for someone who gives a damn


-------------------------------------------------------------------------
Number of slang words of length 1: 5


Unnamed: 0,slang,translation
155,k,okay
1359,j,joking
3,b,be
32,u,you
80,r,are


In [None]:
# Preview the first ten rows whose translation is shared by several slang words.
display(df_slang_translation_mutiple_slang.head(10))

Unnamed: 0,slang,translation
0,ab,about
1,abt,about
281,aamof,as a matter of fact
376,amof,as a matter of fact
318,afaia,as far as i am aware
319,afaiaa,as far as i am aware
14,afaic,as far as i am concerned
320,afaiac,as far as i am concerned
12,coz,because
47,bc,because


In [None]:
# Preview the first twenty rows whose slang word has several translations.
display(df_slang_slang_mutiple_translations.head(20))

Unnamed: 0,slang,translation
5,aaf,as a friend
279,aaf,always and forever
12,adr,address
311,adr,all due respect
17,ama,ask me anything
368,ama,ask me anything against medical advice
24,atm,at the moment
437,atm,at the moment automated teller machine
39,bbs,be back soon
476,bbs,be back soon bulletin board system


## Observations & further preprocessing of our two slang datasets
After manual inspection of slang words with multiple meanings, we can see that there are some cases where the translation column contains multiple possible translations, e.g.:  
- **bg** = *bad game baby gangster*,
- **ama** = *ask me anything against medical advice*,
- **gac** = *guilty as charged get a clue*
- etc.  

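It would be quite time-consuming to go over 3k rows to check and split the translations manually.  
Thus, we decided to assume that the **number of letters in the slang word is equal to the number of words in the translation or less**. Concretely, a row is kept only if its translation is a single word, or if the first letters of the translation's words spell out the slang word exactly.  
This rule will ensure that:  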
- &nbsp;&nbsp; **a)** instead of having translation of *"bbs"* twice = *"be back soon"* or *"be back soon bulletin board system"*, we just have the first translation,  
- &nbsp;&nbsp; **b)** we still have two correct translations for *"bc"* = *"because"* and *"before christ"*,  
- &nbsp;&nbsp; **c)** messy columns like *"ht"* = *"hat tip or heard through"* or *"wth"* = *"what or where or who the hell"* will be removed,  
- &nbsp;&nbsp; **d)** we will not have to deal with cases like *"msg"* = *"message"* or *"message monosodium glutamate"*

Nevertheless, we are aware that this might result in the deletion of some slang words that do not directly follow the rule above. However, we believe that this is the best solution for now.

In [None]:
def clean_translations(df):
    """Split df into rows with a plausible single translation and the rest.

    A row is kept when its translation is a single word, or when the first
    letters of the translation's words spell out the slang word exactly
    (e.g. "brb" = "be right back"). Every other row is reported as deleted.

    The input frame is not modified: the helper columns are added to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with string columns ``slang`` and ``translation``.

    Returns
    -------
    tuple of two DataFrames
        ``(df_cleaned, df_deleted_rows)``. ``df_cleaned`` has the columns of
        ``df``. ``df_deleted_rows`` additionally carries the helper columns
        ``word_count_translation``, ``first_letters_translation``,
        ``first_letters_slang`` and ``same_letters``, which
        recover_some_deleted_rows() reads.
    """
    helper_columns = ['word_count_translation', 'first_letters_translation', 'first_letters_slang', 'same_letters']

    # Add the helper columns to a copy so the caller's frame keeps its shape.
    work = df.copy()
    work['word_count_translation'] = work['translation'].str.split().str.len()
    work['first_letters_translation'] = work['translation'].apply(lambda text: [word[0] for word in text.split()])
    work['first_letters_slang'] = work['slang'].apply(lambda slang: list(str(slang)))
    work['same_letters'] = work['first_letters_translation'] == work['first_letters_slang']

    # keep only the rows that have word_count_translation = 1 or same_letters = True
    keep = (work['word_count_translation'] == 1) | (work['same_letters'] == True)

    df_deleted_rows = work[~keep]
    print(len(df_deleted_rows), "rows were deleted, examples: ")
    display(df_deleted_rows.iloc[10:20])

    df_cleaned = work[keep].drop(columns=helper_columns)

    return df_cleaned, df_deleted_rows


# Apply the first-letter rule to the cleaned table; keep the rejected rows for inspection and recovery.
df_slang_cleaned, df_deleted_rows = clean_translations(df_slang)

755 rows were deleted, examples: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
72,agh,are not going to happen,5,"[a, n, g, t, h]","[a, g, h]",False
77,ai,artificial intelligence as if,4,"[a, i, a, i]","[a, i]",False
84,ainec,and it is not even close,6,"[a, i, i, n, e, c]","[a, i, n, e, c]",False
97,alg,are not life grand,4,"[a, n, l, g]","[a, l, g]",False
99,alkqn,almighty latin king and queen nation,6,"[a, l, k, a, q, n]","[a, l, k, q, n]",False
104,am,ante meridiem amplitude modulation,4,"[a, m, a, m]","[a, m]",False
105,ama,ask me anything against medical advice,6,"[a, m, a, a, m, a]","[a, m, a]",False
115,amof,as a matter of fact,5,"[a, a, m, o, f]","[a, m, o, f]",False
117,amt,alpha methyltryptamine,2,"[a, m]","[a, m, t]",False
126,anzac,australian and new zealand army corps,6,"[a, a, n, z, a, c]","[a, n, z, a, c]",False


Based on the output above, which shows the rows that are being deleted, we found a recurring pattern: a number of slang words that should not have been deleted.  

Some translations contain, for example, 'you are', while the slang word has no letter standing for 'are'.

In [None]:
# if translation column starts with 'you are' then drop 'a' from the first_letters_translation list at index 1
# if translation column starts with 'i am' then drop 'a' from the first_letters_translation list at index 1
# if translation column starts with 'do not' then drop 'n' from the first_letters_translation list at index 1
# if translation column starts with 'i will' then drop 'w' from the first_letters_translation list at index 1
# if translation column starts with 'i do not' then drop 'n' from the first_letters_translation list at index 2
def remove_letter(letter_list, index, letter):
    """Return a copy of letter_list without position index, if that position holds letter.

    The list passed in is never modified. When the list is too short or holds
    a different letter at ``index``, an unchanged copy is returned.
    """
    result = list(letter_list)
    if len(result) >= index + 1 and result[index] == letter:
        del result[index]
    return result


def recover_some_deleted_rows(df_deleted_rows, df):
    """Append to df the deleted rows that only failed because of a filler word.

    For translations starting with 'you are', 'i am', 'do not', 'i will' or
    'i do not', the first letter of the filler word ('are', 'am', 'not',
    'will') is dropped from the row's first-letter list; the row is recovered
    when the remaining letters spell the slang word. Rows whose translation is
    exactly 'i am' or 'you are' are always recovered.

    Neither input frame is modified.

    Parameters
    ----------
    df_deleted_rows : pandas.DataFrame
        Rejected rows as returned by clean_translations(), including the
        helper columns ``word_count_translation``,
        ``first_letters_translation``, ``first_letters_slang`` and
        ``same_letters``.
    df : pandas.DataFrame
        Cleaned table the recovered rows are appended to.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the recovered rows, without the helper columns.
    """
    helper_columns = ['word_count_translation', 'first_letters_translation', 'first_letters_slang', 'same_letters']
    phrases_to_save = ['you are', 'i am', 'do not', 'i will', 'i do not']
    letters_to_remove = ['a', 'a', 'n', 'w', 'n']
    letter_indices_to_remove = [1, 1, 1, 1, 2]

    # Collect the pieces and concatenate once at the end.
    recovered_frames = []
    for phrase, index, letter in zip(phrases_to_save, letter_indices_to_remove, letters_to_remove):
        # na=False: a missing translation simply does not match
        is_starting_with = df_deleted_rows['translation'].str.startswith(phrase, na=False)
        rows = df_deleted_rows.loc[is_starting_with].copy()
        # DataFrame.copy() does not copy the list objects stored in the cells;
        # remove_letter() returns new lists, so df_deleted_rows stays intact.
        rows['first_letters_translation'] = rows['first_letters_translation'].apply(lambda letter_list: remove_letter(letter_list, index, letter))
        rows['same_letters'] = (rows['first_letters_translation'] == rows['first_letters_slang']).astype(bool)
        recovered = rows[rows['same_letters']]
        print(f'phrase: \'{phrase}\', recovered rows: ')
        display(recovered)
        recovered_frames.append(recovered)

    # add back popular translation like 'im' = 'i am' and 'ur' = 'you are'
    is_i_am_or_you_are = df_deleted_rows['translation'].isin(['i am', 'you are'])
    recovered_frames.append(df_deleted_rows[is_i_am_or_you_are])

    # skip empty pieces so they cannot influence the result's dtypes
    non_empty_frames = [frame for frame in recovered_frames if len(frame) > 0]
    if not non_empty_frames:
        return df.copy()
    saved_rows_df = pd.concat(non_empty_frames).drop(columns=helper_columns)

    # add the saved rows to the cleaned table
    return pd.concat([df, saved_rows_df])

# Append the recoverable rows back to the cleaned table.
df_slang_cleaned = recover_some_deleted_rows(df_deleted_rows, df_slang_cleaned)

phrase: 'you are', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
3249,yk,you are kidding,3,"[y, k]","[y, k]",True
3270,ymw,you are most welcome,4,"[y, m, w]","[y, m, w]",True
3272,ynw,you are not welcome,4,"[y, n, w]","[y, n, w]",True
3277,yoyo,you are on your own,5,"[y, o, y, o]","[y, o, y, o]",True
3279,yqw,you are quite welcome,4,"[y, q, w]","[y, q, w]",True
3290,ysvw,you are so very welcome,5,"[y, s, v, w]","[y, s, v, w]",True
3294,ytb,you are the best,4,"[y, t, b]","[y, t, b]",True
3296,ytm,you are the man,4,"[y, t, m]","[y, t, m]",True
3298,ytmnd,you are the man now dog,6,"[y, t, m, n, d]","[y, t, m, n, d]",True
3299,yto,you are the one,4,"[y, t, o]","[y, t, o]",True


phrase: 'i am', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
877,ib,i am back,3,"[i, b]","[i, b]",True
964,ignb,i am going now bye,5,"[i, g, n, b]","[i, g, n, b]",True
990,iil,i am in love,4,"[i, i, l]","[i, i, l]",True
1004,ij,i am joking,3,"[i, j]","[i, j]",True
1008,ijk,i am just kidding,4,"[i, j, k]","[i, j, k]",True
1010,ijs,i am just saying,4,"[i, j, s]","[i, j, s]",True
1098,inb,i am not bothered,4,"[i, n, b]","[i, n, b]",True
1103,ins,i am not sure,4,"[i, n, s]","[i, n, s]",True
1109,ioab,i am on a boat,5,"[i, o, a, b]","[i, o, a, b]",True
1111,ioh,i am outta here,4,"[i, o, h]","[i, o, h]",True


phrase: 'do not', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
413,dln,do not look now,4,"[d, l, n]","[d, l, n]",True
415,dltbbb,do not let the bed bugs bite,7,"[d, l, t, b, b, b]","[d, l, t, b, b, b]",True
416,dltbgyd,do not let the bastards grind you down,8,"[d, l, t, b, g, y, d]","[d, l, t, b, g, y, d]",True
417,dltm,do not lie to me,5,"[d, l, t, m]","[d, l, t, m]",True
421,dmaf,do not make a fuss,5,"[d, m, a, f]","[d, m, a, f]",True
426,dmi,do not mention it,4,"[d, m, i]","[d, m, i]",True
427,dmiid,do not mind if i do,6,"[d, m, i, i, d]","[d, m, i, i, d]",True
428,dmm,do not mind me,4,"[d, m, m]","[d, m, m]",True
429,dmml,do not make me laugh,5,"[d, m, m, l]","[d, m, m, l]",True
436,dmy,do not mess yourself,4,"[d, m, y]","[d, m, y]",True


phrase: 'i will', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
881,ibb,i will be back,4,"[i, b, b]","[i, b, b]",True
890,ibw,i will be waiting,4,"[i, b, w]","[i, b, w]",True
962,igmc,i will get my coat,5,"[i, g, m, c]","[i, g, m, c]",True
1029,ikyp,i will keep you posted,5,"[i, k, y, p]","[i, k, y, p]",True
1046,ilyf,i will love you forever,5,"[i, l, y, f]","[i, l, y, f]",True
1048,ilyk,i will let you know,5,"[i, l, y, k]","[i, l, y, k]",True
1154,itai,i will think about it,5,"[i, t, a, i]","[i, t, a, i]",True
1177,ityl,i will tell you later,5,"[i, t, y, l]","[i, t, y, l]",True


phrase: 'i do not', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
912,idbi,i do not believe it,5,"[i, d, b, i]","[i, d, b, i]",True
913,idby,i do not believe you,5,"[i, d, b, y]","[i, d, b, y]",True
914,idc,i do not care,4,"[i, d, c]","[i, d, c]",True
916,idec,i do not even care,5,"[i, d, e, c]","[i, d, e, c]",True
917,idek,i do not even know,5,"[i, d, e, k]","[i, d, e, k]",True
918,ideky,i do not even know you,6,"[i, d, e, k, y]","[i, d, e, k, y]",True
919,idewtk,i do not even want to know,7,"[i, d, e, w, t, k]","[i, d, e, w, t, k]",True
920,idfg,i do not feel good,5,"[i, d, f, g]","[i, d, f, g]",True
921,idfli,i do not feel like it,6,"[i, d, f, l, i]","[i, d, f, l, i]",True
922,idgad,i do not give a damn,6,"[i, d, g, a, d]","[i, d, g, a, d]",True


In [None]:
# Run the cleaning/report function again on the table that now includes the
# recovered rows; it also re-sorts by slang and resets the index.
df_slang_cleaned, df_slang_cleaned_slang_mutiple_translations, df_slang_cleaned_translation_mutiple_slang = check_and_clean_slang_dataset(df_slang_cleaned)

Checking for missing values:
	No missing values found.
-------------------------------------------------------------------------
Trimming whitespaces from both columns...
-------------------------------------------------------------------------
Checking for empty strings:
	No empty strings found.
-------------------------------------------------------------------------
Making all strings lowercase...
-------------------------------------------------------------------------
Checking for duplicates:
- number of duplicates before deletion:  0
-------------------------------------------------------------------------
Number of unique slang words that have multiple translations:  25
- examples:


Unnamed: 0,slang,translation
8,aaf,as a friend
9,aaf,always and forever
45,adr,all due respect
46,adr,address
237,bc,before christ
238,bc,because
277,bgd,black gangster disciples
278,bgd,background
503,eod,end of discussion
504,eod,end of day


Most frequent slang words:  ['nm']


Unnamed: 0,slang,translation
1853,nm,never mind
1854,nm,nothing much
1855,nm,not much


-------------------------------------------------------------------------
Number of unique translations that have multiple corresponding slang words:  13
- examples:


Unnamed: 0,slang,translation
18,ab,about
27,abt,about
198,b/c,because
238,bc,because
386,coz,because
394,cuz,because
197,b-day,birthday
249,bday,birthday
499,em,email
500,eml,email


Most frequent translations:  ['because']


Unnamed: 0,slang,translation
198,b/c,because
238,bc,because
386,coz,because
394,cuz,because


-------------------------------------------------------------------------
Max length of slang word:  13
Longest slang words with max length:


Unnamed: 0,slang,translation
997,iitywimiwhtky,if i tell you what it means i will have to kill you
998,iitywtmwybmad,if i tell you what this means will you buy me a drink
2229,pmymhmmfswgad,pardon me you must have mistaken me for someone who gives a damn


-------------------------------------------------------------------------
Number of slang words of length 1: 5


Unnamed: 0,slang,translation
196,b,be
1209,j,joking
1290,k,okay
2333,r,are
2858,u,you


In [None]:
# additional step to check if there are any translations that are substrings of other translations for the same slang word
def check_substrings(df):
    """Display translations that are contained in another translation of the same slang word.

    For every row, its translation is compared with the other translations of
    the same slang word; each pair where the first is a proper substring of
    the second becomes one row (slang, translation, translation2) of the
    displayed table. Nothing is displayed when no such pair exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with string columns ``slang`` and ``translation``; further
        columns are ignored.
    """
    pairs = list(zip(df['slang'], df['translation']))

    # Group translations per slang word first, so each row is only compared
    # with rows of the same slang word instead of with every row of the table.
    translations_by_slang = {}
    for slang, translation in pairs:
        translations_by_slang.setdefault(slang, []).append(translation)

    matches = [
        (slang, translation, translation2)
        for slang, translation in pairs
        for translation2 in translations_by_slang[slang]
        if translation != translation2 and translation in translation2
    ]

    df_substrings = pd.DataFrame(matches, columns=['slang', 'translation', 'translation2'])
    if len(df_substrings) > 0:
        display(df_substrings)

# Look for near-duplicate translations (one contained in another) in the cleaned table.
check_substrings(df_slang_cleaned)

Unnamed: 0,slang,translation,translation2
0,lmao,laughing my ass of,laughing my ass off


In [None]:
# fixing a typo - drop the row of df_slang_cleaned whose translation is 'laughing my ass of'
is_not_typo = df_slang_cleaned['translation'].ne('laughing my ass of')
df_slang_cleaned = df_slang_cleaned[is_not_typo]

In [None]:
# sort values and save the cleaned dataset to csv
def sort_and_save(df, output_filename):
    """Write df, ordered by slang and re-indexed, to path_data + output_filename as CSV without the index column."""
    target = path_data + output_filename
    ordered = df.sort_values(by=['slang'])
    ordered = ordered.reset_index(drop=True)
    ordered.to_csv(target, index=False)

# Persist the result of the first approach.
sort_and_save(df_slang_cleaned, 'two_datasets_merged_slang_cleaned.csv')

# 2.) Second approach - Cleaning/Preprocessing Slang/Chatword dataset

## 2.1) Cleaning manually constructed dataset of slang words - **SMALL** dataset

Because of the issues we discovered in the first approach when merging the two messy Kaggle datasets (see above), we decided to construct a smaller list of slang words with high data quality.  

To achieve this, we collected the entries for our second, SMALL slang list from the following sources:
- [Twitter Slang](https://www.kaggle.com/datasets/gogylogy/twitterslang/data)
- [Common Chat Abbreviations](https://www.merriam-webster.com/wordplay/93-common-texting-abbreviations)
- asked ChatGPT to list the Top 50 common slang/chat words

Here we are dealing with a fully manually constructed list to prevent messy translations. Therefore, we did not need to use the function clean_translations().  

In [None]:
# Read the first sheet of the workbook and cast every cell to str.
df_m_slang_small = pd.read_excel(
    path_data + 'slanglist_cleaned.xlsx',
    sheet_name=0,
    names=slang_column_names,
).astype(str)

def clean_small_df(df):
    """Inspect, clean and save the small slang list; return the cleaned frame.

    Runs check_df() and check_and_clean_slang_dataset(), previews the two
    diagnostic frames when they are non-empty, runs check_substrings() and
    writes the result to 'm_slang_small_cleaned.csv' via sort_and_save().
    """
    check_df(df)
    cleaned, multi_translations, multi_slang = check_and_clean_slang_dataset(df)

    previews = (
        ("Same translations with multiple corresponding slang words:", multi_slang, 10),
        ("Same slang words with multiple translations:", multi_translations, 20),
    )
    for caption, frame, n_rows in previews:
        if len(frame) == 0:
            continue
        print(caption)
        display(frame.head(n_rows))

    check_substrings(cleaned)
    sort_and_save(cleaned, 'm_slang_small_cleaned.csv')
    return cleaned

# Run the full small-list pipeline (report, clean, substring check, save).
df_m_slang_small_cleaned = clean_small_df(df_m_slang_small)

Length of the dataset:  160
Data frame head:


Unnamed: 0,slang,translation
0,404,Not Found
1,2nite,tonight
2,4eva,forever
3,AFAIC,as far as I’m concerned
4,afaik,as far as i know


Data frame information:


Unnamed: 0,slang,translation
count,160,160
unique,160,153
top,404,because
freq,1,4


Checking for missing values:
	No missing values found.
-------------------------------------------------------------------------
Trimming whitespaces from both columns...
-------------------------------------------------------------------------
Checking for empty strings:
	No empty strings found.
-------------------------------------------------------------------------
Making all strings lowercase...
-------------------------------------------------------------------------
Checking for duplicates:
- number of duplicates before deletion:  2
- example of duplicates: 


Unnamed: 0,slang,translation
78,k,okay
79,k,okay


- number of duplicates after deletion:  0
-------------------------------------------------------------------------
Number of unique slang words that have multiple translations:  0
Number of unique translations that have multiple corresponding slang words:  7
- examples:


Unnamed: 0,slang,translation
10,b/c,because
16,bc,because
26,coz,because
29,cuz,because
17,bday,birthday
18,b-day,birthday
13,b4n,bye for now
20,bfn,bye for now
93,nm,never mind
99,nvm,never mind


Most frequent translations:  ['because']


Unnamed: 0,slang,translation
10,b/c,because
16,bc,because
26,coz,because
29,cuz,because


-------------------------------------------------------------------------
Max length of slang word:  6
Longest slang words with max length:


Unnamed: 0,slang,translation
97,nthing,nothing


-------------------------------------------------------------------------
Number of slang words of length 1: 3


Unnamed: 0,slang,translation
78,k,okay
118,r,are
150,u,you


Same translations with multiple corresponding slang words:


Unnamed: 0,slang,translation
10,b/c,because
16,bc,because
26,coz,because
29,cuz,because
17,bday,birthday
18,b-day,birthday
13,b4n,bye for now
20,bfn,bye for now
93,nm,never mind
99,nvm,never mind


## 2.2) Cleaning half manually constructed dataset of slang words - **LARGE** dataset

As stated above, we were facing data quality issues with our first approach. Consequently, we were forced to re-clean the large dataset "[Twitter Slang](https://www.kaggle.com/datasets/gogylogy/twitterslang/data)" as well.

Therefore, we deleted very long slang words and translations, as the chance of them appearing in our Twitter or Amazon dataset is practically zero. Additionally, we discovered that certain slang/translation pairs listed multiple translation options, for example (wth, what or where or who). So we decided to manually inspect translations which contained:

- **"or"**
- **"," (comma)**
- **"-" (Hyphen)**
- etc.

Furthermore, we discovered that we need unique slang words to improve our performance, which means that two slang-translation pairs like (nm, nevermind) vs. (nm, not much) had to be inspected manually to remove one of them. Last but not least, we applied a similar strategy as in the first approach using the function **check_and_clean_slang_dataset(df)**, as you can see in the code below.

In [None]:
# Load the second sheet, rename its two columns and cast every cell to str.
df_m_slang_big = pd.read_excel(
    path_data + 'slanglist_cleaned.xlsx',
    sheet_name=1,
    usecols=['Slang', 'Text/Word'],
)
df_m_slang_big.columns = slang_column_names
df_m_slang_big = df_m_slang_big.astype(str)

# First pass: report and basic cleaning.
check_df(df_m_slang_big)
(
    df_m_slang_big,
    df_m_slang_big_slang_mutiple_translations,
    df_m_slang_big_translation_mutiple_slang,
) = check_and_clean_slang_dataset(df_m_slang_big)

if len(df_m_slang_big_translation_mutiple_slang):
    print("Same translations with multiple corresponding slang words:")
    display(df_m_slang_big_translation_mutiple_slang.head(10))
if len(df_m_slang_big_slang_mutiple_translations):
    print("Same slang words with multiple translations:")
    display(df_m_slang_big_slang_mutiple_translations.head(20))
print("-------------------------------------------------------------------------")

# Apply the first-letter rule, then re-add the recoverable rows.
df_m_slang_big_cleaned, df_m_slang_big_deleted_rows = clean_translations(df_m_slang_big)
df_m_slang_big_cleaned = recover_some_deleted_rows(
    df_m_slang_big_deleted_rows,
    df_m_slang_big_cleaned,
)

# Second pass on the filtered table, substring check, then save.
(
    df_m_slang_big_cleaned,
    df_m_slang_big_cleaned_slang_mutiple_translations,
    df_m_slang_big_cleaned_translation_mutiple_slang,
) = check_and_clean_slang_dataset(df_m_slang_big_cleaned)
check_substrings(df_m_slang_big_cleaned)
sort_and_save(df_m_slang_big_cleaned, 'm_slang_big_cleaned.csv')

Length of the dataset:  2772
Data frame head:


Unnamed: 0,slang,translation
0,toefl,test of english as foreign language
1,pdea,public display of elderly affection
2,ussr,union of soviet socialist republics
3,ioi,indication of interest i am over it
4,wiwt,what i wore today wish i was there


Data frame information:


Unnamed: 0,slang,translation
count,2772,2772
unique,2772,2706
top,toefl,because
freq,1,4


Checking for missing values:
	No missing values found.
-------------------------------------------------------------------------
Trimming whitespaces from both columns...
-------------------------------------------------------------------------
Checking for empty strings:
	No empty strings found.
-------------------------------------------------------------------------
Making all strings lowercase...
-------------------------------------------------------------------------
Checking for duplicates:
- number of duplicates before deletion:  0
-------------------------------------------------------------------------
Number of unique slang words that have multiple translations:  0
Number of unique translations that have multiple corresponding slang words:  64
- examples:


Unnamed: 0,slang,translation
948,aamof,as a matter of fact
973,amof,as a matter of fact
2726,bc,because
2720,b/c,because
2723,cuz,because
...,...,...
1255,wuwh,wish you were here
942,yvw,you are very welcome
878,yavw,you are very welcome
941,ybw,you have been warned


Most frequent translations:  ['because']


Unnamed: 0,slang,translation
2720,b/c,because
2722,coz,because
2723,cuz,because
2726,bc,because


-------------------------------------------------------------------------
Max length of slang word:  5
Longest slang words with max length:


Unnamed: 0,slang,translation
0,toefl,test of english as foreign language
5,bffwb,best friend forever with benefits
6,totus,teleprompter of the united states
24,eprom,electronically programmable rom
25,imaho,in my absolutely honest opinion
...,...,...
2644,srsly,seriously
2681,2moro,tomorrow
2682,2nite,tonight
2683,b-day,birthday


-------------------------------------------------------------------------
Number of slang words of length 1: 4


Unnamed: 0,slang,translation
2751,j,joking
2768,k,okay
2770,r,are
2771,u,you


Same translations with multiple corresponding slang words:


Unnamed: 0,slang,translation
948,aamof,as a matter of fact
973,amof,as a matter of fact
2726,bc,because
2720,b/c,because
2723,cuz,because
2722,coz,because
984,bmfl,best mates for life
983,bm4l,best mates for life
2683,b-day,birthday
2685,bday,birthday


-------------------------------------------------------------------------
530 rows were deleted, examples: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
70,ainec,and it is not even close,6,"[a, i, i, n, e, c]","[a, i, n, e, c]",False
83,alg,are not life grand,4,"[a, n, l, g]","[a, l, g]",False
95,amof,as a matter of fact,5,"[a, a, m, o, f]","[a, m, o, f]",False
97,amt,alpha methyltryptamine,2,"[a, m]","[a, m, t]",False
107,aok,age of kings all ok,5,"[a, o, k, a, o]","[a, o, k]",False
109,aon,all or nothing as of now,6,"[a, o, n, a, o, n]","[a, o, n]",False
111,aot,as opposed to among other things,6,"[a, o, t, a, o, t]","[a, o, t]",False
128,ark,act of random kindness,4,"[a, o, r, k]","[a, r, k]",False
137,ata,ait to air actual time of arrival,7,"[a, t, a, a, t, o, a]","[a, t, a]",False
150,att,at this time all the time,6,"[a, t, t, a, t, t]","[a, t, t]",False


phrase: 'you are', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
2723,yk,you are kidding,3,"[y, k]","[y, k]",True
2741,ynw,you are not welcome,4,"[y, n, w]","[y, n, w]",True
2746,yoyo,you are on your own,5,"[y, o, y, o]","[y, o, y, o]",True
2748,yqw,you are quite welcome,4,"[y, q, w]","[y, q, w]",True
2759,ysvw,you are so very welcome,5,"[y, s, v, w]","[y, s, v, w]",True
2763,ytm,you are the man,4,"[y, t, m]","[y, t, m]",True
2765,ytmnd,you are the man now dog,6,"[y, t, m, n, d]","[y, t, m, n, d]",True
2766,yto,you are the one,4,"[y, t, o]","[y, t, o]",True
2768,yvw,you are very welcome,4,"[y, v, w]","[y, v, w]",True


phrase: 'i am', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
746,ib,i am back,3,"[i, b]","[i, b]",True
817,ignb,i am going now bye,5,"[i, g, n, b]","[i, g, n, b]",True
843,iil,i am in love,4,"[i, i, l]","[i, i, l]",True
852,ij,i am joking,3,"[i, j]","[i, j]",True
856,ijk,i am just kidding,4,"[i, j, k]","[i, j, k]",True
858,ijs,i am just saying,4,"[i, j, s]","[i, j, s]",True
933,inb,i am not bothered,4,"[i, n, b]","[i, n, b]",True
937,ins,i am not sure,4,"[i, n, s]","[i, n, s]",True
942,ioab,i am on a boat,5,"[i, o, a, b]","[i, o, a, b]",True
944,ioh,i am outta here,4,"[i, o, h]","[i, o, h]",True


phrase: 'do not', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
346,dln,do not look now,4,"[d, l, n]","[d, l, n]",True
348,dltm,do not lie to me,5,"[d, l, t, m]","[d, l, t, m]",True
351,dmaf,do not make a fuss,5,"[d, m, a, f]","[d, m, a, f]",True
356,dmi,do not mention it,4,"[d, m, i]","[d, m, i]",True
357,dmiid,do not mind if i do,6,"[d, m, i, i, d]","[d, m, i, i, d]",True
358,dmm,do not mind me,4,"[d, m, m]","[d, m, m]",True
359,dmml,do not make me laugh,5,"[d, m, m, l]","[d, m, m, l]",True
366,dmy,do not mess yourself,4,"[d, m, y]","[d, m, y]",True
378,dntk,do not need to know,5,"[d, n, t, k]","[d, n, t, k]",True
395,dta,do not trust anyone,4,"[d, t, a]","[d, t, a]",True


phrase: 'i will', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
749,ibb,i will be back,4,"[i, b, b]","[i, b, b]",True
757,ibw,i will be waiting,4,"[i, b, w]","[i, b, w]",True
815,igmc,i will get my coat,5,"[i, g, m, c]","[i, g, m, c]",True
876,ikyp,i will keep you posted,5,"[i, k, y, p]","[i, k, y, p]",True
892,ilyf,i will love you forever,5,"[i, l, y, f]","[i, l, y, f]",True
894,ilyk,i will let you know,5,"[i, l, y, k]","[i, l, y, k]",True
978,itai,i will think about it,5,"[i, t, a, i]","[i, t, a, i]",True
1000,ityl,i will tell you later,5,"[i, t, y, l]","[i, t, y, l]",True


phrase: 'i do not', recovered rows: 


Unnamed: 0,slang,translation,word_count_translation,first_letters_translation,first_letters_slang,same_letters
772,idbi,i do not believe it,5,"[i, d, b, i]","[i, d, b, i]",True
773,idby,i do not believe you,5,"[i, d, b, y]","[i, d, b, y]",True
774,idc,i do not care,4,"[i, d, c]","[i, d, c]",True
775,idec,i do not even care,5,"[i, d, e, c]","[i, d, e, c]",True
776,idek,i do not even know,5,"[i, d, e, k]","[i, d, e, k]",True
777,ideky,i do not even know you,6,"[i, d, e, k, y]","[i, d, e, k, y]",True
778,idfg,i do not feel good,5,"[i, d, f, g]","[i, d, f, g]",True
779,idfli,i do not feel like it,6,"[i, d, f, l, i]","[i, d, f, l, i]",True
780,idgad,i do not give a damn,6,"[i, d, g, a, d]","[i, d, g, a, d]",True
782,idgi,i do not get it,5,"[i, d, g, i]","[i, d, g, i]",True


Checking for missing values:
	No missing values found.
-------------------------------------------------------------------------
Trimming whitespaces from both columns...
-------------------------------------------------------------------------
Checking for empty strings:
	No empty strings found.
-------------------------------------------------------------------------
Making all strings lowercase...
-------------------------------------------------------------------------
Checking for duplicates:
- number of duplicates before deletion:  0
-------------------------------------------------------------------------
Number of unique slang words that have multiple translations:  0
Number of unique translations that have multiple corresponding slang words:  9
- examples:


Unnamed: 0,slang,translation
163,b/c,because
198,bc,because
322,coz,because
330,cuz,because
162,b-day,birthday
209,bday,birthday
787,idk,i do not know
795,idnk,i do not know
1006,iwbb,i will be back
749,ibb,i will be back


Most frequent translations:  ['because']


Unnamed: 0,slang,translation
163,b/c,because
198,bc,because
322,coz,because
330,cuz,because


-------------------------------------------------------------------------
Max length of slang word:  5
Longest slang words with max length:


Unnamed: 0,slang,translation
2,2moro,tomorrow
3,2nite,tonight
11,aamof,as a matter of fact
34,adgth,all dogs go to heaven
46,afaik,as far as i know
...,...,...
779,idfli,i do not feel like it
780,idgad,i do not give a damn
783,idhac,i do not have a clue
784,idhtt,i do not have the time


-------------------------------------------------------------------------
Number of slang words of length 1: 4


Unnamed: 0,slang,translation
1029,j,joking
1099,k,okay
1970,r,are
2402,u,you
