In [26]:
import pandas as pd
import emoji
import emoji_vietnamese
import string

## Table of Contents
- [Load data](#load-data)
- [Remove null values](#remove-null-values)
- [Remove #ERROR! values](#remove-error-values)
- [Remove duplicates](#remove-duplicates)
- [Remove outliers](#remove-outliers)
- [Remove special characters](#remove-special-characters)
  - [URL](#url)
  - [Punctuation](#punctuation)
  - [Stopwords](#stopwords)
- [Lowercasing](#lowercasing)
- [Emoji processing](#emoji-processing)
- [Save cleaned data](#save-cleaned-data)

## Load data

In [27]:
# Read the training CSV into a DataFrame
train = pd.read_csv(filepath_or_buffer='../Dataset/vihsd/train.csv')

In [28]:
# Preview the first five rows
train.head(n=5)

Unnamed: 0,free_text,label_id
0,Em ƒë∆∞·ª£c l√†m fan c·ª©ng lu√¥n r·ªìi n√® ‚ù§Ô∏è reaction q...,0
1,ƒê√∫ng l√† b·ªçn m·∫Øt h√≠p l√≤ xo th·ª•t :))) b√™n vi·ªát n...,2
2,ƒê·∫≠u VƒÉn C∆∞·ªùng gi·ªù gi·ªëng th·∫±ng sida h∆°n √†,0
3,C√îN ƒê·ªí C·ª§C S√öC V√î NH√ÇN T√çNH ƒê·ªÄ NGHI VN. NH√Ä N∆Ø...,2
4,T·ª´ l√Ω thuy·∫øt ƒë·∫øn th·ª±c h√†nh l√† c·∫£ 1 c√¢u chuy·ªán ...,0


In [29]:
# Number of examples per label
label_counts = train['label_id'].value_counts()
print(label_counts)

label_id
0    19886
2     2556
1     1606
Name: count, dtype: int64


## Remove null values

In [30]:
# Count missing values per column (isna is the same check as isnull)
train.isna().sum()

free_text    2
label_id     0
dtype: int64

In [31]:
# Drop every row that has a missing value in any column
train = train.dropna(axis=0, how='any')

## Remove #ERROR! values

In [32]:
# Rows whose text contains the "#ERROR!" placeholder
error_mask = train['free_text'].str.contains("#ERROR!")
error_rows = train[error_mask]
print(error_rows)

      free_text  label_id
1729    #ERROR!         0
1867    #ERROR!         0
3568    #ERROR!         0
10788   #ERROR!         0
11218   #ERROR!         0
11674   #ERROR!         0
15413   #ERROR!         0
16080   #ERROR!         0
18660   #ERROR!         2
20848   #ERROR!         1


In [33]:
# Keep only the rows whose text does not contain the "#ERROR!" placeholder
is_error = train['free_text'].str.contains("#ERROR!")
train = train[~is_error]

## Remove duplicates

In [34]:
# Rows whose text repeats an earlier row (the first occurrence is not flagged)
is_repeat = train['free_text'].duplicated()
duplicate_rows = train[is_repeat]
print(duplicate_rows)

                                        free_text  label_id
225                              reaction th·∫ßy ∆°i         0
442                    ƒë·∫∑t kh√¥ g√† ·ªü ƒë√¢u v th·∫ßy ∆°i         0
466                              M·ªát qu√° th·∫ßy ·∫° üòû         0
696                                         xin ·∫°         0
790                     T·∫°i sao ph·∫£i tr·∫£ l·ªùi th·∫ßy         0
...                                           ...       ...
23986  ƒê√¨nh Quang ·ª•a ƒë√¢u c√≥ gi·∫£i g√¨ b√™n vn ƒëau ta         0
23992                        Th√†nh Huy Ho√†ng Ti·∫øn         0
24008                     T·ª± h√†o th·∫ßy Ba gold :))         0
24010                          L·ª° tay th·∫ßy ∆°i :))         0
24022                        Phong Chau c∆∞·ªùi ·ªâa üòÇ         0

[1480 rows x 2 columns]


In [35]:
# Drop duplicate texts, keeping the first occurrence of each.
# The result is reassigned instead of using inplace=True: `train` was produced by
# boolean filtering, and mutating such a frame in place can raise
# SettingWithCopyWarning and makes the cell harder to reason about on re-run.
train = train.drop_duplicates(subset='free_text', keep='first')

## Remove outliers
- Some examples are extremely long (more than 500 characters) and carry little useful information, so we remove them.

In [36]:
# Count the texts longer than 500 characters
text_lengths = train['free_text'].str.len()
num_long_rows = text_lengths.gt(500).sum()
print(num_long_rows)

14


In [37]:
# Keep only the texts of at most 500 characters
within_limit = train['free_text'].str.len().le(500)
train = train[within_limit]

## Remove special characters

### URL

In [38]:
# Flag every row whose text contains an http... or www... token
mask = train['free_text'].str.contains(r'http\S+|www\S+', regex=True)

# Select the flagged rows
url_examples = train.loc[mask]

# Preview the first five flagged rows
print(url_examples.head(5))

                                              free_text  label_id
1222  Xem ngay h·∫≠u tr∆∞·ªùng c·ª±c hi·∫øm c·ªßa c√¥ Minh Hi·∫øu ...         0
1432  FB ch√≠nh LinDa: https://www.facebook.com/linda...         0
1839  https://youtu.be/tvyO2B3oEYk th∆∞ gi√£n ƒë√£ c·∫£ nh...         0
2020     @c√¥ng danh nguyen https://youtu.be/fSypgwW1L_s         0
6891  bi·ªát th·ª± c·ªßa nude tiger: https://www.google.co...         0


In [39]:
# URL removal: strip http... and www... tokens in a single pass.
# One `.str.replace` with the combined pattern replaces the previous mix of
# `Series.str.replace` and a chained `Series.replace`, and matches the pattern
# used to detect URL rows.
train['free_text'] = train['free_text'].str.replace(r'http\S+|www\S+', '', regex=True)

### Punctuation

In [40]:
# Delete every ASCII punctuation character from the text.
# A translation table is used instead of the regex '[' + string.punctuation + ']':
# inside a character class the backslash in string.punctuation escapes the ']'
# that follows it, so backslashes themselves were never removed.
punctuation_table = str.maketrans('', '', string.punctuation)
train['free_text'] = train['free_text'].str.translate(punctuation_table)

### Stopwords
Removing stopwords could lead to a loss of information, so this step is commented out for now.

In [41]:
# # Load stopwords
# with open('./Stopwords/vietnamese-stopwords.txt', 'r') as f:
#     stopwords = f.read().splitlines()

# # Function to remove stopwords

# def remove_stopwords(text):
#     words = text.split()
#     words = [word for word in words if word not in stopwords]
#     return ' '.join(words)


# # Remove stopwords from 'free_text'
# train['free_text'] = train['free_text'].apply(remove_stopwords)

## Lowercasing

In [42]:
# Lowercase all text
train = train.assign(free_text=train['free_text'].str.lower())

## Emoji processing

In [43]:
# Show emoji rows
def show_emoji():
    """Return the rows of the global `train` frame whose text contains an emoji.

    A row qualifies when at least one character of its `free_text` is a key
    of `emoji.EMOJI_DATA`.
    """
    def contains_emoji(text):
        for char in text:
            if char in emoji.EMOJI_DATA:
                return True
        return False

    has_emoji = train['free_text'].apply(contains_emoji)
    return train[has_emoji]
# Print the rows that contain an emoji
print(show_emoji())

                                               free_text  label_id
0      em ƒë∆∞·ª£c l√†m fan c·ª©ng lu√¥n r·ªìi n√® ‚ù§Ô∏è reaction q...         0
8                                                    ü•∞ü•∞ü•∞         0
10          ƒë∆∞·ª£c anh ∆∞i l√¢u r·ªìi kh√¥ng nghe ph√∫c du rap ü§£         0
11                      c·∫Øt cho tr·∫ª tr√¢u b·ªõt thui m√† üòÇüòÇüòÇ         0
42                                      m·ªát qu√° th·∫ßy ·∫° üòû         0
...                                                  ...       ...
23995  th√¥i xong t√≥c a ch√≠ t√¥i nay l√†m vlog ƒë∆∞a djchi...         0
24018               nguy·ªÖn loan √Ω anh √Ω l√† th·ªãt a ƒëi e üòù         0
24034  v·ª´a l√†m chi·ªÅu nay xong ƒë√¢y  c·ª© ch·ªó uy t√≠n m√† l...         0
24036                               cu·ªôc s√¥ng m∆∞u sinh üè¶         0
24037             l·∫°i xui ch·ªã trang cho ƒÉn ƒë√≤n tr·∫≠n bh ü§£         0

[3233 rows x 2 columns]


In [44]:
# Replace each emoji with the text produced by emoji_vietnamese.demojize
train['free_text'] = train['free_text'].map(emoji_vietnamese.demojize)
# Inspect the row labelled 8 after demojizing
train.loc[8, 'free_text']

':m·∫∑t c∆∞·ªùi v·ªõi 3 tr√°i tim::m·∫∑t c∆∞·ªùi v·ªõi 3 tr√°i tim::m·∫∑t c∆∞·ªùi v·ªõi 3 tr√°i tim:'

In [45]:
# Rows that still contain an emoji after demojizing
remaining_emoji_rows = show_emoji()
print(remaining_emoji_rows)

                                               free_text  label_id
429                   ƒë·ªó thu·ª∑ trinh gi√¥ng y m·∫°ng nh√† m üòÄ         0
623                             gu th·ªùi trang g·∫° ƒë·ªãt √† üòÄ         1
667    ph∆∞∆°ng t√∫ ai bi·∫øt ng∆∞·ªùi nghe ƒë√¢u m√† m·∫•y b√†i ƒë·∫•...         0
1278        ch·ªã cho h·ªèi  t√™n tham m·ªπ vi·ªán l√† v·∫≠y ch·ªã  üòÄüòÄ         0
1931                   tu·∫•n b·∫£o ch·∫Øc b√°c pro h∆°n th·∫ßy üòÄüòÄ         0
...                                                  ...       ...
22508           ƒë√£ ƒë·∫øn l√∫c th·ª≠ ƒë·ªô b·ªÅn c·ªßa n√∫t haha r·ªìi üòÄ         0
22939                                             h√≥ng üòÄ         0
23217                    ph√∫c nh∆∞ng ph·ªëi nh·∫°c h·ª£p v·ªõi jüòÄ         0
23371                                b·∫Øt n·ªôp ph·∫°t ngay üò≥         0
23475                 phong c√°ch h·ªü rais c·ªßa ng∆∞·ªùi gi√†uüòÄ         0

[79 rows x 2 columns]


In [46]:
# List the distinct emojis that have not been processed yet
leftover_rows = show_emoji()
text = ' '.join(leftover_rows['free_text'])
emoji.distinct_emoji_list(text)

['üò≥', '‚úã', 'üêÑ', 'üòñ', 'üòÄ', 'üò´']

In [47]:
# Names for the emojis that were still present after demojizing.
# Each value uses the same ':name:' form as the demojized text.
emoji_names = {
    'üòñ': ':m·∫∑t b·ªëi r·ªëi:',
    'üòÄ': ':c∆∞·ªùi toe to√©t:',
    'üò≥': ':m·∫∑t ·ª≠ng ƒë·ªè v√¨ ng∆∞·ª£ng:',
    '‚úã': ':gi∆° tay:',
    'üêÑ': ':con b√≤:',
    'üò´': ':m·∫∑t m·ªát m·ªèi:'
}
# Replace each emoji with its name.
# The loop variable is deliberately not called `emoji`: that would rebind the
# imported `emoji` module to a string and break show_emoji() (which reads
# emoji.EMOJI_DATA) whenever it is called after this cell.
# regex=False makes the literal-string replacement explicit.
for emoji_char, emoji_name in emoji_names.items():
    train['free_text'] = train['free_text'].str.replace(emoji_char, emoji_name, regex=False)

In [48]:
# Final check: print the text of the row labelled 429
print(train.loc[429, 'free_text'])

ƒë·ªó thu·ª∑ trinh gi√¥ng y m·∫°ng nh√† m :c∆∞·ªùi toe to√©t:


## Save cleaned data

In [49]:
# Remove rows whose text is missing, empty, or whitespace-only after processing.
# The earlier cleaning steps replace matches with '' and so can leave empty
# strings, which dropna() does not treat as missing; and calling
# dropna(inplace=True) on the column alone never removes rows from `train`.
# Filter the DataFrame itself instead.
has_text = train['free_text'].notna() & train['free_text'].str.strip().ne('')
train = train[has_text]

In [50]:
# Write the cleaned frame to CSV without the index column
train.to_csv(path_or_buf='../Dataset_Cleaned/clean_train_vihsd.csv', index=False)