In [52]:
import pandas as pd

## Load data

In [53]:
# Read the training CSV into a DataFrame
train = pd.read_csv(filepath_or_buffer='../Dataset/vihsd/train.csv')

In [54]:
# Preview the first five rows of the loaded frame
train.head(n=5)

Unnamed: 0,free_text,label_id
0,Em ƒë∆∞·ª£c l√†m fan c·ª©ng lu√¥n r·ªìi n√® ‚ù§Ô∏è reaction q...,0
1,ƒê√∫ng l√† b·ªçn m·∫Øt h√≠p l√≤ xo th·ª•t :))) b√™n vi·ªát n...,2
2,ƒê·∫≠u VƒÉn C∆∞·ªùng gi·ªù gi·ªëng th·∫±ng sida h∆°n √†,0
3,C√îN ƒê·ªí C·ª§C S√öC V√î NH√ÇN T√çNH ƒê·ªÄ NGHI VN. NH√Ä N∆Ø...,2
4,T·ª´ l√Ω thuy·∫øt ƒë·∫øn th·ª±c h√†nh l√† c·∫£ 1 c√¢u chuy·ªán ...,0


In [55]:
# Show how many rows carry each label_id value
print(train.loc[:, 'label_id'].value_counts())

label_id
0    19886
2     2556
1     1606
Name: count, dtype: int64


## Remove null values

In [56]:
# Number of missing entries in each column
train.isna().sum()

free_text    2
label_id     0
dtype: int64

In [57]:
# Discard every row that has a missing value in any column
train = train.dropna(axis=0, how='any')

## Remove #ERROR! values

In [58]:
# Collect the rows whose text contains the "#ERROR!" placeholder and show them
error_rows = train.loc[train['free_text'].str.contains("#ERROR!")]
print(error_rows)

      free_text  label_id
1729    #ERROR!         0
1867    #ERROR!         0
3568    #ERROR!         0
10788   #ERROR!         0
11218   #ERROR!         0
11674   #ERROR!         0
15413   #ERROR!         0
16080   #ERROR!         0
18660   #ERROR!         2
20848   #ERROR!         1


In [59]:
# Keep only the rows whose text does not contain the "#ERROR!" placeholder
train = train.loc[~train['free_text'].str.contains("#ERROR!")]

## Remove duplicates

In [60]:
# Rows whose text already appeared earlier in the frame (the first occurrence is not flagged)
duplicate_rows = train.loc[train['free_text'].duplicated(keep='first')]
print(duplicate_rows)

                                        free_text  label_id
225                              reaction th·∫ßy ∆°i         0
442                    ƒë·∫∑t kh√¥ g√† ·ªü ƒë√¢u v th·∫ßy ∆°i         0
466                              M·ªát qu√° th·∫ßy ·∫° üòû         0
696                                         xin ·∫°         0
790                     T·∫°i sao ph·∫£i tr·∫£ l·ªùi th·∫ßy         0
...                                           ...       ...
23986  ƒê√¨nh Quang ·ª•a ƒë√¢u c√≥ gi·∫£i g√¨ b√™n vn ƒëau ta         0
23992                        Th√†nh Huy Ho√†ng Ti·∫øn         0
24008                     T·ª± h√†o th·∫ßy Ba gold :))         0
24010                          L·ª° tay th·∫ßy ∆°i :))         0
24022                        Phong Chau c∆∞·ªùi ·ªâa üòÇ         0

[1480 rows x 2 columns]


In [61]:
# Drop rows with a duplicated 'free_text', keeping the first occurrence of each text.
# The result is reassigned rather than applied with inplace=True, so this cell follows the
# same "train = <new frame>" pattern as the other cleaning cells in this notebook.
train = train.drop_duplicates(subset='free_text', keep='first')

## Remove outliers
- Some examples are extremely long (more than 500 characters) and carry little useful information, so we remove them.

In [62]:
# Count the rows whose text is longer than 500 characters
num_long_rows = train['free_text'].str.len().gt(500).sum()
print(num_long_rows)

14


In [63]:
# Keep the rows whose text is at most 500 characters long
train = train.loc[train['free_text'].str.len().le(500)]

## Remove special characters

### URL

In [64]:
# Boolean mask: True where the text contains "http" or "www" followed by non-whitespace
mask = train['free_text'].str.contains(r'http\S+|www\S+', regex=True)

# Rows selected by the mask
url_examples = train.loc[mask]

# Show the first five of them
print(url_examples.head(n=5))

                                              free_text  label_id
1222  Xem ngay h·∫≠u tr∆∞·ªùng c·ª±c hi·∫øm c·ªßa c√¥ Minh Hi·∫øu ...         0
1432  FB ch√≠nh LinDa: https://www.facebook.com/linda...         0
1839  https://youtu.be/tvyO2B3oEYk th∆∞ gi√£n ƒë√£ c·∫£ nh...         0
2020     @c√¥ng danh nguyen https://youtu.be/fSypgwW1L_s         0
6891  bi·ªát th·ª± c·ªßa nude tiger: https://www.google.co...         0


In [65]:
# URL removal: delete every run of non-whitespace characters that begins with "http",
# then every run that begins with "www".
# Both passes go through the .str accessor so the two substitutions are applied the same
# way (the second pass previously used Series.replace instead of .str.replace).
train['free_text'] = (
    train['free_text']
    .str.replace(r'http\S+', '', regex=True)
    .str.replace(r'www\S+', '', regex=True)
)

### Punctuation

In [66]:
# Compose characters to NFC first: in decomposed text the Vietnamese diacritics are separate
# combining code points, which \w does not match, so the filter below would strip them and
# silently change the words.
# Then remove every character that is neither a word character nor whitespace
# (this drops punctuation and also symbols such as emoji).
train['free_text'] = (
    train['free_text']
    .str.normalize('NFC')
    .str.replace(r'[^\w\s]', '', regex=True)
)

### Stopwords

In [67]:
# Load stopwords
# Decode explicitly as UTF-8: without an encoding argument open() uses the platform default
# codec, which can mangle or fail on Vietnamese text.
with open('./Stopwords/vietnamese-stopwords.txt', 'r', encoding='utf-8') as f:
    # One entry per line; surrounding whitespace is stripped and blank lines are skipped.
    # A set is used because the collection is only consulted for membership tests,
    # once per token of every comment.
    # NOTE(review): if the file contains multi-word entries separated by spaces, the
    # token-by-token matching in remove_stopwords can never hit them — confirm the file format.
    stopwords = {line.strip() for line in f.read().splitlines() if line.strip()}

# Function to remove stopwords

def remove_stopwords(text, stopword_set=None):
    """Return `text` with every stopword token removed.

    The text is split on whitespace, tokens found in the stopword collection are
    dropped, and the remaining tokens are joined with single spaces (so runs of
    whitespace are collapsed and leading/trailing whitespace disappears).
    Matching is exact and case-sensitive.

    Parameters
    ----------
    text : str
        The text to filter.
    stopword_set : collection of str, optional
        Tokens to remove. When omitted, the notebook-level `stopwords`
        collection is used, which is the original behaviour.

    Returns
    -------
    str
        The filtered text; an empty string if no token survives.
    """
    # Only fall back to the global when nothing was passed, so an explicitly empty
    # collection is honoured as "remove nothing".
    active = stopwords if stopword_set is None else stopword_set
    return ' '.join(token for token in text.split() if token not in active)


# Run remove_stopwords over every comment and store the result back in 'free_text'
train['free_text'] = train['free_text'].map(remove_stopwords)

## Save cleaned data

In [68]:
# Removing URLs, punctuation/symbols and stopwords can leave a comment with no text at all.
# An empty string is written as an empty CSV field, which pd.read_csv loads back as NaN,
# so such rows would reintroduce the missing values removed at the start of the notebook.
# Drop them before saving.
train = train[train['free_text'].str.strip() != '']
train.to_csv('../Dataset_Cleaned/clean_train_vihsd.csv', index=False)