In [None]:
from sklearn.datasets import load_files
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import unicodedata
import underthesea
import pandas as pd

Load data

In [None]:
# Every split is read with the same decoding options: UTF-8 text, with
# decode_error="replace" so files that are not valid UTF-8 do not abort the load.
_load_options = {"encoding": "utf-8", "decode_error": "replace"}
train_set = load_files(container_path="./data/train_set/", **_load_options)
test_set = load_files(container_path="./data/test_set/", **_load_options)
val_set = load_files(container_path="./data/val_set/", **_load_options)

# Display data

Display the number of samples in each set

In [None]:
# Number of documents per split, keyed by the label shown on the x-axis
# (dicts keep insertion order, so the bars stay in train/test/validation order).
set_sizes = {
    "Train set": len(train_set["data"]),
    "Test set": len(test_set["data"]),
    "Validation set": len(val_set["data"]),
}
sets = list(set_sizes.keys())
num_of_samples = list(set_sizes.values())

# One bar per split.
sns.set_style("whitegrid")
plt.figure(figsize=(7, 5))
sns.barplot(x=sets, y=num_of_samples, palette="viridis")
plt.title("Number of samples in each set")
plt.xticks(rotation=45)  # tilt the tick labels by 45 degrees
plt.tight_layout()
plt.show()

Display the number of samples per class in the train set

In [None]:
# `target` holds integer class indices into `target_names`
# (scikit-learn `load_files` contract).
classes = train_set["target_names"]
# np.bincount with minlength returns exactly one count per class, including
# zero for a class that has no samples, so the counts always line up with
# `classes`. np.unique would silently omit such a class and misalign the bars.
num_of_samples = np.bincount(train_set["target"], minlength=len(classes))

sns.set_style("whitegrid")
plt.figure(figsize=(7, 5))
sns.barplot(x=classes, y=num_of_samples, palette="viridis")
# Name the split in the title so this figure can be told apart from the
# test/validation figures when viewed on its own.
plt.title("Number of samples in each class (train set)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Display the number of samples per class in the test set

In [None]:
# `target` holds integer class indices into `target_names`
# (scikit-learn `load_files` contract).
classes = test_set["target_names"]
# np.bincount with minlength returns exactly one count per class, including
# zero for a class that has no samples, so the counts always line up with
# `classes`. np.unique would silently omit such a class and misalign the bars.
num_of_samples = np.bincount(test_set["target"], minlength=len(classes))

sns.set_style("whitegrid")
plt.figure(figsize=(7, 5))
sns.barplot(x=classes, y=num_of_samples, palette="viridis")
# Name the split in the title so this figure can be told apart from the
# train/validation figures when viewed on its own.
plt.title("Number of samples in each class (test set)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Display the number of samples per class in the validation set

In [None]:
# `target` holds integer class indices into `target_names`
# (scikit-learn `load_files` contract).
classes = val_set["target_names"]
# np.bincount with minlength returns exactly one count per class, including
# zero for a class that has no samples, so the counts always line up with
# `classes`. np.unique would silently omit such a class and misalign the bars.
num_of_samples = np.bincount(val_set["target"], minlength=len(classes))

sns.set_style("whitegrid")
plt.figure(figsize=(7, 5))
sns.barplot(x=classes, y=num_of_samples, palette="viridis")
# Name the split in the title so this figure can be told apart from the
# train/test figures when viewed on its own.
plt.title("Number of samples in each class (validation set)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Preprocessing data

Remove punctuation and numbers

In [None]:
def remove_punctuation_and_number(text):
    """Reduce a raw document to lowercase, space-separated letters.

    Steps, in order:
      1. Normalize to Unicode NFC.
      2. Replace every character that is not an ASCII letter, a Vietnamese
         letter from the whitelist below, or a space with a single space
         (this drops digits, punctuation, newlines and tabs).
      3. Delete the literal markers "nlđo" / "NLĐO" (presumably a news-source
         byline -- confirm against the corpus).
      4. Collapse runs of whitespace, trim, and lowercase.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing but removed characters
        was present.
    """
    # NFC first, so letters typed as base + combining marks are compared in
    # their composed form against the whitelist.
    sample = unicodedata.normalize("NFC", text)
    sample = re.sub(r"[^a-zA-ZàáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ ]", " ", sample)
    # Remove the markers BEFORE collapsing whitespace: deleting a marker from
    # the middle of "a NLĐO b" leaves two adjacent spaces, which the collapse
    # below then folds into one.
    for marker in ("nlđo", "NLĐO"):
        sample = sample.replace(marker, "")
    sample = re.sub(r"\s+", " ", sample)
    return sample.strip().lower()

In [22]:
def preprocessing_data(path, min_words=10):
    """Clean one dataset split and write it to ``./data/{path}.csv``.

    Loads the text files under ``./data/{path}/``, cleans every document with
    ``remove_punctuation_and_number``, discards documents left with fewer than
    ``min_words`` whitespace-separated words, and saves the remaining rows as
    a UTF-8 CSV (no index) with the columns ``article`` and ``label``.

    Parameters
    ----------
    path : str
        Name of the split folder inside ``./data`` (e.g. ``"train_set"``);
        it is also used as the stem of the output CSV file.
    min_words : int, default 10
        Minimum number of words a cleaned article must contain to be kept.

    Returns
    -------
    None
        The result is only written to disk.
    """
    data = load_files(container_path=f"./data/{path}/", encoding="utf-8", decode_error="replace")
    articles = [remove_punctuation_and_number(text) for text in data.data]
    df = pd.DataFrame({"article": articles, "label": data.target})
    # Vectorized word count; used only as a filter mask, so no temporary
    # column has to be added to the frame and dropped again.
    word_count = df["article"].str.split().str.len()
    df[word_count >= min_words].to_csv(f"./data/{path}.csv", encoding="utf-8", index=False)

In [23]:
# Write one cleaned CSV per split, in train/test/validation order.
for split_name in ("train_set", "test_set", "val_set"):
    preprocessing_data(split_name)