In [15]:
import sys
import os

# Put the parent of the current working directory on sys.path (once) so the
# project-local imports that follow can be resolved.
# NOTE(review): assumes the notebook is launched from a direct subdirectory of the project root — confirm.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)
already_registered = project_root in sys.path
if not already_registered:
    sys.path.append(project_root)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from config.config import get_config

# Get configuration
# Later cells read config.paths.raw_data_dir (input data) and
# config.paths.figures_dir (saved plots) from this object.
config = get_config()

In [16]:
# Read the training spreadsheet into df_train. Any failure is reported as a
# printed message instead of being raised; in that case df_train is not
# assigned by this cell, which is what the "'df_train' in locals()" guards in
# later cells check for.
try:
    train_path = os.path.join(config.paths.raw_data_dir, 'train_nor_811.xlsx')
    if os.path.exists(train_path):
        df_train = pd.read_excel(train_path)
    else:
        raise FileNotFoundError(f"Training file not found at: {train_path}")
except Exception as load_error:
    print(f"Error loading data: {load_error!s}")

In [6]:
# Display the loaded training DataFrame (the rendered output is the truncated head/tail view).
df_train

Unnamed: 0.1,Unnamed: 0,Emotion,Sentence
0,188,Other,cho m√¨nh xin b√†i nh·∫°c t√™n l√† g√¨ v·ªõi ·∫°
1,166,Disgust,cho ƒë√°ng ƒë·ªùi con qu·ª∑ . v·ªÅ nh√† l√¥i con nh√† m√†y ...
2,1345,Disgust,lo h·ªçc ƒëi . y√™u ƒë∆∞∆°ng lol g√¨ hay l·∫°i th√≠ch h·ªçc...
3,316,Enjoyment,u·ªõc g√¨ sau n√†y v·ªÅ gi√† v·∫´n c√≥ th·ªÉ nh∆∞ c·ª• n√†y :))
4,1225,Enjoyment,m·ªói l·∫ßn c√≥ video c·ªßa con l√† c·ª© coi ƒëi coi l·∫°i ...
...,...,...,...
5543,1332,Disgust,ƒë∆∞·ªùng c·ªßa nh√† c·ª• hay sao m√† c·ª• c·∫•m ng∆∞·ªùi ta ƒë·ªó...
5544,825,Other,nh√¨n m·∫∑t h√©o queo lu√¥n
5545,165,Other,tao ƒëi xe m√°y m·ªói l·∫ßn mu·ªën ƒë·ªÉ xe ƒëi ƒë√¢u l√† phi...
5546,363,Enjoyment,th√≠ch th√¢n h√¨nh boss r·ªìi nhan üòå


In [1]:
# Print the number of rows per emotion label; skipped when df_train was never created.
if 'df_train' in locals():
    print("Emotion Distribution:")
    emotion_counts = df_train['Emotion'].value_counts()
    print(emotion_counts)

In [2]:
# Plot emotion distribution
# Draws one bar per emotion label, saves the figure under
# config.paths.figures_dir as 'Static_emotion.png', then displays it.
# The whole cell is skipped when df_train was never created.
if 'df_train' in locals():
    plt.figure(figsize=(10, 6))
    sns.countplot(x='Emotion', data=df_train, palette='Set2')
    plt.title('Bi·ªÉu ƒë·ªì ph√¢n ph·ªëi c√°c nh√£n c·∫£m x√∫c trong t·∫≠p d·ªØ li·ªáu hu·∫•n luy·ªán')
    plt.xlabel('Emotion')
    plt.ylabel('S·ªë l∆∞·ª£ng')
    plt.xticks(rotation=45)

    # savefig does not create missing directories, so make sure the target
    # folder exists before writing (no-op when it is already there).
    os.makedirs(config.paths.figures_dir, exist_ok=True)

    # Save plot using config path
    fig_path = os.path.join(config.paths.figures_dir, 'Static_emotion.png')
    plt.savefig(fig_path, dpi=300, bbox_inches='tight')
    plt.show()

In [20]:
import sys
import os

# Add the project root (parent of the working directory) to sys.path,
# unless it is already listed there.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.preprocess.preprocess_data import clean_doc

# Run every raw sentence through clean_doc and store the result in a new
# 'cleaned_text' column next to the original 'Sentence' column.
cleaned_sentences = df_train['Sentence'].apply(clean_doc)
df_train['cleaned_text'] = cleaned_sentences
print("Sample cleaned text:")
# Print the first 2 rows
sample_cleaned = df_train['cleaned_text'].head(2)
print(sample_cleaned)



Sample cleaned text:
0                cho m√¨nh xin b√†i nh·∫°c t√™n l√† g√¨ v·ªõi ·∫°
1    cho ƒë√°ng ƒë·ªùi con qu·ª∑ v·ªÅ nh√† l√¥i con nh√† m√†y ra...
Name: cleaned_text, dtype: object


In [19]:
# Show the first rows of the raw 'Sentence' column, for comparison with the cleaned_text sample.
df_train['Sentence'].head()

0                cho m√¨nh xin b√†i nh·∫°c t√™n l√† g√¨ v·ªõi ·∫°
1    cho ƒë√°ng ƒë·ªùi con qu·ª∑ . v·ªÅ nh√† l√¥i con nh√† m√†y ...
2    lo h·ªçc ƒëi . y√™u ƒë∆∞∆°ng lol g√¨ hay l·∫°i th√≠ch h·ªçc...
3      u·ªõc g√¨ sau n√†y v·ªÅ gi√† v·∫´n c√≥ th·ªÉ nh∆∞ c·ª• n√†y :))
4    m·ªói l·∫ßn c√≥ video c·ªßa con l√† c·ª© coi ƒëi coi l·∫°i ...
Name: Sentence, dtype: object