In [28]:
import pandas as pd
import os
from pathlib import Path

# Root directory of the datasets. The original location stays the default, but
# it can be overridden with the SENTIMENT_DATA_DIR environment variable so the
# notebook also runs on machines where the data lives elsewhere.
BASE_PATH = Path(
    os.environ.get("SENTIMENT_DATA_DIR")
    or "H:/SentimentAnalystSchool/MidtermExam/DatasetUnder80kWord"
)

# Files to merge, grouped by source. Amazon and Hotels share the same
# train/dev/test file layout, so their entries are generated rather than
# written out three times each.
FILES_CONFIG = {
    source: {
        split: BASE_PATH / source / f"{split}_reviews_under80words.csv"
        for split in ("train", "dev", "test")
    }
    for source in ("Amazon", "Hotels")
}
# AmazonsElectronics is a single file under a "Kaggle" sub-folder.
FILES_CONFIG["AmazonsElectronics"] = {
    "all": BASE_PATH / "AmazonsElectronics" / "Kaggle" / "neutral_output_aspect.csv"
}

# Show every configured file together with a marker telling whether it exists.
print("C·∫•u h√¨nh ƒë∆∞·ªùng d·∫´n files:")
for source, files in FILES_CONFIG.items():
    print(f"\n{source}:")
    for file_type, path in files.items():
        exists = "‚úì" if path.exists() else "‚úó"
        print(f"  {file_type}: {path.name} [{exists}]")

C·∫•u h√¨nh ƒë∆∞·ªùng d·∫´n files:

Amazon:
  train: train_reviews_under80words.csv [‚úì]
  dev: dev_reviews_under80words.csv [‚úì]
  test: test_reviews_under80words.csv [‚úì]

Hotels:
  train: train_reviews_under80words.csv [‚úì]
  dev: dev_reviews_under80words.csv [‚úì]
  test: test_reviews_under80words.csv [‚úì]

AmazonsElectronics:
  all: neutral_output_aspect.csv [‚úì]


In [29]:
def load_and_standardize_data(file_path):
    """
    Read a CSV file and standardize its columns to reviewText, AspectTerm, Sentiment.

    Column names are matched case-insensitively (after trimming whitespace)
    against a small alias table, only the three standard columns are kept, and
    rows whose AspectTerm is null, blank or the literal "none" (any casing,
    surrounding whitespace ignored) are removed.

    Parameters:
    - file_path: path to the CSV file (str or pathlib.Path)

    Returns:
    - DataFrame with the standard columns that were found in the file
      (reviewText, AspectTerm, Sentiment). An empty DataFrame is returned,
      after printing the error, if anything goes wrong while loading.
    """
    try:
        # Accept plain strings too: previously a str path was read successfully
        # but then failed on `.name`, and the except below silently turned the
        # loaded data into an empty result.
        file_path = Path(file_path)
        df = pd.read_csv(file_path)

        # Lower-case / trim the header so aliases can be matched reliably
        df.columns = df.columns.str.strip().str.lower()

        # Map the column names used by the different sources to the standard names
        column_mapping = {
            'reviewtext': 'reviewText',
            'review': 'reviewText',
            'text': 'reviewText',
            'aspectterm': 'AspectTerm',
            'aspect': 'AspectTerm',
            'sentiment': 'Sentiment'
        }

        df = df.rename(columns=column_mapping)

        # Two aliases of the same standard column (e.g. "review" and "text")
        # would produce duplicate column labels; keep the first occurrence.
        df = df.loc[:, ~df.columns.duplicated()]

        # Keep only the three required columns (those that are present)
        required_cols = ['reviewText', 'AspectTerm', 'Sentiment']
        available_cols = [col for col in required_cols if col in df.columns]
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            # Make a schema mismatch visible instead of dropping it silently
            print(f"  [!] {file_path.name}: missing expected columns {missing_cols}")
        df = df[available_cols]

        # Drop rows whose AspectTerm is null, blank or "none"
        rows_before = len(df)
        if 'AspectTerm' in df.columns:
            df = df.dropna(subset=['AspectTerm'])
            # astype(str) keeps this working for non-string columns; the
            # normalized copy is used for filtering only, values are untouched.
            aspect_normalized = df['AspectTerm'].astype(str).str.strip().str.lower()
            df = df[~aspect_normalized.isin(['', 'none'])]
        rows_after = len(df)
        dropped = rows_before - rows_after

        print(f"  ‚úì Loaded {rows_after} rows from {file_path.name} (dropped {dropped} rows with empty AspectTerm)")
        return df

    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip this file
        print(f"  ‚úó Error loading {file_path}: {e}")
        return pd.DataFrame()

print("H√†m load_and_standardize_data ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!")

H√†m load_and_standardize_data ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!


In [30]:
def _print_banner(title, leading_newline=True):
    """Print `title` framed by two 60-character '=' rules."""
    rule = "=" * 60
    print("\n" + rule if leading_newline else rule)
    print(title)
    print(rule)


def merge_all_datasets(files_config, output_path=None, balance_config=None):
    """
    Merge all configured datasets into a single DataFrame (and optionally a CSV file).

    Parameters:
    - files_config: dict mapping a source name to a dict of {label: file path};
                    paths may be str or pathlib.Path
    - output_path: where to save the merged CSV (optional); missing parent
                   directories are created
    - balance_config: dict capping the number of samples per sentiment (optional)
                      Example: {'positive': 5000, 'negative': 5000, 'neutral': 5000}
                      Labels are matched case-insensitively, ignoring surrounding
                      whitespace. Rows whose Sentiment is missing or not listed
                      are dropped, and the number of such rows is reported.
                      If None (or empty), all data is kept unchanged.

    Returns:
    - Merged DataFrame with the columns produced by load_and_standardize_data
      (reviewText, AspectTerm, Sentiment); an empty DataFrame if nothing was loaded.
    """
    all_dataframes = []

    _print_banner("B·∫ÆT ƒê·∫¶U G·ªòP DATASETS", leading_newline=False)

    for source_name, files in files_config.items():
        print(f"\nüìÇ Loading from {source_name}:")
        for file_path in files.values():
            # Accept str paths as well; `.exists()` only exists on Path objects
            file_path = Path(file_path)
            if file_path.exists():
                df = load_and_standardize_data(file_path)
                if not df.empty:
                    all_dataframes.append(df)
            else:
                print(f"  ‚ö† File not found: {file_path}")

    if not all_dataframes:
        print("\n‚ùå Kh√¥ng c√≥ d·ªØ li·ªáu n√†o ƒë∆∞·ª£c load!")
        return pd.DataFrame()

    # Concatenate everything that was loaded
    merged_df = pd.concat(all_dataframes, ignore_index=True)

    _print_banner("K·∫æT QU·∫¢ G·ªòP DATASET (TR∆Ø·ªöC KHI C√ÇN B·∫∞NG)")
    print(f"\nüìä T·ªïng s·ªë m·∫´u: {len(merged_df):,}")
    print(f"\nüìà Ph√¢n b·ªë theo Sentiment:")
    print(merged_df['Sentiment'].value_counts().to_string())

    # Apply the per-sentiment caps, if any
    if balance_config:
        _print_banner("√ÅP D·ª§NG C√ÇN B·∫∞NG D·ªÆ LI·ªÜU")

        # Normalize the labels once instead of on every loop iteration
        normalized_labels = merged_df['Sentiment'].str.strip().str.lower()

        balanced_dfs = []
        for sentiment, max_count in balance_config.items():
            sentiment_df = merged_df[normalized_labels == sentiment.strip().lower()]
            current_count = len(sentiment_df)

            if current_count > max_count:
                # Fixed seed keeps the down-sampling reproducible
                sentiment_df = sentiment_df.sample(n=max_count, random_state=42)
                print(f"  {sentiment}: {current_count:,} ‚Üí {max_count:,} (gi·∫£m {current_count - max_count:,})")
            else:
                print(f"  {sentiment}: {current_count:,} (gi·ªØ nguy√™n, √≠t h∆°n {max_count:,})")

            balanced_dfs.append(sentiment_df)

        # Rows with a missing or unlisted Sentiment are not selected by any cap;
        # report them so the shrinkage of the dataset is not silent.
        wanted_labels = {label.strip().lower() for label in balance_config}
        unmatched = int((~normalized_labels.isin(wanted_labels)).sum())
        if unmatched:
            print(f"  [!] {unmatched:,} rows dropped: Sentiment missing or not listed in balance_config")

        merged_df = pd.concat(balanced_dfs, ignore_index=True)
        # Shuffle so the labels are not grouped in the output
        merged_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

    _print_banner("K·∫æT QU·∫¢ CU·ªêI C√ôNG")
    print(f"\nüìä T·ªïng s·ªë m·∫´u: {len(merged_df):,}")
    print(f"\nüìà Ph√¢n b·ªë theo Sentiment:")
    print(merged_df['Sentiment'].value_counts().to_string())
    print(f"\nüìã C√°c c·ªôt: {list(merged_df.columns)}")

    # Save the result when an output path was given
    if output_path:
        # to_csv does not create directories; make sure the target folder exists
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        merged_df.to_csv(output_path, index=False)
        print(f"\nüíæ ƒê√£ l∆∞u file: {output_path}")

    return merged_df

print("H√†m merge_all_datasets ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!")

H√†m merge_all_datasets ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!


In [31]:
# ============================================
# PER-SENTIMENT SAMPLE CAPS
# ============================================
# Edit the values below to change how many samples are kept for each label.
# Use None instead of the dict to keep all data without any cap.

BALANCE_CONFIG = dict(
    positive=3000,  # maximum number of positive samples
    negative=3000,  # maximum number of negative samples
    neutral=1788,   # maximum number of neutral samples
)

# Set BALANCE_CONFIG = None to keep all of the data
# BALANCE_CONFIG = None

# ============================================
# Run the merge
OUTPUT_PATH = BASE_PATH.joinpath("merged_dataset_all_7700rows.csv")

merged_df = merge_all_datasets(FILES_CONFIG, OUTPUT_PATH, BALANCE_CONFIG)

B·∫ÆT ƒê·∫¶U G·ªòP DATASETS

üìÇ Loading from Amazon:
  ‚úì Loaded 6297 rows from train_reviews_under80words.csv (dropped 0 rows with empty AspectTerm)
  ‚úì Loaded 974 rows from dev_reviews_under80words.csv (dropped 0 rows with empty AspectTerm)
  ‚úì Loaded 966 rows from test_reviews_under80words.csv (dropped 0 rows with empty AspectTerm)

üìÇ Loading from Hotels:
  ‚úì Loaded 10126 rows from train_reviews_under80words.csv (dropped 0 rows with empty AspectTerm)
  ‚úì Loaded 1036 rows from dev_reviews_under80words.csv (dropped 0 rows with empty AspectTerm)
  ‚úì Loaded 242 rows from test_reviews_under80words.csv (dropped 0 rows with empty AspectTerm)

üìÇ Loading from AmazonsElectronics:
  ‚úì Loaded 1212 rows from neutral_output_aspect.csv (dropped 3788 rows with empty AspectTerm)

K·∫æT QU·∫¢ G·ªòP DATASET (TR∆Ø·ªöC KHI C√ÇN B·∫∞NG)

üìä T·ªïng s·ªë m·∫´u: 20,853

üìà Ph√¢n b·ªë theo Sentiment:
Sentiment
positive    15937
negative     3116
neutral      1788

√ÅP D·ª§NG C√ÇN B·∫

In [32]:
# Preview a few rows of the merged dataset; the bare last expression lets
# the notebook render the frame with its rich display.
print("üìã M·ªôt s·ªë m·∫´u d·ªØ li·ªáu ƒë·∫ßu ti√™n:")
merged_df.head(n=10)

üìã M·ªôt s·ªë m·∫´u d·ªØ li·ªáu ƒë·∫ßu ti√™n:


Unnamed: 0,reviewText,AspectTerm,Sentiment
0,"While the tablet is nice, and well constructed...",google play,neutral
1,Highly recommended for those looking for that ...,food recommendation,positive
2,"ZICO Pure Premium Coconut Water , Natural , 11...",food general,positive
3,This is the only time I have strongly disliked...,food quality,negative
4,"Located on Via Della Scala , it was the perfec...",location general,positive
5,"It works in some, not all USB ports because it...",installation,neutral
6,"I LOVE PB , but try not to eat too much becaus...",food general,neutral
7,Excellent Booked this hotel a couple of months...,hotel general,positive
8,McCormick spices aren ' t exactly world - reno...,food general,neutral
9,"First thing , the room was really small and sq...",rooms design_features,negative


## Tổng kết

File đã được tạo với cấu trúc 3 cột:
- `reviewText`: Nội dung review
- `AspectTerm`: Aspect term được trích xuất
- `Sentiment`: Nhãn sentiment (positive/negative/neutral)