In [1]:
import pandas as pd
import os
from pathlib import Path

# Root directory that holds every input dataset
BASE_PATH = Path("H:/SentimentAnalystSchool/MidtermExam/DatasetUnder80kWord")

# Files to merge, grouped by source; the inner key names the split
# ("all" marks a source that ships as a single file).
FILES_CONFIG = {
    "Amazon": {
        "train": BASE_PATH.joinpath("Amazon", "train_reviews_under80words.csv"),
        "dev": BASE_PATH.joinpath("Amazon", "dev_reviews_under80words.csv"),
        "test": BASE_PATH.joinpath("Amazon", "test_reviews_under80words.csv"),
    },
    "Hotels": {
        "train": BASE_PATH.joinpath("Hotels", "train_reviews_under80words.csv"),
        "dev": BASE_PATH.joinpath("Hotels", "dev_reviews_under80words.csv"),
        "test": BASE_PATH.joinpath("Hotels", "test_reviews_under80words.csv"),
    },
    "AmazonsElectronics": {
        "all": BASE_PATH.joinpath("AmazonsElectronics", "Kaggle", "neutral_output_aspect_filledNone.csv"),
    },
}

# Report each configured file together with a found / not-found marker
print("C·∫•u h√¨nh ƒë∆∞·ªùng d·∫´n files:")
for source, files in FILES_CONFIG.items():
    print(f"\n{source}:")
    for file_type, path in files.items():
        if path.exists():
            exists = "‚úì"
        else:
            exists = "‚úó"
        print(f"  {file_type}: {path.name} [{exists}]")

C·∫•u h√¨nh ƒë∆∞·ªùng d·∫´n files:

Amazon:
  train: train_reviews_under80words.csv [‚úì]
  dev: dev_reviews_under80words.csv [‚úì]
  test: test_reviews_under80words.csv [‚úì]

Hotels:
  train: train_reviews_under80words.csv [‚úì]
  dev: dev_reviews_under80words.csv [‚úì]
  test: test_reviews_under80words.csv [‚úì]

AmazonsElectronics:
  all: neutral_output_aspect_filledNone.csv [‚úì]


In [2]:
def load_and_standardize_data(file_path):
    """
    Read a CSV file and reduce it to the standard columns reviewText, AspectTerm, Sentiment.

    Column names are stripped and lower-cased, known aliases are renamed to the
    standard names, and every other column is dropped. Missing or blank
    AspectTerm values are replaced with 'No Aspect'.

    Parameters:
    - file_path: location of the CSV file (str or pathlib.Path)

    Returns:
    - DataFrame holding whichever of reviewText, AspectTerm, Sentiment the file
      provides; an empty DataFrame when loading fails (the error is printed,
      not raised)
    """
    try:
        df = pd.read_csv(file_path)

        # Only the base name is shown in the log lines. Going through Path()
        # lets callers pass a plain string: previously a str path was read
        # successfully, then `.name` raised AttributeError and the except
        # branch below discarded the data.
        file_name = Path(file_path).name

        # Lower-case the header so alias matching is case-insensitive
        df.columns = df.columns.str.strip().str.lower()

        # Map the known header variants onto the standard names
        column_mapping = {
            'reviewtext': 'reviewText',
            'review': 'reviewText',
            'text': 'reviewText',
            'aspectterm': 'AspectTerm',
            'aspect': 'AspectTerm',
            'sentiment': 'Sentiment'
        }

        df = df.rename(columns=column_mapping)

        # Keep only the standard columns that are actually present.
        # .copy() makes the result independent of the frame it was sliced
        # from, so the assignment below cannot raise SettingWithCopyWarning.
        required_cols = ['reviewText', 'AspectTerm', 'Sentiment']
        available_cols = [col for col in required_cols if col in df.columns]
        df = df[available_cols].copy()

        if 'AspectTerm' in df.columns:
            # Count the missing / whitespace-only aspects before replacing them
            empty_mask = df['AspectTerm'].isna() | (df['AspectTerm'].astype(str).str.strip() == '')
            empty_count = empty_mask.sum()

            df.loc[empty_mask, 'AspectTerm'] = 'No Aspect'

            # The message names the placeholder that is really written
            # ('No Aspect'); it used to claim 'None'.
            print(f"  ‚úì Loaded {len(df)} rows from {file_name} (filled {empty_count} empty AspectTerm with 'No Aspect')")
        else:
            print(f"  ‚úì Loaded {len(df)} rows from {file_name}")

        return df

    except Exception as e:
        # Best effort by design: report the failure and let the caller skip this file
        print(f"  ‚úó Error loading {file_path}: {e}")
        return pd.DataFrame()

print("H√†m load_and_standardize_data ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!")

H√†m load_and_standardize_data ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!


In [3]:
def _balance_by_sentiment(df, balance_config):
    """
    Cap the number of rows per sentiment label, then shuffle the result.

    Labels are matched case-insensitively. Rows whose Sentiment is missing or
    is not named in balance_config are not part of the result; their count is
    printed so the loss is visible. Sampling and shuffling use random_state=42.
    """
    # Lower-case once instead of once per label. astype(str) keeps the
    # comparison working for non-string columns (missing values become 'nan').
    labels = df['Sentiment'].astype(str).str.lower()

    balanced_dfs = []
    for sentiment, max_count in balance_config.items():
        sentiment_df = df[labels == sentiment.lower()]
        current_count = len(sentiment_df)

        if current_count > max_count:
            sentiment_df = sentiment_df.sample(n=max_count, random_state=42)
            print(f"  {sentiment}: {current_count:,} ‚Üí {max_count:,} (gi·∫£m {current_count - max_count:,})")
        else:
            print(f"  {sentiment}: {current_count:,} (gi·ªØ nguy√™n, √≠t h∆°n {max_count:,})")

        balanced_dfs.append(sentiment_df)

    # Rows outside the configured labels are excluded by the loop above; say so
    unlisted = ~labels.isin([sentiment.lower() for sentiment in balance_config])
    if unlisted.any():
        print(f"  Note: dropped {int(unlisted.sum()):,} rows whose Sentiment is missing or not listed in balance_config")

    balanced = pd.concat(balanced_dfs, ignore_index=True)
    return balanced.sample(frac=1, random_state=42).reset_index(drop=True)


def merge_all_datasets(files_config, output_path=None, balance_config=None):
    """
    Merge every configured dataset into a single DataFrame (optionally saved as CSV).

    Parameters:
    - files_config: dict mapping a source name to a dict of {file type: path};
                    paths may be str or pathlib.Path
    - output_path: where to write the merged CSV (optional)
    - balance_config: dict capping the number of samples per sentiment (optional)
                      Example: {'positive': 5000, 'negative': 5000, 'neutral': 5000}
                      If None, all rows are kept. When given, rows whose
                      sentiment is not one of the keys are left out.

    Returns:
    - Merged DataFrame with the columns reviewText, AspectTerm, Sentiment;
      an empty DataFrame when nothing could be loaded

    Raises:
    - KeyError: when none of the loaded files has a Sentiment column
    """
    all_dataframes = []

    print("=" * 60)
    print("B·∫ÆT ƒê·∫¶U G·ªòP DATASETS")
    print("=" * 60)

    for source_name, files in files_config.items():
        print(f"\nüìÇ Loading from {source_name}:")
        for file_path in files.values():
            # Accept plain strings as well as Path objects
            file_path = Path(file_path)
            if file_path.exists():
                df = load_and_standardize_data(file_path)
                if not df.empty:
                    all_dataframes.append(df)
            else:
                print(f"  ‚ö† File not found: {file_path}")

    if not all_dataframes:
        print("\n‚ùå Kh√¥ng c√≥ d·ªØ li·ªáu n√†o ƒë∆∞·ª£c load!")
        return pd.DataFrame()

    # Stack all sources into one frame
    merged_df = pd.concat(all_dataframes, ignore_index=True)

    # Everything below reports on or filters by Sentiment; fail with a clear message
    if 'Sentiment' not in merged_df.columns:
        raise KeyError("Sentiment column not found in any loaded file")

    print("\n" + "=" * 60)
    print("K·∫æT QU·∫¢ G·ªòP DATASET (TR∆Ø·ªöC KHI C√ÇN B·∫∞NG)")
    print("=" * 60)
    print(f"\nüìä T·ªïng s·ªë m·∫´u: {len(merged_df):,}")
    print(f"\nüìà Ph√¢n b·ªë theo Sentiment:")
    print(merged_df['Sentiment'].value_counts().to_string())

    # Apply the per-sentiment caps when requested
    if balance_config:
        print("\n" + "=" * 60)
        print("√ÅP D·ª§NG C√ÇN B·∫∞NG D·ªÆ LI·ªÜU")
        print("=" * 60)

        merged_df = _balance_by_sentiment(merged_df, balance_config)

    print("\n" + "=" * 60)
    print("K·∫æT QU·∫¢ CU·ªêI C√ôNG")
    print("=" * 60)
    print(f"\nüìä T·ªïng s·ªë m·∫´u: {len(merged_df):,}")
    print(f"\nüìà Ph√¢n b·ªë theo Sentiment:")
    print(merged_df['Sentiment'].value_counts().to_string())
    print(f"\nüìã C√°c c·ªôt: {list(merged_df.columns)}")

    # Write the result when an output location was given
    if output_path:
        merged_df.to_csv(output_path, index=False)
        print(f"\nüíæ ƒê√£ l∆∞u file: {output_path}")

    return merged_df

print("H√†m merge_all_datasets ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!")

H√†m merge_all_datasets ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!


In [4]:
# ============================================
# PER-SENTIMENT SAMPLE CAPS
# ============================================
# Edit the values below to change how many samples are kept for each label.
# Set BALANCE_CONFIG = None instead to keep every row without any cap.
BALANCE_CONFIG = dict(
    positive=6600,  # maximum number of positive samples
    negative=6600,  # maximum number of negative samples
    neutral=5000,   # maximum number of neutral samples
)

# ============================================
# Run the merge
# NOTE: the file name says 13200 rows, but the recorded run of this
# configuration produced 14,716 rows.
OUTPUT_PATH = BASE_PATH.joinpath("merged_dataset_all_13200rows.csv")

merged_df = merge_all_datasets(FILES_CONFIG, OUTPUT_PATH, BALANCE_CONFIG)

B·∫ÆT ƒê·∫¶U G·ªòP DATASETS

üìÇ Loading from Amazon:
  ‚úì Loaded 6297 rows from train_reviews_under80words.csv (filled 0 empty AspectTerm with 'None')
  ‚úì Loaded 974 rows from dev_reviews_under80words.csv (filled 0 empty AspectTerm with 'None')
  ‚úì Loaded 966 rows from test_reviews_under80words.csv (filled 0 empty AspectTerm with 'None')

üìÇ Loading from Hotels:
  ‚úì Loaded 10126 rows from train_reviews_under80words.csv (filled 0 empty AspectTerm with 'None')
  ‚úì Loaded 1036 rows from dev_reviews_under80words.csv (filled 0 empty AspectTerm with 'None')
  ‚úì Loaded 242 rows from test_reviews_under80words.csv (filled 0 empty AspectTerm with 'None')

üìÇ Loading from AmazonsElectronics:
  ‚úì Loaded 5000 rows from neutral_output_aspect_filledNone.csv (filled 3788 empty AspectTerm with 'None')

K·∫æT QU·∫¢ G·ªòP DATASET (TR∆Ø·ªöC KHI C√ÇN B·∫∞NG)

üìä T·ªïng s·ªë m·∫´u: 24,641

üìà Ph√¢n b·ªë theo Sentiment:
Sentiment
positive    15937
neutral      5576
negative     3116

√

In [5]:
# Preview a few rows of the merged data; the bare last expression is what
# the notebook renders as a table, so it must stay the final statement.
print("üìã M·ªôt s·ªë m·∫´u d·ªØ li·ªáu ƒë·∫ßu ti√™n:")
merged_df.head(10)

üìã M·ªôt s·ªë m·∫´u d·ªØ li·ªáu ƒë·∫ßu ti√™n:


Unnamed: 0,reviewText,AspectTerm,Sentiment
0,I have seen and purchased these at lower price...,No Aspect,neutral
1,"I was thinking about other thing, it was very ...",No Aspect,neutral
2,"There are Better Options than this tired , out...",hotel quality,negative
3,"I think is good, but I think creative labs pro...",No Aspect,neutral
4,"This is NOT an Apple product. Although, it is ...",No Aspect,neutral
5,"Basically , you end up paying well over 2 time...",food prices,negative
6,"There ' s no bitter after taste , and it ' s g...",food quality,positive
7,"I like the fact that there are no additives , ...",food quality,positive
8,Restaurant very odd .,facilities general,negative
9,Appears to do the trick for being able to char...,price,neutral


## Tổng kết

File đã được tạo với cấu trúc 3 cột:
- `reviewText`: Nội dung review
- `AspectTerm`: Aspect term được trích xuất
- `Sentiment`: Nhãn sentiment (positive/negative/neutral)

In [8]:
# ============================================
# MERGE 2 DATASETS: merged_dataset + semeval_train
# ============================================
rule = "=" * 60
print(rule, "G·ªòP DATASET: merged_dataset + semeval_train", rule, sep="\n")

# Input files
MERGED_PATH = Path("H:/SentimentAnalystSchool/MidtermExam/DatasetUnder80kWord/merged_dataset_all_13200rows.csv")
SEMEVAL_PATH = Path("H:/SentimentAnalystSchool/MidtermExam/Semeval/semeval_train.csv")

# Read both files before reporting on either of them
df_merged = pd.read_csv(MERGED_PATH)
df_semeval = pd.read_csv(SEMEVAL_PATH)

# Same summary for each input: row count, columns, sentiment distribution
for heading, frame in (
    ("\nüìÇ Dataset 1: merged_dataset_all_13200rows.csv", df_merged),
    ("\nüìÇ Dataset 2: semeval_train.csv", df_semeval),
):
    print(heading)
    print(f"   S·ªë d√≤ng: {len(frame):,}")
    print(f"   C√°c c·ªôt: {list(frame.columns)}")
    print("   Ph√¢n b·ªë Sentiment:")
    print(frame['Sentiment'].value_counts().to_string())

G·ªòP DATASET: merged_dataset + semeval_train

üìÇ Dataset 1: merged_dataset_all_13200rows.csv
   S·ªë d√≤ng: 14,716
   C√°c c·ªôt: ['reviewText', 'AspectTerm', 'Sentiment']
   Ph√¢n b·ªë Sentiment:
Sentiment
positive    6600
neutral     5000
negative    3116

üìÇ Dataset 2: semeval_train.csv
   S·ªë d√≤ng: 5,416
   C√°c c·ªôt: ['reviewText', 'Sentiment', 'AspectTerm']
   Ph√¢n b·ªë Sentiment:
Sentiment
positive    3294
negative    1833
neutral      289


In [9]:
# Align semeval_train with merged_dataset (reviewText, AspectTerm, Sentiment).
# The rename covers files whose aspect column is called 'AspectTerms'; in the
# recorded run the column was already 'AspectTerm', so nothing was renamed.
STANDARD_COLUMNS = ['reviewText', 'AspectTerm', 'Sentiment']

if 'AspectTerms' in df_semeval.columns:
    df_semeval = df_semeval.rename(columns={'AspectTerms': 'AspectTerm'})
    print("‚úì ƒê√£ ƒë·ªïi t√™n c·ªôt 'AspectTerms' -> 'AspectTerm'")

# Put the columns in the same order as merged_dataset
df_semeval = df_semeval.loc[:, STANDARD_COLUMNS]

# Stack both datasets, then shuffle with a fixed seed and renumber the rows
df_combined = (
    pd.concat([df_merged, df_semeval], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("\n" + "=" * 60)
print("K·∫æT QU·∫¢ G·ªòP 2 DATASET")
print("=" * 60)
print(
    f"\nüìä T·ªïng s·ªë m·∫´u: {len(df_combined):,}",
    f"   - T·ª´ merged_dataset: {len(df_merged):,}",
    f"   - T·ª´ semeval_train: {len(df_semeval):,}",
    sep="\n",
)

print("\nüìà Ph√¢n b·ªë Sentiment sau khi g·ªôp:")
print(df_combined['Sentiment'].value_counts().to_string())

print(f"\nüìã C√°c c·ªôt: {list(df_combined.columns)}")


K·∫æT QU·∫¢ G·ªòP 2 DATASET

üìä T·ªïng s·ªë m·∫´u: 20,132
   - T·ª´ merged_dataset: 14,716
   - T·ª´ semeval_train: 5,416

üìà Ph√¢n b·ªë Sentiment sau khi g·ªôp:
Sentiment
positive    9894
neutral     5289
negative    4949

üìã C√°c c·ªôt: ['reviewText', 'AspectTerm', 'Sentiment']


In [10]:
# ============================================
# BALANCE THE DATA (OPTIONAL)
# ============================================
# Edit the caps below to balance the combined data.
# Set BALANCE_COMBINED = None to skip balancing.
BALANCE_COMBINED = dict(
    positive=10000,  # maximum number of positive samples
    negative=10000,  # maximum number of negative samples
    neutral=5000,    # maximum number of neutral samples
)

if not BALANCE_COMBINED:
    df_final = df_combined
else:
    print("=" * 60)
    print("√ÅP D·ª§NG C√ÇN B·∫∞NG D·ªÆ LI·ªÜU")
    print("=" * 60)

    # Case-insensitive label match; only labels listed in the config are kept
    lowered_labels = df_combined['Sentiment'].str.lower()

    capped_parts = []
    for label, cap in BALANCE_COMBINED.items():
        subset = df_combined[lowered_labels == label.lower()]
        n_rows = len(subset)

        if n_rows <= cap:
            print(f"  {label}: {n_rows:,} (gi·ªØ nguy√™n)")
        else:
            # Down-sample with a fixed seed so the selection is repeatable
            subset = subset.sample(n=cap, random_state=42)
            print(f"  {label}: {n_rows:,} ‚Üí {cap:,} (gi·∫£m {n_rows - cap:,})")

        capped_parts.append(subset)

    # Re-assemble and shuffle so the labels are interleaved
    df_final = (
        pd.concat(capped_parts, ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

print(f"\nüìä T·ªïng s·ªë m·∫´u cu·ªëi c√πng: {len(df_final):,}")
print("\nüìà Ph√¢n b·ªë Sentiment cu·ªëi c√πng:")
print(df_final['Sentiment'].value_counts().to_string())

√ÅP D·ª§NG C√ÇN B·∫∞NG D·ªÆ LI·ªÜU
  positive: 9,894 (gi·ªØ nguy√™n)
  negative: 4,949 (gi·ªØ nguy√™n)
  neutral: 5,289 ‚Üí 5,000 (gi·∫£m 289)

üìä T·ªïng s·ªë m·∫´u cu·ªëi c√πng: 19,843

üìà Ph√¢n b·ªë Sentiment cu·ªëi c√πng:
Sentiment
positive    9894
neutral     5000
negative    4949


In [11]:
# Write the combined dataset to disk
OUTPUT_COMBINED_PATH = Path("H:/SentimentAnalystSchool/MidtermExam/DatasetUnder80kWord/merged_dataset_combined_semeval.csv")
df_final.to_csv(OUTPUT_COMBINED_PATH, index=False)

print(
    f"‚úÖ ƒê√£ l∆∞u file g·ªôp: {OUTPUT_COMBINED_PATH}",
    f"   T·ªïng s·ªë d√≤ng: {len(df_final):,}",
    sep="\n",
)

# Preview the first 10 rows (rendered as the cell's output)
print("\nüìã M·∫´u d·ªØ li·ªáu (10 d√≤ng ƒë·∫ßu):")
df_final.head(n=10)

‚úÖ ƒê√£ l∆∞u file g·ªôp: H:\SentimentAnalystSchool\MidtermExam\DatasetUnder80kWord\merged_dataset_combined_semeval.csv
   T·ªïng s·ªë d√≤ng: 19,843

üìã M·∫´u d·ªØ li·ªáu (10 d√≤ng ƒë·∫ßu):


Unnamed: 0,reviewText,AspectTerm,Sentiment
0,That made me really sad because I really wante...,food general,negative
1,The friction pad is good to use in your car. ...,No Aspect,neutral
2,I wasn ' t sure how I would like the smoked ho...,food general,positive
3,I ordered these thinking they were something d...,No Aspect,neutral
4,"There are a ton of hotels and I say , look els...",hotel miscellaneous,negative
5,"Key Word = Budget Monte Carlo is clean , fairl...",location general,positive
6,"The ambience was nice, but service wasn't so g...",SERVICE-GENERAL,negative
7,Excellent value ( at London prices ) for a lux...,hotel prices,positive
8,We enjoyed eating at Ginger and Olsen - two ne...,service general,positive
9,"Room was very comfortable and modern , and goo...",rooms design_features,positive


# Gộp merged_dataset_combined_semeval.csv + df_train_all_new.csv

Gộp 2 file dataset với định dạng chuẩn `reviewText, AspectTerm, Sentiment`

In [14]:
import pandas as pd
import os
from pathlib import Path

def standardize_columns(df):
    """
    Return a copy of df reduced to the standard columns reviewText, AspectTerm, Sentiment.

    Supported input layouts:
    - Format 1: reviewText, AspectTerm, Sentiment (already standard)
    - Format 2: text, aspect, polarity (SemEval format)
    - Format 3: domain, review_id, sentence_id, text, aspect, polarity, label

    Header names are matched case-insensitively after stripping whitespace.
    When several aliases of the same standard column are present, the first
    one in the priority lists below is used (so polarity wins over label).
    Missing or blank AspectTerm values become 'No Aspect', '#' in AspectTerm is
    replaced by '-', and Sentiment is stripped and lower-cased; missing
    Sentiment values stay missing.

    Parameters:
    - df: DataFrame to standardize (not modified)

    Returns:
    - New DataFrame with whichever of reviewText, AspectTerm, Sentiment could
      be found, in that order; a warning is printed when any is missing
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Lower-case the header so alias matching is case-insensitive
    df.columns = df.columns.str.strip().str.lower()

    # Accepted aliases per standard column, highest priority first
    alias_priority = {
        'reviewText': ['reviewtext', 'review', 'text', 'sentence', 'content'],
        'AspectTerm': ['aspectterm', 'aspectterms', 'aspect', 'aspect_term'],
        # label is only a fallback for files without sentiment / polarity
        'Sentiment': ['sentiment', 'polarity', 'label'],
    }

    # Rename exactly one source column per standard column. Renaming every
    # alias at once produced duplicate column names (e.g. sentiment + label),
    # and the old polarity+label special case skipped most aliases.
    rename_map = {}
    for target, aliases in alias_priority.items():
        source = next((alias for alias in aliases if alias in df.columns), None)
        if source is not None:
            rename_map[source] = target
    df = df.rename(columns=rename_map)

    # Headers that differ only by case or padding collapse to the same name; keep the first
    df = df.loc[:, ~df.columns.duplicated()]

    # Keep only the standard columns that are present
    required_cols = ['reviewText', 'AspectTerm', 'Sentiment']
    available_cols = [col for col in required_cols if col in df.columns]

    if len(available_cols) < 3:
        missing = set(required_cols) - set(available_cols)
        print(f"  ‚ö† Thi·∫øu c√°c c·ªôt: {missing}")
        print(f"  üìã C√°c c·ªôt hi·ªán c√≥: {list(df.columns)}")

    # .copy() so the assignments below act on an independent frame
    df = df[available_cols].copy()

    if 'AspectTerm' in df.columns:
        # Replace missing / whitespace-only aspects with the placeholder
        empty_mask = df['AspectTerm'].isna() | (df['AspectTerm'].astype(str).str.strip() == '')
        empty_count = empty_mask.sum()
        df.loc[empty_mask, 'AspectTerm'] = 'No Aspect'
        if empty_count > 0:
            print(f"  ‚úì ƒê√£ ƒëi·ªÅn {empty_count} AspectTerm tr·ªëng v·ªõi 'No Aspect'")

        # Turn '#' into '-' in AspectTerm (LAPTOP#GENERAL -> LAPTOP-GENERAL)
        df['AspectTerm'] = df['AspectTerm'].astype(str).str.replace('#', '-', regex=False)

    if 'Sentiment' in df.columns:
        # Normalise labels, but keep missing values missing instead of
        # turning them into the literal label 'nan'
        has_label = df['Sentiment'].notna()
        normalized = df['Sentiment'].astype(str).str.strip().str.lower()
        df['Sentiment'] = normalized.where(has_label)

    return df

print("‚úì H√†m standardize_columns() ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!")
print("  H·ªó tr·ª£ c√°c format:")
print("  - reviewText, AspectTerm, Sentiment")
print("  - text, aspect, polarity (SemEval)")
print("  - domain, review_id, sentence_id, text, aspect, polarity, label")

‚úì H√†m standardize_columns() ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a!
  H·ªó tr·ª£ c√°c format:
  - reviewText, AspectTerm, Sentiment
  - text, aspect, polarity (SemEval)
  - domain, review_id, sentence_id, text, aspect, polarity, label


In [15]:
# ============================================
# MERGE: merged_dataset_combined_semeval + df_train_all_new
# ============================================
# NOTE(review): the banner names merged_dataset_combined_semeval, but the file
# read below is merged_dataset_all_NoAspect_version.csv - confirm which is intended.
print("=" * 60)
print("G·ªòP DATASET: merged_dataset_combined_semeval + df_train_all_new")
print("=" * 60)

# Input files
MERGED_COMBINED_PATH = Path("H:/SentimentAnalystSchool/MidtermExam/DatasetUnder80kWord/merged_dataset_all_NoAspect_version.csv")
SEMEVAL_NEW_PATH = Path("H:/SentimentAnalystSchool/MidtermExam/Semeval/df_train_all_new.csv")

# Read both files before reporting on either of them
df_merged_combined = pd.read_csv(MERGED_COMBINED_PATH)
df_semeval_new = pd.read_csv(SEMEVAL_NEW_PATH)


def _standardize_and_report(heading, frame):
    """Print the original columns, standardize the frame, print its summary, and return it."""
    print(heading)
    print(f"   C·ªôt ban ƒë·∫ßu: {list(frame.columns)}")
    frame = standardize_columns(frame)
    print(f"   S·ªë d√≤ng: {len(frame):,}")
    print(f"   C·ªôt sau chu·∫©n h√≥a: {list(frame.columns)}")
    print("   Ph√¢n b·ªë Sentiment:")
    print(frame['Sentiment'].value_counts().to_string())
    return frame


# Bring both datasets to the standard three-column layout
df_merged_combined = _standardize_and_report(
    "\nüìÇ Dataset 1: merged_dataset_all_NoAspect_version.csv", df_merged_combined
)
df_semeval_new = _standardize_and_report(
    "\nüìÇ Dataset 2: df_train_all_new.csv", df_semeval_new
)

G·ªòP DATASET: merged_dataset_combined_semeval + df_train_all_new

üìÇ Dataset 1: merged_dataset_all_NoAspect_version.csv
   C·ªôt ban ƒë·∫ßu: ['reviewText', 'AspectTerm', 'Sentiment']
   S·ªë d√≤ng: 14,716
   C·ªôt sau chu·∫©n h√≥a: ['reviewText', 'AspectTerm', 'Sentiment']
   Ph√¢n b·ªë Sentiment:
Sentiment
positive    6600
neutral     5000
negative    3116

üìÇ Dataset 2: df_train_all_new.csv
   C·ªôt ban ƒë·∫ßu: ['domain', 'review_id', 'sentence_id', 'text', 'aspect', 'polarity', 'label']
   S·ªë d√≤ng: 5,705
   C·ªôt sau chu·∫©n h√≥a: ['reviewText', 'AspectTerm', 'Sentiment']
   Ph√¢n b·ªë Sentiment:
Sentiment
positive    3294
negative    1833
neutral      578


In [16]:
# Stack the two standardized datasets (both are reviewText, AspectTerm, Sentiment),
# then shuffle with a fixed seed and renumber the rows
df_all_combined = (
    pd.concat([df_merged_combined, df_semeval_new], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("\n" + "=" * 60)
print("K·∫æT QU·∫¢ G·ªòP 2 DATASET")
print("=" * 60)
print(
    f"\nüìä T·ªïng s·ªë m·∫´u: {len(df_all_combined):,}",
    f"   - T·ª´ merged_dataset_combined_semeval: {len(df_merged_combined):,}",
    f"   - T·ª´ semeval_train_new: {len(df_semeval_new):,}",
    sep="\n",
)

print("\nüìà Ph√¢n b·ªë Sentiment sau khi g·ªôp:")
print(df_all_combined['Sentiment'].value_counts().to_string())

print(f"\nüìã C√°c c·ªôt: {list(df_all_combined.columns)}")


K·∫æT QU·∫¢ G·ªòP 2 DATASET

üìä T·ªïng s·ªë m·∫´u: 20,421
   - T·ª´ merged_dataset_combined_semeval: 14,716
   - T·ª´ semeval_train_new: 5,705

üìà Ph√¢n b·ªë Sentiment sau khi g·ªôp:
Sentiment
positive    9894
neutral     5578
negative    4949

üìã C√°c c·ªôt: ['reviewText', 'AspectTerm', 'Sentiment']


In [17]:
# Write the final merged dataset to disk
OUTPUT_ALL_PATH = Path("H:/SentimentAnalystSchool/MidtermExam/DatasetUnder80kWord/merged_all_final.csv")
df_all_combined.to_csv(OUTPUT_ALL_PATH, index=False)

print(
    f"‚úÖ ƒê√£ l∆∞u file g·ªôp: {OUTPUT_ALL_PATH}",
    f"   T·ªïng s·ªë d√≤ng: {len(df_all_combined):,}",
    sep="\n",
)

# Preview the first 10 rows (rendered as the cell's output)
print("\nüìã M·∫´u d·ªØ li·ªáu (10 d√≤ng ƒë·∫ßu):")
df_all_combined.head(n=10)

‚úÖ ƒê√£ l∆∞u file g·ªôp: H:\SentimentAnalystSchool\MidtermExam\DatasetUnder80kWord\merged_all_final.csv
   T·ªïng s·ªë d√≤ng: 20,421

üìã M·∫´u d·ªØ li·ªáu (10 d√≤ng ƒë·∫ßu):


Unnamed: 0,reviewText,AspectTerm,Sentiment
0,"Bigger screen than expected, laptop with a des...",DISPLAY-DESIGN_FEATURES,positive
1,The hotel staff when we stayed there were extr...,hotel design_features,negative
2,The main tourist spots are within a twenty min...,location general,positive
3,Never buy from Lake Champlain .,food recommendation,negative
4,It had most of the features and all of the pow...,LAPTOP-OPERATION_PERFORMANCE,positive
5,I complained that this did not constitute an o...,service general,negative
6,Took it back as it was defective.,LAPTOP-QUALITY,negative
7,Even better than the photos ! In addition to a...,hotel design_features,positive
8,Works fine for charging (no data connection). ...,No Aspect,neutral
9,I bought them because of Hungry Girl ' s glowi...,food general,negative
