# 📊 Multi-Dataset Merger (v2) - Student Dropout Prediction
## Combining 5 Datasets for Maximum Accuracy

**Updates in v2:**
- Fixed CSV loading for semicolon-separated files
- Improved target variable detection (avoids 'Marital status' confusion)
- Fixed pandas compatibility issues

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import sys
import os
import warnings
# Suppress every warning category for the remainder of the session
warnings.filterwarnings('ignore')

# Add parent directory to path so the `import config` below can be resolved
# (presumably config.py lives one level above this notebook - verify)
sys.path.append('..')
import config

print("‚úì Libraries imported successfully")

## 2. Load All Datasets (Robust Loading)

In [None]:
# Load each dataset with smart separator detection.
# `datasets` maps the short dataset key to the DataFrame that was read;
# datasets that fail to load are reported and left out of the mapping.
datasets = {}

dataset_info = [
    ('dataset1', config.DATASET_1_PATH),
    ('dataset2', config.DATASET_2_PATH),
    ('dataset3', config.DATASET_3_PATH),
    ('dataset4', config.DATASET_4_PATH),
    ('dataset5', config.DATASET_5_PATH)
]

for name, path in dataset_info:
    try:
        try:
            # First attempt: default comma separator
            df = pd.read_csv(path)
        except pd.errors.ParserError:
            # A semicolon-separated file whose values contain commas
            # (e.g. decimal commas) produces rows with differing field
            # counts under sep=',' and cannot be tokenized at all.
            # Treat that like the single-column case and retry below.
            df = None

        # Only one column (or no parse at all) means the separator was
        # wrong: retry with a semicolon.
        if df is None or df.shape[1] == 1:
            df = pd.read_csv(path, sep=';')

        datasets[name] = df
        print(f"‚úì {config.DATASET_NAMES[name]:40} - {df.shape[0]:5,} rows √ó {df.shape[1]:2} columns")
    except FileNotFoundError:
        print(f"‚úó {config.DATASET_NAMES[name]:40} - FILE NOT FOUND")
    except Exception as e:
        # Best-effort loading: report the problem and continue with the
        # remaining datasets instead of aborting the whole run.
        print(f"‚úó {config.DATASET_NAMES[name]:40} - ERROR: {str(e)}")

print(f"\nüìä Total datasets loaded: {len(datasets)}")

## 3. Standardize Target Variables (Fixed Logic)

In [None]:
# Fixed Function to standardize target variable
def standardize_target(df, target_col=None):
    """
    Return a copy of `df` whose outcome column is named 'Target' and holds
    whitespace-stripped string labels.

    Parameters:
        df: input DataFrame; it is never modified.
        target_col: name of the outcome column. When None it is detected
            automatically: an exact 'Target' column first, then an exact
            'target' column, then the first column whose name contains
            'Dropout', 'Status', 'dropout' or 'status' (tried in that
            order), ignoring any column whose name contains 'marital'.

    Returns:
        A new DataFrame. If no target column could be detected, the copy is
        returned unchanged. Otherwise the chosen column is renamed to
        'Target' (replacing any other column already called 'Target') and
        its non-missing values are converted to stripped strings. Missing
        values stay missing so that a later dropna() can remove them.

    Raises:
        KeyError: if an explicitly supplied `target_col` is not a column
            of `df`.
    """
    df_copy = df.copy()

    # 1. Auto-detect target column if not provided
    if target_col is None:
        # Priority 1: Exact match for 'Target'
        if 'Target' in df_copy.columns:
            target_col = 'Target'
        # Priority 2: Exact match for 'target'
        elif 'target' in df_copy.columns:
            target_col = 'target'
        # Priority 3: Search for keywords, excluding 'Marital status'
        else:
            for candidate in ('Dropout', 'Status', 'dropout', 'status'):
                # str() keeps the search working for non-string labels
                matches = [
                    col for col in df_copy.columns
                    if candidate in str(col)
                    and 'marital' not in str(col).lower()
                ]
                if matches:
                    target_col = matches[0]
                    break
    elif target_col not in df_copy.columns:
        # Fail with the name the caller actually passed instead of a
        # confusing KeyError('Target') further down.
        raise KeyError(f"Target column {target_col!r} not found in DataFrame")

    if target_col is None:
        print("‚ö†Ô∏è No target column found. Skipping...")
        return df_copy

    print(f"  Identified target column: '{target_col}'")

    # 2. Rename to 'Target'
    if target_col != 'Target':
        # If 'Target' already exists (but isn't the one we picked), drop it to avoid duplicates
        if 'Target' in df_copy.columns:
            print("  Dropping existing 'Target' column to avoid duplicates")
            df_copy = df_copy.drop(columns=['Target'])
        df_copy = df_copy.rename(columns={target_col: 'Target'})

    # 3. Standardize values: cast to string and strip whitespace, but keep
    # missing entries missing. A plain astype(str) would turn them into the
    # literal text 'nan' / 'None', which dropna() no longer recognises.
    raw_target = df_copy['Target']
    df_copy['Target'] = raw_target.astype(str).str.strip().where(raw_target.notna())

    return df_copy

# Run every loaded dataset through standardize_target and report the result
standardized_datasets = {}
for name, df in datasets.items():
    print(f"\nProcessing {config.DATASET_NAMES[name]}...")
    result = standardize_target(df)
    standardized_datasets[name] = result
    if 'Target' not in result.columns:
        print(f"‚úó No target found")
        continue
    # Show at most the first five distinct labels
    print(f"‚úì Target standardized. Values: {result['Target'].unique()[:5]}")

## 4. Feature Harmonization

In [None]:
# Define key features to extract/engineer
KEY_FEATURES = [
    'age',
    'gender',
    'course',
    'year_of_study',
    'previous_qualification',
    'admission_grade',
    'mother_qualification',
    'father_qualification',
    'tuition_fees_up_to_date',
    'scholarship_holder',
    'debtor',
    'curricular_units_1st_sem',
    'curricular_units_2nd_sem',
    'unemployment_rate',
    'inflation_rate',
    'gdp'
]

def harmonize_features(df, dataset_name):
    """
    Extract and harmonize features from dataset.

    Parameters:
        df: source DataFrame (not modified).
        dataset_name: label written to the 'dataset_source' column of
            every output row.

    Returns:
        A new DataFrame on the same index as `df` containing, in order:
        'dataset_source'; whichever of 'age', 'gender' and 'course' could
        be mapped from the source column names; 'Target' when `df` has
        that column; and every remaining KEY_FEATURES entry filled with
        NaN. When several source columns map to the same feature, the
        last one in column order wins.
    """
    # Build on the source index so the scalar label below is broadcast to
    # every row. Assigning a scalar to an index-less frame creates a
    # zero-length column that turns into NaN once real columns are added.
    harmonized = pd.DataFrame(index=df.index)

    # Add dataset source
    harmonized['dataset_source'] = dataset_name

    # Map columns to standard names
    for col in df.columns:
        # str() keeps the mapping working for non-string column labels
        col_lower = str(col).lower().replace(' ', '_')
        # Split the name into alphanumeric words, e.g.
        # 'age_at_enrollment' -> ['age', 'at', 'enrollment']
        words = ''.join(ch if ch.isalnum() else ' ' for ch in col_lower).split()

        # Age: matched as a whole word, because 'age' is a substring of
        # common unrelated names such as 'percentage' or 'average'
        if 'age' in words:
            harmonized['age'] = df[col]

        # Gender
        elif 'gender' in col_lower or 'sex' in col_lower:
            harmonized['gender'] = df[col]

        # Course
        elif 'course' in col_lower:
            harmonized['course'] = df[col]

        # Add more mappings as needed

    # Add target if available
    if 'Target' in df.columns:
        harmonized['Target'] = df['Target']

    # Fill missing columns with NaN
    for feature in KEY_FEATURES:
        if feature not in harmonized.columns:
            harmonized[feature] = np.nan

    return harmonized

print("Harmonizing datasets...\n")
# Harmonize each standardized dataset, tagging its rows with the display name
harmonized_datasets = {}
for name, df in standardized_datasets.items():
    label = config.DATASET_NAMES[name]
    frame = harmonize_features(df, label)
    harmonized_datasets[name] = frame
    print(f"‚úì {label:40} - {frame.shape}")

## 5. Merge and Save

In [None]:
# Stack the harmonized frames into one table with a fresh 0..n-1 index
frames = list(harmonized_datasets.values())
merged_df = pd.concat(frames, ignore_index=True)

# Basic cleaning: rows without a target label are discarded
if 'Target' in merged_df.columns:
    merged_df = merged_df.dropna(subset=['Target'])

# Write the merged table to the location configured in config
output_path = config.MERGED_DATASET_PATH
merged_df.to_csv(output_path, index=False)

# Summary of what was written
n_records, n_features = merged_df.shape
print(f"\n‚úì Merged dataset saved to: {output_path}")
print(f"  Total records: {n_records:,}")
print(f"  Total features: {n_features}")

display(merged_df.head())