In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import xgboost as xgb
from imblearn.combine import SMOTETomek
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler,  ClusterCentroids
from lightgbm import LGBMClassifier
from sklearn import ensemble, linear_model, preprocessing, neighbors, datasets
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier, StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_curve, auc, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
from xgboost import XGBClassifier


# NOTE(review): with no category or module given, this filter hides every warning
# emitted from this point on (e.g. any convergence or deprecation warnings the
# libraries imported above might raise). Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [5]:
#Loading the datasets
def _load_feature_set(suffix, description):
    """
    Read the train/test CSV files of one feature set from ``data/`` and print a summary.

    Parameters
    ----------
    suffix : str
        Text inserted before ``.csv`` in the four file names
        (``data/X_train{suffix}.csv``, ``data/X_test{suffix}.csv``,
        ``data/y_train{suffix}.csv``, ``data/y_test{suffix}.csv``).
    description : str
        Label printed above the summary so the two feature sets can be told apart.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the targets are the
        ``'rating'`` column of the corresponding y files.
    """
    features_train = pd.read_csv(f'data/X_train{suffix}.csv', engine='python')
    features_test = pd.read_csv(f'data/X_test{suffix}.csv', engine='python')
    target_train = pd.read_csv(f'data/y_train{suffix}.csv', engine='python')['rating']
    target_test = pd.read_csv(f'data/y_test{suffix}.csv', engine='python')['rating']

    # Fail early if features and targets cannot be paired row by row.
    assert len(features_train) == len(target_train), "X_train / y_train length mismatch"
    assert len(features_test) == len(target_test), "X_test / y_test length mismatch"

    print(f"--- {description} ---")
    print(f"X_train shape: {features_train.shape}")
    print(f"X_test shape: {features_test.shape}")
    print(f"y_train shape: {target_train.shape}")
    print(f"y_test shape: {target_test.shape}")

    # DataFrame.info() writes its report itself and returns None, so it must not be
    # wrapped in print() (that would append a stray "None").
    print("X_train info:")
    features_train.info()
    print("X_train dtypes:", features_train.dtypes)

    return features_train, features_test, target_train, target_test


# Correlation-based feature selection
(X_train_correlation,
 X_test_correlation,
 y_train_correlation,
 y_test_correlation) = _load_feature_set('_correlation', 'correlation-based feature selection')

# Regular feature selection
X_train, X_test, y_train, y_test = _load_feature_set('', 'regular feature selection')



X_train shape: (80209, 19)
X_test shape: (20053, 19)
y_train shape: (80209,)
y_test shape: (20053,)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80209 entries, 0 to 80208
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Sentiment_Blob       80209 non-null  float64
 1   Sentiment_VADER      80209 non-null  float64
 2   Sentiment_VADER_cat  80209 non-null  int64  
 3   Sentiment_Blob_cat   80209 non-null  float64
 4   text_word_length     80209 non-null  int64  
 5   text_length          80209 non-null  int64  
 6   tfidf_331            80209 non-null  float64
 7   tfidf_1867           80209 non-null  float64
 8   tfidf_3522           80209 non-null  float64
 9   tfidf_4431           80209 non-null  float64
 10  bow_331              80209 non-null  int64  
 11  bow_1739             80209 non-null  int64  
 12  bow_2738             80209 non-null  int64  
 13  bow_4431             80209 non-null 

In [3]:
def _undersample_to_median(X_train, y_train, random_state):
    """Randomly undersample every class that is larger than the median class size.

    Classes at or below the median size are kept in full. Rows are selected by
    position, so X and y do not need matching index labels, and both outputs are
    returned with the same fresh 0..n-1 index.
    """
    labels = pd.Series(y_train).reset_index(drop=True)
    label_values = labels.to_numpy()
    class_counts = labels.value_counts()
    # Class sizes are integers, so comparing against the truncated median selects
    # exactly the same classes as comparing against the raw (possibly x.5) median.
    cap = int(class_counts.median())
    # Dedicated generator so the draw is reproducible for a given random_state.
    rng = np.random.RandomState(random_state)

    kept_positions = []
    for class_label in class_counts.index:
        positions = np.flatnonzero(label_values == class_label)
        if len(positions) > cap:
            positions = rng.choice(positions, size=cap, replace=False)
        kept_positions.append(positions)
    kept_positions = np.concatenate(kept_positions)

    X_resampled = X_train.iloc[kept_positions].reset_index(drop=True)
    y_resampled = labels.iloc[kept_positions].reset_index(drop=True)
    return X_resampled, y_resampled


def apply_resampling(
    X_train,
    y_train,
    method='none',
    random_state=25,
    n_jobs=-1
):
    """
    Rebalance the training data with the requested resampling strategy.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series or array-like
        Training labels, row-for-row aligned with ``X_train``.
    method : str
        'none' (return the inputs untouched), 'natural_breaks' (undersample
        classes larger than the median class size), 'smote', 'random_over',
        'random_under' or 'cluster_centroids'.
    random_state : int
        Seed used by every stochastic strategy, including 'natural_breaks'.
    n_jobs : int
        Kept for backward compatibility. It is no longer forwarded to the
        imbalanced-learn samplers: their ``n_jobs`` keyword has been deprecated
        and removed in recent releases (NOTE(review): confirm against the
        installed version), and passing it would make the sampler constructor
        fail and silently trigger the fallback below.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled, info)``. ``info`` holds the method name and
        the shapes / class distributions before and after resampling. If the
        resampler itself fails, the error is printed, the ORIGINAL data is
        returned and ``info`` contains an ``'error'`` key -- callers should check
        for it before treating the data as resampled.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    known_methods = (
        'none', 'natural_breaks', 'smote',
        'random_over', 'random_under', 'cluster_centroids'
    )
    # Validate outside the try block: an unknown method is a caller error and must
    # not be converted into the best-effort "return the original data" fallback.
    if method not in known_methods:
        raise ValueError(f"Unknown resampling method: {method}")

    if method == 'none':
        return X_train, y_train, {
            'method': None,
            'original_shape': X_train.shape,
            'original_distribution': pd.Series(y_train).value_counts().to_dict()
        }

    try:
        if method == 'natural_breaks':
            X_resampled, y_resampled = _undersample_to_median(
                X_train, y_train, random_state
            )
        else:
            if method == 'smote':
                resampler = SMOTE(random_state=random_state)
            elif method == 'random_over':
                resampler = RandomOverSampler(random_state=random_state)
            elif method == 'random_under':
                resampler = RandomUnderSampler(random_state=random_state)
            else:  # 'cluster_centroids'
                resampler = ClusterCentroids(random_state=random_state)
            X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)

        return X_resampled, y_resampled, {
            'method': method,
            'original_shape': X_train.shape,
            'resampled_shape': X_resampled.shape,
            'original_distribution': pd.Series(y_train).value_counts().to_dict(),
            'resampled_distribution': pd.Series(y_resampled).value_counts().to_dict()
        }

    except Exception as e:
        # Deliberate best effort: report the failure and hand back the untouched data.
        print(f"Error in {method} resampling: {str(e)}")
        return X_train, y_train, {'method': method, 'error': str(e)}

def process_data_combination(
    X_train,
    X_test,
    y_train,
    y_test,
    use_anomaly_detection=False,
    propagate_anomaly_data=False,
    random_state=25
):
    """
    Optionally drop IQR outliers from the training split and replicate minority classes.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames. ``X_test`` is always returned unchanged.
    y_train, y_test : pandas.Series
        Targets, row-for-row aligned with their feature frames. ``y_test`` is
        always returned unchanged.
    use_anomaly_detection : bool
        When True, remove every training row that has at least one feature
        outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` for that feature.
    propagate_anomaly_data : bool
        Only used together with ``use_anomaly_detection``. For each label 1..5
        whose row count is below one fifth of the current training size, append
        whole extra copies of that class until it holds
        ``int(one_fifth / class_count)`` times its original rows.
    random_state : int
        Unused; the processing is deterministic. Kept for interface compatibility.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with the (possibly) processed
        training split. Replicated rows keep their original index labels, so the
        returned training index may contain duplicates.
    """
    if not use_anomaly_detection:
        return X_train, X_test, y_train, y_test

    # Simple IQR-based anomaly detection, evaluated per feature column.
    q1 = X_train.quantile(0.25)
    q3 = X_train.quantile(0.75)
    iqr = q3 - q1
    outside = (X_train < (q1 - 1.5 * iqr)) | (X_train > (q3 + 1.5 * iqr))

    # Positional boolean mask: X and y are filtered row-for-row regardless of
    # their index labels.
    keep = ~np.asarray(outside.any(axis=1))
    X_train_clean = X_train[keep]
    y_train_clean = y_train[keep]

    if propagate_anomaly_data:
        n_classes = 5  # labels are assumed to be 1..5
        for class_label in range(1, n_classes + 1):
            # Recomputed per class against the *current* data, as a positional mask.
            class_rows = np.asarray(y_train_clean == class_label)
            class_count = int(class_rows.sum())
            fair_share = len(y_train_clean) / n_classes

            # Nothing to replicate for an absent class (this also avoids a division
            # by zero), and nothing to do for a class that already has its share.
            if class_count == 0 or class_count >= fair_share:
                continue

            extra_copies = int(fair_share / class_count) - 1
            if extra_copies <= 0:
                continue

            # Snapshot the class rows once, then append the copies in a single
            # concat. Re-masking the growing frame inside a loop would re-select
            # rows that were already replicated and inflate the class far beyond
            # the intended multiple.
            class_X = X_train_clean[class_rows]
            class_y = y_train_clean[class_rows]
            X_train_clean = pd.concat([X_train_clean] + [class_X] * extra_copies)
            y_train_clean = pd.concat([y_train_clean] + [class_y] * extra_copies)

    return X_train_clean, X_test, y_train_clean, y_test

def train_models(
    X_train,
    X_test,
    y_train,
    y_test,
    random_state=25,
    n_jobs=-1
):
    """
    Fit a fixed suite of classifiers and score each one on the test split.

    Estimators are created with library defaults; only ``random_state`` and,
    where the constructor call below passes it, ``n_jobs`` are set explicitly.

    Returns
    -------
    dict
        Maps model name to a dict with 'f1_overall' (weighted F1),
        'f1_per_class', 'f1_min' and 'confusion_matrix'. A model whose
        fit/predict/scoring raises is reported with ``print`` and left out.
    """
    # Label-shifted split for XGBoost (classes moved down by one to start at 0)
    zero_based_split = (X_train, X_test, y_train - 1, y_test - 1)

    # Min-max scaled split for NaiveBayes (keeps the training features non-negative)
    nb_scaler = MinMaxScaler()
    nb_split = (
        nb_scaler.fit_transform(X_train),
        nb_scaler.transform(X_test),
        y_train,
        y_test,
    )

    # Every other model sees the data exactly as passed in
    plain_split = (X_train, X_test, y_train, y_test)

    soft_voter = VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(random_state=random_state)),
            ('rf', RandomForestClassifier(random_state=random_state)),
            ('gb', GradientBoostingClassifier(random_state=random_state))
        ],
        voting='soft',
        n_jobs=n_jobs
    )

    # Insertion order is the order in which models are trained and reported
    models = {
        'logistic': (LogisticRegression(random_state=random_state), plain_split),
        'naive_bayes': (MultinomialNB(), nb_split),
        'decision_tree': (DecisionTreeClassifier(random_state=random_state), plain_split),
        'random_forest': (
            RandomForestClassifier(random_state=random_state, n_jobs=n_jobs),
            plain_split,
        ),
        'balanced_rf': (
            BalancedRandomForestClassifier(random_state=random_state, n_jobs=n_jobs),
            plain_split,
        ),
        'xgboost': (XGBClassifier(random_state=random_state, n_jobs=n_jobs), zero_based_split),
        'gradient_boosting': (GradientBoostingClassifier(random_state=random_state), plain_split),
        'adaboost': (AdaBoostClassifier(random_state=random_state), plain_split),
        'voting': (soft_voter, plain_split),
        'bagging': (BaggingClassifier(random_state=random_state, n_jobs=n_jobs), plain_split),
    }

    results = {}
    for name, (estimator, split) in models.items():
        fit_X, eval_X, fit_y, eval_y = split
        try:
            estimator.fit(fit_X, fit_y)
            predicted = estimator.predict(eval_X)

            if name == 'xgboost':
                # Undo the label shift and score against the untouched test labels
                predicted = predicted + 1
                eval_y = y_test

            per_class_f1 = f1_score(eval_y, predicted, average=None)
            weighted_f1 = f1_score(eval_y, predicted, average='weighted')
            results[name] = {
                'f1_overall': weighted_f1,
                'f1_per_class': per_class_f1.tolist(),
                'f1_min': float(min(per_class_f1)),
                'confusion_matrix': confusion_matrix(eval_y, predicted).tolist()
            }
        except Exception as e:
            print(f"Error training {name}: {str(e)}")

    return results

def run_pipeline(
    X_train,
    X_test,
    y_train,
    y_test,
    X_train_corr,
    X_test_corr,
    y_train_corr,
    y_test_corr,
    random_state=25
):
    """
    Train every model on every dataset / preprocessing / resampling combination.

    For each of the two feature sets ('regular' and 'correlation') the training
    data is processed with ``process_data_combination`` (no anomaly removal,
    anomaly removal, anomaly removal + propagation), resampled with each method
    of ``apply_resampling`` and passed to ``train_models``.

    The best combination is the one with the highest minimum per-class F1;
    ties are broken by the higher weighted F1. A model only qualifies if it
    beats the initial (0, 0) score.

    NOTE(review): the scores used for this selection are computed by
    ``train_models`` on the test split passed in here, so the "best" score is
    selected on the same data it is reported on. Consider a separate
    validation split for the selection.

    Returns
    -------
    dict
        'all_results': combination name -> ``train_models`` output. A
            combination whose resampling failed maps to an empty dict and is
            not trained, because ``apply_resampling`` falls back to the
            un-resampled data in that case and the scores would be mislabeled.
        'best_combination': description and metrics of the winner, or None if
            no model qualified.
        'resampling_info': combination name -> info dict returned by
            ``apply_resampling`` (including any 'error').
    """
    resampling_methods = [
        'none', 'smote', 'random_over', 'random_under',
        'cluster_centroids', 'natural_breaks'
    ]
    # (use_anomaly_detection, propagate_anomaly_data); propagation only applies
    # when anomaly detection is on.
    preprocessing_options = [(False, False), (True, False), (True, True)]
    n_models = 10  # number of models in train_models

    # Named dataset_splits so the imported sklearn `datasets` module is not shadowed.
    dataset_splits = {
        'regular': (X_train, X_test, y_train, y_test),
        'correlation': (X_train_corr, X_test_corr, y_train_corr, y_test_corr)
    }

    total_combinations = (
        len(dataset_splits) * len(preprocessing_options)
        * len(resampling_methods) * n_models
    )

    all_results = {}
    resampling_info = {}
    best_score = (0, 0)  # (minimum per-class F1, weighted F1), compared lexicographically
    best_combination = None

    # Context manager closes the bar even if a step raises.
    with tqdm(total=total_combinations, desc="Training Models") as pbar:
        for dataset_name, (X_tr, X_te, y_tr, y_te) in dataset_splits.items():
            for anomaly, propagate in preprocessing_options:
                # Process data
                (X_train_processed, X_test_processed,
                 y_train_processed, y_test_processed) = process_data_combination(
                    X_tr, X_te, y_tr, y_te,
                    use_anomaly_detection=anomaly,
                    propagate_anomaly_data=propagate,
                    random_state=random_state
                )

                # Try each resampling method
                for resample_method in resampling_methods:
                    combination_name = (
                        f"{dataset_name}_anomaly{anomaly}_propagate{propagate}_{resample_method}"
                    )

                    X_train_resampled, y_train_resampled, resample_info = apply_resampling(
                        X_train_processed,
                        y_train_processed,
                        method=resample_method,
                        random_state=random_state
                    )
                    resampling_info[combination_name] = resample_info

                    if 'error' in resample_info:
                        # The data returned on failure is NOT resampled; training on
                        # it would store results under the wrong method name.
                        all_results[combination_name] = {}
                        pbar.update(n_models)
                        continue

                    # Train all models
                    model_results = train_models(
                        X_train_resampled,
                        X_test_processed,
                        y_train_resampled,
                        y_test_processed,
                        random_state=random_state
                    )
                    all_results[combination_name] = model_results

                    # Update best combination
                    for model_name, metrics in model_results.items():
                        candidate = (metrics['f1_min'], metrics['f1_overall'])
                        if candidate > best_score:
                            best_score = candidate
                            best_combination = {
                                'dataset': dataset_name,
                                'anomaly_detection': anomaly,
                                'data_propagation': propagate,
                                'resampling': resample_method,
                                'model': model_name,
                                'metrics': metrics
                            }

                    # Advance by the full model count so models that failed inside
                    # train_models are accounted for and the bar can reach its total.
                    pbar.update(n_models)

    return {
        'all_results': all_results,
        'best_combination': best_combination,
        'resampling_info': resampling_info
    }

# Example usage: run the full grid on both feature sets loaded earlier in the notebook
results = run_pipeline(
    X_train, X_test, y_train, y_test,
    X_train_correlation, X_test_correlation,
    y_train_correlation, y_test_correlation
)

# Print best combination
best = results['best_combination']
if best is None:
    # run_pipeline leaves this as None when no model beat the initial (0, 0) score,
    # e.g. because every model failed or scored an F1 of 0.
    print("\nNo best combination found: every model failed or scored an F1 of 0.")
else:
    print("\nBest Model Combination:")
    print(f"Dataset: {best['dataset']}")
    print(f"Anomaly Detection: {best['anomaly_detection']}")
    print(f"Data Propagation: {best['data_propagation']}")
    print(f"Resampling Method: {best['resampling']}")
    print(f"Model: {best['model']}")
    print(f"Overall F1: {best['metrics']['f1_overall']:.4f}")
    print(f"Minimum Class F1: {best['metrics']['f1_min']:.4f}")
    print("Per-class F1 scores:", best['metrics']['f1_per_class'])

