In [15]:
import gc
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.sparse
import seaborn as sns
import traceback
import warnings
import xgboost as xgb

from copy import deepcopy
from imblearn.combine import SMOTETomek
from imblearn.ensemble import BalancedRandomForestClassifier 
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from joblib import Parallel, delayed
from lightgbm import LGBMClassifier
from scipy.sparse import csr_matrix, csc_matrix
from sklearn import ensemble, linear_model, preprocessing, neighbors, datasets
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier, VotingClassifier,
                            StackingClassifier, RandomForestClassifier, GradientBoostingClassifier)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, accuracy_score, confusion_matrix, classification_report,
                           roc_curve, auc, balanced_accuracy_score, roc_auc_score)
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [16]:
#Loading regular, gpt and combined datasets
def scale_dataset(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    return X_train_scaled, X_test_scaled

def process_gpt_analysis(train_texts, test_texts):
    vectorizer = TfidfVectorizer(max_features=100) 
    X_train_tfidf = vectorizer.fit_transform(train_texts)
    X_test_tfidf = vectorizer.transform(test_texts)
    feature_names = [f'gpt_feature_{i}' for i in range(X_train_tfidf.shape[1])]
    X_train_df = pd.DataFrame(X_train_tfidf.toarray(), columns=feature_names, index=train_texts.index)
    X_test_df = pd.DataFrame(X_test_tfidf.toarray(), columns=feature_names, index=test_texts.index)
    return X_train_df, X_test_df

def load_dataset(base_path, is_target=False):
    if is_target:
        return pd.read_csv(base_path, index_col=0).squeeze()
    return pd.read_csv(base_path, index_col=0)

def load_all_datasets():
    try:
        print("Loading datasets from data directory")
        X_train = load_dataset('data/X_train.csv')
        X_test = load_dataset('data/X_test.csv')
        y_train = load_dataset('data/y_train.csv', is_target=True)
        y_test = load_dataset('data/y_test.csv', is_target=True)
        X_train_scaled, X_test_scaled = scale_dataset(X_train, X_test)
        datasets = {'regular': (X_train_scaled, X_test_scaled, y_train, y_test)}
        
        try:
            csv_params = {'engine': 'python', 'quoting': 1, 'escapechar': '\\', 'on_bad_lines': 'warn', 
                  'encoding': 'utf-8', 'delimiter': ',', 'quotechar': '"', 'doublequote': True}
            X_train_gpt = pd.read_csv('data/train_trustpilot_3_enhanced_100_X.csv', **csv_params)
            X_test_gpt = pd.read_csv('data/test_trustpilot_3_enhanced_100_X.csv', **csv_params)
            print("GPT datasets loaded successfully")
            train_indices = X_train_gpt.index
            test_indices = X_test_gpt.index
            X_train_subset = X_train_scaled.iloc[train_indices]
            X_test_subset = X_test_scaled.iloc[test_indices]
            y_train_subset = y_train.iloc[train_indices]
            y_test_subset = y_test.iloc[test_indices]
            gpt_train = X_train_gpt['gpt_analysis'].fillna("")
            gpt_test = X_test_gpt['gpt_analysis'].fillna("")
            gpt_train_processed, gpt_test_processed = process_gpt_analysis(gpt_train, gpt_test)
            X_train_with_gpt = pd.concat([X_train_subset, gpt_train_processed], axis=1)
            X_test_with_gpt = pd.concat([X_test_subset, gpt_test_processed], axis=1)
            datasets.update({'gpt_analysis': (gpt_train_processed, gpt_test_processed, y_train_subset, y_test_subset), 'combined_with_gpt': (X_train_with_gpt, X_test_with_gpt, y_train_subset, y_test_subset)})
        except Exception as e:
            print(f"Error processing GPT datasets: {str(e)}")
            print(f"Error type: {type(e)}") 
            print(f"Full traceback: {traceback.format_exc()}") 
        print("Dataset shapes:")
        for name, (X_tr, X_te, y_tr, y_te) in datasets.items():
            print(f"{name}:")
            print(f"  X_train: {X_tr.shape}")
            print(f"  X_test: {X_te.shape}")
            print(f"  y_train: {y_tr.shape}")
            print(f"  y_test: {y_te.shape}\n")
            
        return datasets
    except Exception as e:
        print(f"Error loading datasets: {str(e)}")
        raise

if __name__ == "__main__":
    datasets = load_all_datasets()
    X_train, X_test, y_train, y_test = datasets['regular']
    print("Regular dataset shape:", X_train.shape)
    if 'gpt_analysis' in datasets:
        X_train_gpt, X_test_gpt, y_train_gpt, y_test_gpt = datasets['gpt_analysis']
        X_train_combined, X_test_combined, y_train_combined, y_test_combined = datasets['combined_with_gpt']
        print("GPT dataset shape:", X_train_gpt.shape)
        print("Combined dataset shape:", X_train_combined.shape)

datasets = load_all_datasets()
X_train, X_test, y_train, y_test = datasets['regular']
print("Regular dataset shape:", X_train.shape)
if 'gpt_analysis' in datasets:
    X_train_gpt, X_test_gpt, y_train_gpt, y_test_gpt = datasets['gpt_analysis']
    X_train_combined, X_test_combined, y_train_combined, y_test_combined = datasets['combined_with_gpt']
    print("GPT dataset shape:", X_train_gpt.shape)
    print("Combined dataset shape:", X_train_combined.shape)

Loading datasets from data directory
GPT datasets loaded successfully
Dataset shapes:
regular:
  X_train: (256039, 8)
  X_test: (53180, 8)
  y_train: (256039,)
  y_test: (53180,)

gpt_analysis:
  X_train: (1001, 100)
  X_test: (1011, 100)
  y_train: (1001,)
  y_test: (1011,)

combined_with_gpt:
  X_train: (2000, 108)
  X_test: (2017, 108)
  y_train: (1001,)
  y_test: (1011,)

Regular dataset shape: (256039, 8)
GPT dataset shape: (1001, 100)
Combined dataset shape: (2000, 108)
Loading datasets from data directory
GPT datasets loaded successfully
Dataset shapes:
regular:
  X_train: (256039, 8)
  X_test: (53180, 8)
  y_train: (256039,)
  y_test: (53180,)

gpt_analysis:
  X_train: (1001, 100)
  X_test: (1011, 100)
  y_train: (1001,)
  y_test: (1011,)

combined_with_gpt:
  X_train: (2000, 108)
  X_test: (2017, 108)
  y_train: (1001,)
  y_test: (1011,)

Regular dataset shape: (256039, 8)
GPT dataset shape: (1001, 100)
Combined dataset shape: (2000, 108)


In [17]:
""" Model Training & Evaluation with rebalancing:
Regular dataset + Logistic Regression
Regular dataset + Random Forest
Regular dataset + Gradient Boosting
GPT Analysis dataset + Logistic Regression
GPT Analysis dataset + Random Forest
GPT Analysis dataset + Gradient Boosting
Combined dataset + Logistic Regression
Combined dataset + Random Forest
Combined dataset + Gradient Boosting
"""

def apply_sampling(X_train, y_train, method='none'):
    if method == 'none':
        return X_train, y_train
    elif method == 'ros':
        sampler = RandomOverSampler(random_state=25)
    elif method == 'rus':
        sampler = RandomUnderSampler(random_state=25)
    elif method == 'smote':
        sampler = SMOTE(random_state=25)
    
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

def train_evaluate_model(model_info, X_train, X_test, y_train, y_test, sampling_method='none', current_dataset='regular'):
    name, model = model_info
    try:
        X_train_sampled, y_train_sampled = apply_sampling(X_train, y_train, sampling_method)
        if len(np.unique(y_train_sampled)) < 2 or len(np.unique(y_test)) < 2:
            return None
            
        model.fit(X_train_sampled, y_train_sampled)
        y_pred = model.predict(X_test)
        f1_per_class = f1_score(y_test, y_pred, average=None)
        accuracy = accuracy_score(y_test, y_pred)
        f1_overall = f1_score(y_test, y_pred, average='weighted')

        dataset_name = 'Regular dataset' if current_dataset == 'regular' else 'GPT dataset' if current_dataset == 'gpt_analysis' else 'Combined dataset'
        print(f"\nResults for {model.__class__.__name__} on {dataset_name} with {sampling_method} sampling:")
        print("Accuracy: {:.4f}".format(accuracy))
        print("Overall F1: {:.4f}".format(f1_overall))
        print("Min Class F1: {:.4f}".format(min(f1_per_class)))
        print("F1 scores per class: {" + ", ".join([f"{k}: {v:.4f}" for k, v in enumerate(f1_per_class)]) + "}")
        
        return {'model': name, 'sampling': sampling_method, 'f1_overall': f1_overall, 'f1_per_class': f1_per_class.tolist(), 'f1_min': float(min(f1_per_class))}
    except Exception as e:
        print(f"Error in {name} with {sampling_method}: {str(e)}")
        return None

def process_dataset_parallel(X_train, X_test, y_train, y_test, name, n_jobs=-1):
    models = [('logistic', LogisticRegression(random_state=25, max_iter=5000, solver='saga')),
        ('random_forest', RandomForestClassifier(random_state=25, n_estimators=20)),
        ('gradient_boosting', GradientBoostingClassifier(random_state=25, n_estimators=20))]
    sampling_methods = ['none', 'ros', 'rus', 'smote']
    all_combinations = [(model_info, sampling) 
                       for model_info in models 
                       for sampling in sampling_methods]
    results = Parallel(n_jobs=n_jobs)(delayed(train_evaluate_model)(model_info, X_train, X_test, y_train, y_test, sampling, name)
        for model_info, sampling in all_combinations)
    return [r for r in results if r]

def run_pipeline(datasets, n_jobs=-1):
    results = []
    print("\nAvailable datasets:", list(datasets.keys()))
    with tqdm(total=len(datasets), desc="Processing datasets") as pbar:
        for name, (X_train, X_test, y_train, y_test) in datasets.items():
            try:
                dataset_results = process_dataset_parallel(X_train, X_test, y_train, y_test, name, n_jobs=n_jobs)
                for result in dataset_results:
                    result['dataset'] = name
                results.extend(dataset_results)
            except Exception as e:
                print(f"Error processing {name}: {str(e)}")
            pbar.update(1)
    
    print("\nAverage F1 scores by dataset and sampling method:")
    for name in datasets.keys():
        for sampling in ['none', 'ros', 'rus', 'smote']:
            dataset_sampling_results = [
                r for r in results 
                if r['dataset'] == name and r['sampling'] == sampling]
            if dataset_sampling_results:
                avg_f1 = np.mean([r['f1_overall'] for r in dataset_sampling_results])
                avg_min_f1 = np.mean([r['f1_min'] for r in dataset_sampling_results])
                print(f"{name} with {sampling}: Overall F1 = {avg_f1:.4f}, Min Class F1 = {avg_min_f1:.4f}")
    
    if not results:
        return {'results': [], 'best': None}
    
    best_result = max(results, key=lambda x: (x['f1_min'], x['f1_overall']))
    return {'results': results, 'best': best_result}

if __name__ == "__main__":
    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.metrics import f1_score, accuracy_score
    from joblib import Parallel, delayed
    from tqdm import tqdm
    from imblearn.over_sampling import RandomOverSampler, SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    
    results = run_pipeline(datasets, n_jobs=-1)
    
    if results['best']:
        best = results['best']
        print(f"\nBest Result:")
        print(f"Dataset: {best['dataset']}")
        print(f"Model: {best['model']}")
        print(f"Sampling Method: {best['sampling']}")
        print(f"Overall F1: {best['f1_overall']:.4f}")
        print(f"Min Class F1: {best['f1_min']:.4f}")
        print(f"Per-class F1: {best['f1_per_class']}")


Available datasets: ['regular', 'gpt_analysis', 'combined_with_gpt']


Processing datasets:   0%|          | 0/3 [00:00<?, ?it/s]


Results for LogisticRegression on Regular dataset with rus sampling:
Accuracy: 0.5104
Overall F1: 0.5426
Min Class F1: 0.1866
F1 scores per class: {0: 0.5876, 1: 0.2186, 2: 0.1866, 3: 0.2498, 4: 0.7119}

Results for LogisticRegression on Regular dataset with none sampling:
Accuracy: 0.6288
Overall F1: 0.5205
Min Class F1: 0.0000
F1 scores per class: {0: 0.6299, 1: 0.0000, 2: 0.0023, 3: 0.0000, 4: 0.7805}

Results for RandomForestClassifier on Regular dataset with rus sampling:
Accuracy: 0.4801
Overall F1: 0.5235
Min Class F1: 0.1943
F1 scores per class: {0: 0.5990, 1: 0.1943, 2: 0.2280, 3: 0.2382, 4: 0.6674}

Results for LogisticRegression on Regular dataset with ros sampling:
Accuracy: 0.5114
Overall F1: 0.5433
Min Class F1: 0.1860
F1 scores per class: {0: 0.5879, 1: 0.2170, 2: 0.1860, 3: 0.2491, 4: 0.7136}

Results for RandomForestClassifier on Regular dataset with none sampling:
Accuracy: 0.6245
Overall F1: 0.5832
Min Class F1: 0.1072
F1 scores per class: {0: 0.6574, 1: 0.1072, 2: 

Processing datasets:  33%|███▎      | 1/3 [02:07<04:14, 127.45s/it]


Results for GradientBoostingClassifier on Regular dataset with smote sampling:
Accuracy: 0.5306
Overall F1: 0.5647
Min Class F1: 0.2021
F1 scores per class: {0: 0.6308, 1: 0.2157, 2: 0.2021, 3: 0.2600, 4: 0.7331}

Results for LogisticRegression on GPT dataset with none sampling:
Accuracy: 0.5035
Overall F1: 0.3419
Min Class F1: 0.0000
F1 scores per class: {0: 0.0000, 1: 0.0000, 2: 0.0000, 3: 0.0284, 4: 0.6689}

Results for LogisticRegression on GPT dataset with rus sampling:
Accuracy: 0.1978
Overall F1: 0.2214
Min Class F1: 0.0913
F1 scores per class: {0: 0.1626, 1: 0.0913, 2: 0.1292, 3: 0.2158, 4: 0.2803}

Results for RandomForestClassifier on GPT dataset with rus sampling:
Accuracy: 0.1701
Overall F1: 0.1869
Min Class F1: 0.0870
F1 scores per class: {0: 0.2005, 1: 0.0870, 2: 0.1107, 3: 0.1596, 4: 0.2175}

Results for RandomForestClassifier on GPT dataset with none sampling:
Accuracy: 0.4857
Overall F1: 0.3671
Min Class F1: 0.0000
F1 scores per class: {0: 0.1205, 1: 0.0000, 2: 0.0182

Processing datasets: 100%|██████████| 3/3 [02:09<00:00, 43.31s/it] 


Results for GradientBoostingClassifier on GPT dataset with smote sampling:
Accuracy: 0.3709
Overall F1: 0.3443
Min Class F1: 0.0187
F1 scores per class: {0: 0.2067, 1: 0.0187, 2: 0.0865, 3: 0.0249, 4: 0.5765}
Error in logistic with none: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
Error in logistic with smote: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For su




In [18]:
#Additional model combinations
def apply_sampling(X_train, y_train, method='none'):
    if method == 'none':
        return X_train, y_train
    elif method == 'ros':
        sampler = RandomOverSampler(random_state=25)
    elif method == 'rus':
        sampler = RandomUnderSampler(random_state=25)
    elif method == 'smote':
        sampler = SMOTE(random_state=25)
    return sampler.fit_resample(X_train, y_train)

def get_models():
    base_models = {'logistic': LogisticRegression(random_state=25, max_iter=5000, solver='saga'),
        'random_forest': RandomForestClassifier(random_state=25, n_estimators=20),
        'gradient_boosting': GradientBoostingClassifier(random_state=25, n_estimators=20),
        'svc': SVC(random_state=25), 'naive_bayes': GaussianNB()}
    models = [(f"{name}_basic", model) for name, model in base_models.items()]
    for name, model in base_models.items():
        models.append((f"{name}_bagging", BaggingClassifier(estimator=model, random_state=25, n_estimators=10)))
    for name, model in base_models.items():
        if name not in ['svc', 'naive_bayes']:
            models.append((f"{name}_adaboost", AdaBoostClassifier(estimator=model, random_state=25, n_estimators=10)))
    for name, model in base_models.items():
        models.extend([(f"{name}_ovr", OneVsRestClassifier(model)), (f"{name}_ovo", OneVsOneClassifier(model))])
    return models

def train_evaluate_model(model_info, X_train, X_test, y_train, y_test, sampling_method='none', current_dataset='regular'):
    name, model = model_info
    try:
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        X_train_sampled, y_train_sampled = apply_sampling(X_train_scaled, y_train, sampling_method)
        if len(np.unique(y_train_sampled)) < 2 or len(np.unique(y_test)) < 2:
            return None
        model_copy = deepcopy(model)
        model_copy.fit(X_train_sampled, y_train_sampled)
        y_pred = model_copy.predict(X_test_scaled)
        f1_per_class = f1_score(y_test, y_pred, average=None)
        accuracy = accuracy_score(y_test, y_pred)
        f1_overall = f1_score(y_test, y_pred, average='weighted')
        dataset_name = 'Regular dataset' if current_dataset == 'regular' else 'GPT dataset' if current_dataset == 'gpt_analysis' else 'Combined dataset'
        print(f"\nResults for {name} on {dataset_name} with {sampling_method} sampling:")
        print("Accuracy: {:.4f}".format(accuracy))
        print("Overall F1: {:.4f}".format(f1_overall))
        print("Min Class F1: {:.4f}".format(min(f1_per_class)))
        print("F1 scores per class: {" + ", ".join([f"{k}: {v:.4f}" for k, v in enumerate(f1_per_class)]) + "}")
        return {'model': name, 'sampling': sampling_method, 'f1_overall': f1_overall,
            'f1_per_class': f1_per_class.tolist(), 'f1_min': float(min(f1_per_class))}
    except Exception as e:
        print(f"Error in {name} with {sampling_method}: {str(e)}")
        return None

def process_dataset_parallel(X_train, X_test, y_train, y_test, name, n_jobs=-1):
    models = get_models()
    sampling_methods = ['none', 'ros', 'rus', 'smote']
    all_combinations = [(model_info, sampling) 
                       for model_info in models 
                       for sampling in sampling_methods]
    
    results = Parallel(n_jobs=n_jobs)(
        delayed(train_evaluate_model)(model_info, X_train, X_test, y_train, y_test, sampling, name)
        for model_info, sampling in all_combinations)
    return [r for r in results if r]

def run_pipeline(datasets, n_jobs=-1):
    results = []
    print("\nAvailable datasets:", list(datasets.keys()))
    with tqdm(total=len(datasets), desc="Processing datasets") as pbar:
        for name, (X_train, X_test, y_train, y_test) in datasets.items():
            try:
                dataset_results = process_dataset_parallel(X_train, X_test, y_train, y_test, name, n_jobs=n_jobs)
                for result in dataset_results:
                    result['dataset'] = name
                results.extend(dataset_results)
            except Exception as e:
                print(f"Error processing {name}: {str(e)}")
            pbar.update(1)

    print("\nResults Summary:")
    for name in datasets.keys():
        print(f"\nDataset: {name}")
        for sampling in ['none', 'ros', 'rus', 'smote']:
            dataset_sampling_results = [r for r in results 
                if r['dataset'] == name and r['sampling'] == sampling]
            if dataset_sampling_results:
                avg_f1 = np.mean([r['f1_overall'] for r in dataset_sampling_results])
                avg_min_f1 = np.mean([r['f1_min'] for r in dataset_sampling_results])
                print(f"  {sampling:8} sampling: Overall F1 = {avg_f1:.4f}, Min Class F1 = {avg_min_f1:.4f}")
    if not results:
        return {'results': [], 'best': None}
    best_result = max(results, key=lambda x: (x['f1_min'], x['f1_overall']))
    return {'results': results, 'best': best_result}

if __name__ == "__main__":
    results = run_pipeline(datasets, n_jobs=-1)
    if results['best']:
        best = results['best']
        print(f"\nBest Result:")
        print(f"Dataset: {best['dataset']}")
        print(f"Model: {best['model']}")
        print(f"Sampling Method: {best['sampling']}")
        print(f"Overall F1: {best['f1_overall']:.4f}")
        print(f"Min Class F1: {best['f1_min']:.4f}")
        print(f"Per-class F1: {[f'{f1:.4f}' for f1 in best['f1_per_class']]}")


Available datasets: ['regular', 'gpt_analysis', 'combined_with_gpt']


Processing datasets:   0%|          | 0/3 [00:00<?, ?it/s]


Results for logistic_basic on Regular dataset with rus sampling:
Accuracy: 0.5104
Overall F1: 0.5426
Min Class F1: 0.1866
F1 scores per class: {0: 0.5876, 1: 0.2186, 2: 0.1866, 3: 0.2498, 4: 0.7119}

Results for logistic_basic on Regular dataset with none sampling:
Accuracy: 0.6288
Overall F1: 0.5205
Min Class F1: 0.0000
F1 scores per class: {0: 0.6299, 1: 0.0000, 2: 0.0023, 3: 0.0000, 4: 0.7805}

Results for random_forest_basic on Regular dataset with rus sampling:
Accuracy: 0.4801
Overall F1: 0.5235
Min Class F1: 0.1943
F1 scores per class: {0: 0.5990, 1: 0.1943, 2: 0.2280, 3: 0.2382, 4: 0.6674}

Results for logistic_basic on Regular dataset with ros sampling:
Accuracy: 0.5114
Overall F1: 0.5433
Min Class F1: 0.1860
F1 scores per class: {0: 0.5879, 1: 0.2170, 2: 0.1860, 3: 0.2491, 4: 0.7136}

Results for logistic_basic on Regular dataset with smote sampling:
Accuracy: 0.5111
Overall F1: 0.5435
Min Class F1: 0.1843
F1 scores per class: {0: 0.5892, 1: 0.2197, 2: 0.1843, 3: 0.2513, 4: 

Processing datasets:   0%|          | 0/3 [17:53<?, ?it/s]


KeyboardInterrupt: 