## IMDB Data
https://www.kaggle.com/datasets/mahmoudshaheen1134/imdp-data

In [1]:
import pandas as pd
from pycaret.classification import *
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

In [2]:
def prepare_data(file_path, sample_size=None):
    """
    Load the IMDB review CSV and turn it into a TF-IDF feature table.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing a ``review`` column (text) and a ``sentiment``
        column (label).
    sample_size : int, optional
        When truthy, keep a random subset of at most this many rows
        (seeded with ``random_state=42``). ``None`` or ``0`` keeps every row.
        Values larger than the dataset are clamped to the dataset size.

    Returns
    -------
    pandas.DataFrame
        One column per TF-IDF term (at most 5000) followed by ``sentiment``.

    Raises
    ------
    KeyError
        If ``review`` or ``sentiment`` is missing from the CSV.
    ValueError
        If no row has both a review and a sentiment.
    """
    # Download required NLTK data
    # NOTE(review): the vectorizer below is configured with
    # stop_words='english' and nothing in this function reads the NLTK
    # corpus -- confirm whether this download is still needed.
    nltk.download('stopwords')

    # Read the dataset
    df = pd.read_csv(file_path)

    # Fail early with a readable message instead of deep inside the vectorizer
    missing_columns = [col for col in ('review', 'sentiment') if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{file_path!r} is missing required column(s): {missing_columns}"
        )

    # A row without text cannot be vectorised and a row without a label cannot
    # be trained on; resetting the index keeps the positional concat below aligned.
    df = df.dropna(subset=['review', 'sentiment']).reset_index(drop=True)
    if df.empty:
        raise ValueError(
            f"{file_path!r} has no rows with both a review and a sentiment"
        )

    # Take a sample if specified (useful for quick testing)
    if sample_size:
        # DataFrame.sample raises when asked for more rows than exist
        n_rows = min(sample_size, len(df))
        df = df.sample(n=n_rows, random_state=42).reset_index(drop=True)

    # Create TF-IDF features with stopword removal
    tfidf = TfidfVectorizer(
        stop_words='english',
        max_features=5000,
        strip_accents='unicode',
        lowercase=True
    )

    # Transform the text data
    text_features = tfidf.fit_transform(df['review'])

    # Convert to a dense DataFrame, one column per vocabulary term
    feature_names = tfidf.get_feature_names_out()
    text_df = pd.DataFrame(text_features.toarray(), columns=feature_names)

    # Combine with target variable (both frames share a fresh 0..n-1 index)
    final_df = pd.concat([text_df, df['sentiment']], axis=1)

    return final_df

In [3]:
def setup_pycaret_classifier(data):
    """
    Start a PyCaret classification experiment on ``data``.

    The experiment predicts the ``sentiment`` column, uses a fixed
    ``session_id`` of 42 and leaves PyCaret's preprocessing switched on.
    Whatever ``setup`` returns is handed back to the caller unchanged.
    """
    # Collected in one place so the experiment configuration is easy to scan
    experiment_options = {
        'data': data,
        'target': 'sentiment',
        'session_id': 42,
        'preprocess': True,
    }
    return setup(**experiment_options)

In [4]:
def get_best_models(top_n=3):
    """
    Train PyCaret's candidate models and return the best ``top_n`` by F1.

    Parameters
    ----------
    top_n : int, default 3
        Number of top-ranked models to keep.

    Returns
    -------
    list
        The selected models, best first. Always a list, even for
        ``top_n=1``, so callers can iterate over the result.
    """
    print(f"\nTraining and selecting top {top_n} models...")
    best_models = compare_models(
        n_select=top_n,
        sort='F1' # Using F1 as initial metric since it balances Precision and Recall
    )

    # compare_models hands back a bare estimator (not a list) when only one
    # model is selected; normalise so downstream loops keep working.
    if not isinstance(best_models, list):
        best_models = [best_models]

    return best_models

In [5]:
def tune_models(models):
    """
    Fine-tune each model with PyCaret's ``tune_model`` in two passes.

    Each model is first tuned with ``optimize='Recall'``; the result of that
    pass is then tuned again with ``optimize='Prec.'``. Both passes run 10
    Optuna iterations with ``choose_better=True``. After the second pass the
    score grid from ``pull()`` is printed.

    Parameters
    ----------
    models : list or single estimator
        Models to tune. A single estimator (anything with a ``fit``
        attribute) is treated as a one-element list.

    Returns
    -------
    list
        The model returned by the precision pass for each input, in input order.
    """
    # A lone estimator (e.g. from compare_models with n_select=1) is accepted too
    if hasattr(models, 'fit'):
        models = [models]

    print("\nTuning individual models...")
    tuned_models = []

    for i, model in enumerate(models, 1):
        print(f"\nTuning model {i}...")

        # Show which metric columns the most recent score grid exposes
        print("\nAvailable metrics before tuning:")
        initial_metrics = pull()
        print(initial_metrics.columns.tolist())

        # First tune for Recall
        print("\nTuning for Recall...")
        recall_tuned = tune_model(
            model,
            n_iter=10,
            optimize='Recall',
            search_library='optuna',
            choose_better=True
        )

        # Then tune for Precision, starting from the recall-tuned model
        print("\nTuning for Precision...")
        precision_tuned = tune_model(
            recall_tuned,
            n_iter=10,
            optimize='Prec.',  # column name PyCaret uses for precision in its score grids
            search_library='optuna',
            choose_better=True
        )

        print(f"\nModel {i} tuning completed")

        # Print final metrics
        print("\nFinal metrics after tuning:")
        eval_metrics = pull()
        if len(eval_metrics.index) > 0:
            # The tuning grid holds one row per CV fold plus summary rows;
            # row 0 is only the first fold, so report the 'Mean' row when it
            # exists and fall back to the first row otherwise.
            if 'Mean' in eval_metrics.index:
                summary_row = 'Mean'
            else:
                summary_row = eval_metrics.index[0]

            for metric in eval_metrics.columns:
                value = eval_metrics.loc[summary_row, metric]
                try:
                    print(f"{metric}: {value:.4f}")
                except (TypeError, ValueError):
                    # Non-numeric cell (e.g. a label): print it unformatted
                    print(f"{metric}: {value}")

        tuned_models.append(precision_tuned)

    return tuned_models

In [6]:
def create_ensemble(tuned_models):
    """
    Create a stacking ensemble from the tuned models.

    Parameters
    ----------
    tuned_models : list
        Base estimators to stack; must not be empty.

    Returns
    -------
    The stacked model returned by PyCaret's ``stack_models``.

    Raises
    ------
    ValueError
        If ``tuned_models`` is empty.
    """
    if not tuned_models:
        raise ValueError("create_ensemble needs at least one tuned model to stack")

    print("\nCreating stacking ensemble...")

    # Create stacking ensemble
    stacker = stack_models(
        estimator_list=tuned_models,
        # stack_models expects an estimator object here, not a model ID
        # string; None selects its default Logistic Regression meta-model.
        meta_model=None,
        # restack=True feeds the meta-model the original features in
        # addition to the base models' predictions.
        restack=True
    )

    return stacker

In [7]:
def evaluate_model(model, model_name="Model"):
    """
    Evaluate model performance with focus on Precision and Recall.

    Runs ``predict_model`` on the model, draws a confusion matrix and a
    precision-recall curve, and prints the first row of the metrics grid
    returned by ``pull()``.

    NOTE(review): this name is the same as PyCaret's own ``evaluate_model``,
    which the star import at the top of the notebook appears to bring in;
    once this cell runs, this definition is the one in effect.

    Parameters
    ----------
    model
        Trained model accepted by ``predict_model`` / ``plot_model``.
    model_name : str, default "Model"
        Label used in the progress message.

    Returns
    -------
    Whatever ``predict_model(model)`` returns.
    """
    print(f"\nEvaluating {model_name}...")

    # Get model predictions
    predictions = predict_model(model)

    # Grab the metrics grid immediately, before any other PyCaret call has a
    # chance to replace the "last displayed" table that pull() reads.
    metrics = None
    metrics_error = None
    try:
        metrics = pull()
    except Exception as e:
        metrics_error = e

    # Create evaluation plots; each one is attempted on its own so that a
    # failing confusion matrix does not also suppress the PR curve.
    for plot_name in ('confusion_matrix', 'pr'):
        try:
            plot_model(model, plot=plot_name)
        except Exception as e:
            print(f"Warning: Could not create '{plot_name}' plot ({e})")

    # Print detailed metrics
    if metrics_error is not None:
        print(f"Error getting metrics: {str(metrics_error)}")
        return predictions

    try:
        print("\nDetailed Metrics:")
        for metric in metrics.columns:
            value = metrics.loc[0, metric]
            try:
                print(f"{metric}: {value:.4f}")
            except (TypeError, ValueError):
                # Non-numeric cell such as the model name
                print(f"{metric}: {value}")
    except Exception as e:
        print(f"Error getting metrics: {str(e)}")

    return predictions

In [8]:
def _is_real_number(value):
    """Return True for real-valued scalars, excluding booleans and NaN."""
    import numbers

    if isinstance(value, bool) or not isinstance(value, numbers.Real):
        return False
    return value == value  # NaN is the only real number unequal to itself


def _print_metric_row(metrics):
    """Print each column of the first row of ``metrics``, numbers to 4 decimals."""
    first_row = metrics.iloc[0]
    for col in metrics.columns:
        value = first_row[col]
        try:
            print(f"{col}: {value:.4f}")
        except (TypeError, ValueError):
            # Non-numeric cell such as the model name
            print(f"{col}: {value}")


def compare_performances(tuned_models, ensemble_model):
    """
    Compare performance of individual models against ensemble.

    Runs ``predict_model`` on every tuned model and on the ensemble, prints
    the first row of each metrics grid, then prints, for every numeric
    metric, the highest value reached by any individual model next to the
    ensemble's value and the relative improvement in percent.

    Note that "best individual" is taken per metric, so different metrics
    may be compared against different models.

    Parameters
    ----------
    tuned_models : list
        Individual models to score. May be empty, in which case the final
        comparison section prints nothing.
    ensemble_model
        Ensemble to score.

    Returns
    -------
    Whatever ``predict_model(ensemble_model)`` returns.
    """
    print("\n=== PERFORMANCE COMPARISON ===")
    print("\nIndividual Models Performance:")

    # Store metrics for comparison
    all_metrics = []

    # Evaluate individual models
    for i, model in enumerate(tuned_models, 1):
        print(f"\nModel {i}:")
        predict_model(model)  # called for its metrics grid, read via pull()
        metrics = pull()
        all_metrics.append(metrics.iloc[0].to_dict())
        _print_metric_row(metrics)

    # Evaluate ensemble
    print("\nEnsemble Model Performance:")
    ensemble_predictions = predict_model(ensemble_model)
    ensemble_metrics = pull()
    _print_metric_row(ensemble_metrics)

    # Compare with best individual model
    print("\n=== COMPARISON WITH BEST INDIVIDUAL MODEL ===")
    ensemble_row = ensemble_metrics.iloc[0]
    for col in ensemble_metrics.columns:
        individual_values = [
            m[col] for m in all_metrics
            if col in m and _is_real_number(m[col])
        ]
        ensemble_value = ensemble_row[col]

        # Text columns (e.g. the model name) and metrics with no numeric
        # individual score have nothing to compare; taking max() of an empty
        # sequence would raise.
        if not individual_values or not _is_real_number(ensemble_value):
            continue

        best_individual = max(individual_values)

        print(f"{col}:")
        print(f"Best Individual: {best_individual:.4f}")
        print(f"Ensemble: {ensemble_value:.4f}")
        if best_individual != 0:
            improvement = ((ensemble_value - best_individual) / best_individual) * 100
            print(f"Improvement: {improvement:.2f}%")
        else:
            # A relative change from zero is undefined
            print("Improvement: n/a (best individual score is 0)")
        print()

    return ensemble_predictions


In [None]:
# Run configuration: edit these two values instead of swapping commented-out lines.
DATA_PATH = 'IMDB Dataset.csv'
# Set to an integer (e.g. 100) for a quick test on a random subset of reviews;
# None uses every row in the file.
SAMPLE_SIZE = None

# Load and prepare data
print("Loading and preparing data...")
data = prepare_data(DATA_PATH, sample_size=SAMPLE_SIZE)

# Setup the classification experiment
print("Setting up PyCaret classifier...")
clf_setup = setup_pycaret_classifier(data)

# Get best individual models
best_models = get_best_models(top_n=3)

Loading and preparing data...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erafpac\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Setting up PyCaret classifier...


Unnamed: 0,Description,Value
0,Session id,42
1,Target,sentiment
2,Target type,Binary
3,Target mapping,"negative: 0, positive: 1"
4,Original data shape,"(50000, 5001)"
5,Transformed data shape,"(50000, 5001)"
6,Transformed train set shape,"(35000, 5001)"
7,Transformed test set shape,"(15000, 5001)"
8,Numeric features,5000
9,Preprocess,True



Training and selecting top 3 models...


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.884,0.9526,0.884,0.8843,0.884,0.768,0.7683,12.351
svm,SVM - Linear Kernel,0.8785,0.9522,0.8785,0.8811,0.8783,0.757,0.7596,211.175
ridge,Ridge Classifier,0.8754,0.9467,0.8754,0.8757,0.8753,0.7507,0.7511,7.842
lda,Linear Discriminant Analysis,0.8641,0.9378,0.8641,0.8643,0.8641,0.7282,0.7284,58.427
et,Extra Trees Classifier,0.8628,0.936,0.8628,0.8631,0.8627,0.7255,0.7259,46.765
lightgbm,Light Gradient Boosting Machine,0.8554,0.9345,0.8554,0.8558,0.8554,0.7109,0.7112,20.564
rf,Random Forest Classifier,0.8456,0.924,0.8456,0.8458,0.8455,0.6911,0.6914,25.391
gbc,Gradient Boosting Classifier,0.8054,0.8918,0.8054,0.81,0.8047,0.6109,0.6154,75.085
nb,Naive Bayes,0.7994,0.8442,0.7994,0.7995,0.7994,0.5988,0.5989,7.357
ada,Ada Boost Classifier,0.7988,0.8803,0.7988,0.8014,0.7983,0.5975,0.6002,21.862


Processing:   0%|          | 0/67 [00:00<?, ?it/s]

In [None]:
# Baseline: report each selected model's scores before any tuning is applied
print("\nInitial model performances:")
for i, model in enumerate(best_models, start=1):
    print(f"\nModel {i} - Initial Performance:")
    evaluate_model(model, f"Initial Model {i}")

In [None]:
# Tune each selected model; tune_models returns the tuned models in the same
# order as best_models.
tuned_models = tune_models(best_models)

In [None]:
# After tuning: report each tuned model's scores with the same evaluation routine
print("\nTuned model performances:")
for i, model in enumerate(tuned_models, start=1):
    print(f"\nModel {i} - Tuned Performance:")
    evaluate_model(model, f"Tuned Model {i}")


In [None]:
# Stack the tuned models, then run the ensemble through the same evaluation
# routine used for the individual models
stacking_ensemble = create_ensemble(tuned_models)
ensemble_predictions = evaluate_model(stacking_ensemble, "Stacking Ensemble")

# Persist the ensemble under a fixed name so it can be loaded again later
print("\nSaving ensemble model...")
save_model(stacking_ensemble, model_name='sentiment_classifier_ensemble')