In [59]:
import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/amazon-reviews/amazon_review_polarity_csv.tgz
/kaggle/input/amazon-reviews/train.csv
/kaggle/input/amazon-reviews/test.csv
/kaggle/input/d/ducleathome/modules/pre_processor.py
/kaggle/input/d/ducleathome/modules/text_analyzer.py
/kaggle/input/d/ducleathome/modules/random_forest_classifier.py
/kaggle/input/d/ducleathome/modules/config_loader.py
/kaggle/input/d/ducleathome/modules/stopwords_config.py
/kaggle/input/d/ducleathome/modules/logistic_regression_classifier.py
/kaggle/input/d/ducleathome/modules/model_trainer.py
/kaggle/input/d/ducleathome/modules/kaggle_data_loader.py
/kaggle/input/d/ducleathome/modules/accuracy_optimized_config_2.json
/kaggle/input/d/ducleathome/modules/tf_idf_vectorizer.py
/kaggle/input/d/ducleathome/modules/accuracy_optimized_config.json
/kaggle/input/d/ducleathome/modules/balanced_config.json


In [None]:
import sys

# Directories that may hold the project helper modules. The first entry is the
# location reported by the input listing (and used by CONFIG_PATH); the second
# is the path this notebook originally appended, kept as a fallback.
MODULE_DIRS = (
    '/kaggle/input/d/ducleathome/modules',
    '/kaggle/input/modules',
)

# Append each directory once so re-running the cell does not grow sys.path.
for module_dir in MODULE_DIRS:
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [None]:
# Import the project helper modules. They are resolved through sys.path, so the
# directory holding them must have been appended before this cell runs.
try:
    from kaggle_data_loader import KaggleDataLoader
    from config_loader import load_json_config
    from pathlib import Path
    
    # Use Kaggle-optimized preprocessor
    # NOTE(review): the input listing printed by the first cell shows
    # pre_processor.py but no preprocessor_kaggle.py — confirm this module is
    # actually available on the path.
    from preprocessor_kaggle import KagglePreProcessor
    
    from model_trainer import ModelTrainer
    print("✅ All modules imported successfully")
    print("🏆 Using Kaggle-optimized preprocessor with lazy initialization")
except ImportError as e:
    # Report which import failed, then re-raise so the failure is not swallowed.
    print(f"❌ Import error: {e}")
    print("Make sure all required modules are in the path")
    raise

In [None]:
# Paths
# Absolute path of the JSON configuration file. It is passed to
# load_json_config and recorded as 'config_used' in the saved training summary.
CONFIG_PATH = "/kaggle/input/d/ducleathome/modules/accuracy_optimized_config.json"

In [63]:
# Read the JSON configuration; the path is coerced to a plain string first.
config_file = str(CONFIG_PATH)
config = load_json_config(config_file)

✅ Configuration loaded from: /kaggle/input/d/ducleathome/modules/accuracy_optimized_config.json


In [None]:
# Sampling limits handed to the data loader.
#
# These live in their own dict instead of rebinding `config`: the loaded
# configuration is passed to ModelTrainer later, and overwriting it here would
# silently replace it with just these two keys. It also keeps this cell
# idempotent when re-run.
# To drive the limits from the loaded file instead, read them from
# config.get("dataset_config", {}) with these values as defaults.
loader_config = {
    "train_size": 1000,
    "test_size": 100,
}

# Build the loader and materialise the train/test frames used by later cells.
data_loader = KaggleDataLoader(loader_config)
train_df, test_df = data_loader.prepare_dataframes()

Downloading Kaggle Amazon reviews dataset...
KaggleHub download path: /kaggle/input/amazon-reviews

=== LOADING DATA ===
Successfully loaded data:
   - Train: (3599999, 3)
   - Test: (399999, 3)

=== DATA VALIDATION ===
Initial Train data info:
   - Shape: (3599999, 3)
Initial Test data info:
   - Shape: (399999, 3)

Initial label distribution:
   Training: {1: 1800000, 2: 1799999}
   Test: {1: 200000, 2: 199999}
All labels are within expected range [1, 2]
Initial data validation completed

=== APPLYING SIZE LIMITS ===
Size limits applied:
   Training: 3599999 -> 200000 samples
   Test: 399999 -> 20000 samples

=== DATA COMBINATION ===
Analyzing content availability...
   Training - Empty titles: 18, Empty texts: 0, Both empty: 0
   Test - Empty titles: 3, Empty texts: 0, Both empty: 0
Combining title and text columns...
Data combination completed:
   Training: (200000, 2)
   Test: (20000, 2)
   Average input length - Train: 440.0, Test: 440.7


In [None]:
# =============================================================================
# 📥 SETUP OPTIMIZED STOPWORDS FOR SENTIMENT ANALYSIS
# =============================================================================

# NOTE(review): stopwords_provider_kaggle does not appear in the input listing
# printed by the first cell — confirm the module is available on sys.path.
from stopwords_provider_kaggle import StopwordsProviderKaggle

# Create the stopwords provider. Presumably it handles any downloads and caching
# itself — verify against the StopwordsProviderKaggle implementation.
kaggle_stopwords_manager = StopwordsProviderKaggle()

# Fetch the stopword collection; a later cell hands it to the preprocessor.
custom_stopwords = kaggle_stopwords_manager.get_stopwords()

# Report how many stopwords were returned.
print(f"✅ Stopwords ready! Total: {len(custom_stopwords)} words")
print("🚀 Optimized for Amazon reviews sentiment analysis")

In [None]:
# =============================================================================
# 🔧 INITIALIZE KAGGLE PREPROCESSOR
# =============================================================================

# Construct the preprocessor without arguments, then pass it the stopword
# collection (custom_stopwords comes from the stopwords setup cell) through
# initialize(). Both steps must run before any preprocessing call below.
preprocessor = KagglePreProcessor()
preprocessor.initialize(custom_stopwords)

In [None]:
# =============================================================================
# 📊 SMART PREPROCESSING WITH SAVE/LOAD FUNCTIONALITY
# =============================================================================

import pickle
import os

PREPROCESSED_DATA_PATH = "/kaggle/working/preprocessed_data.pkl"


def _preprocess_frame(df):
    """Run the text-preprocessing pipeline on one dataframe.

    Args:
        df: Frame with an ``input`` column holding the raw text.

    Returns:
        The frame produced by the preprocessor's ``clean_data`` and
        ``remove_duplicates`` steps, with an added ``normalized_input`` column
        computed by ``preprocess_for_sentiment`` from ``input``.
    """
    df = preprocessor.clean_data(df)
    df = preprocessor.remove_duplicates(df)
    return df.assign(
        normalized_input=df["input"].apply(preprocessor.preprocess_for_sentiment)
    )


preprocessing_needed = True

# Try to load existing preprocessed data first
if os.path.exists(PREPROCESSED_DATA_PATH):
    try:
        print("📂 Loading existing preprocessed data...")
        # SECURITY: pickle.load can execute arbitrary code. Only load cache
        # files that this notebook wrote itself.
        with open(PREPROCESSED_DATA_PATH, 'rb') as f:
            preprocessed_data = pickle.load(f)

        cached_train_df = preprocessed_data['train_df']
        cached_test_df = preprocessed_data['test_df']

        # Reject caches that lack the column the later cells read, so a stale
        # or partial cache triggers a fresh run instead of a later KeyError.
        for frame_name, frame in (('train_df', cached_train_df),
                                  ('test_df', cached_test_df)):
            if 'normalized_input' not in frame.columns:
                raise ValueError(
                    f"cached {frame_name} has no 'normalized_input' column"
                )

        # Rebind both frames together, only after the cache is validated, so a
        # failed load never leaves one cached frame paired with one raw frame.
        train_df, test_df = cached_train_df, cached_test_df

        print(f"✅ Preprocessed data loaded successfully!")
        print(f"   Train shape: {train_df.shape}")
        print(f"   Test shape: {test_df.shape}")

        # Skip fresh preprocessing since we loaded existing data
        preprocessing_needed = False

    except Exception as e:
        # Best-effort cache: any load problem falls back to recomputing.
        print(f"❌ Error loading preprocessed data: {e}")
        print("🔄 Will run preprocessing from scratch...")
        preprocessing_needed = True
else:
    print("🔄 No existing preprocessed data found or loading disabled")

# Preprocess BOTH frames with the same pipeline. No other cell adds the
# normalized_input column to train_df, yet validation and training read it.
if preprocessing_needed:
    print("🔄 Running train and test data preprocessing...")

    train_df = _preprocess_frame(train_df)
    test_df = _preprocess_frame(test_df)

    print("✅ Train and test data preprocessing completed!")

    # Save preprocessed data for future use. A failed save is reported but not
    # fatal: the in-memory frames are still valid for the rest of the run.
    try:
        preprocessed_data = {
            'train_df': train_df,
            'test_df': test_df,
            'preprocessing_info': {
                'train_shape': train_df.shape,
                'test_shape': test_df.shape,
                'columns': train_df.columns.tolist()
            }
        }

        with open(PREPROCESSED_DATA_PATH, 'wb') as f:
            pickle.dump(preprocessed_data, f)

        print(f"💾 Preprocessed data saved to: {PREPROCESSED_DATA_PATH}")
        print("   Next runs can load this data to save time!")

    except Exception as e:
        print(f"❌ Error saving preprocessed data: {e}")

print("=" * 70)

In [None]:
# Sanity-check the preprocessed frames before training
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Train data columns: {train_df.columns.tolist()}")
print(f"Missing values in train: {train_df.isna().sum().sum()}")
print(f"Missing values in test: {test_df.isna().sum().sum()}")

# Label overview for the training split
train_labels = train_df['label']
print(f"Unique labels in train: {train_labels.unique()}")
print(f"Label distribution in train:\n{train_labels.value_counts()}")

# Show the first three rows, raw text next to its normalized form
print("\nSample preprocessed texts:")
for position in range(3):
    sample = train_df.iloc[position]
    print(f"Original: {sample['input'][:100]}...")
    print(f"Processed: {sample['normalized_input'][:100]}...")
    print(f"Label: {sample['label']}")
    print("---")

In [None]:
# =============================================================================
# 🎯 KAGGLE RUNNER CONFIGURATION
# =============================================================================

# Select which models to train (set to True to train, False to skip).
# Each key is read by the matching training cell, and again by the summary cell
# to tell a skipped model apart from a failed one.
MODEL_SELECTION = {
    'logistic_regression': True,    # Fast baseline model
    'random_forest': True,         # Ensemble method
    'gradient_boosting': True      # Advanced boosting
}

In [None]:
# Initialize ModelTrainer with preprocessed data.
# Both frames must already carry the 'normalized_input' and 'label' columns
# named below.
# NOTE(review): `config` is passed through as whatever the name is bound to when
# this cell runs — verify it is the full configuration loaded from CONFIG_PATH
# and not a reduced dict.
trainer = ModelTrainer(
    train_df=train_df,
    test_df=test_df,
    text_column='normalized_input',  # Use preprocessed text
    label_column='label',
    config=config
)

print("✅ ModelTrainer initialized successfully")

In [None]:
# =============================================================================
# 🤖 MODEL TRAINING - LOGISTIC REGRESSION
# =============================================================================

# Stays None when the model is skipped or training fails; the summary cell
# uses that to tell the outcomes apart.
logistic_results = None

if not MODEL_SELECTION['logistic_regression']:
    print("⏭️  Logistic Regression training SKIPPED (disabled in config)")
    print("=" * 50)
else:
    print("🚀 Training Logistic Regression Model...")
    print("=" * 50)

    try:
        logistic_results = trainer.train_logistic_regression()

        # Metric lookups stay inside the try: a missing key is reported as a
        # training failure and the result is discarded.
        print("✅ Logistic Regression Training Completed!")
        print(f"📊 Training Accuracy: {logistic_results['train_accuracy']:.4f}")
        print(f"📊 Validation Accuracy: {logistic_results['test_accuracy']:.4f}")
        print(f"📊 F1 Score: {logistic_results['f1_score']:.4f}")
        print(f"💾 Model saved to: {logistic_results['model_path']}")

    except Exception as exc:
        print(f"❌ Error training Logistic Regression: {exc}")
        logistic_results = None

    print("\n" + "=" * 50)

In [None]:
# =============================================================================
# 🌲 MODEL TRAINING - RANDOM FOREST
# =============================================================================

# Stays None when the model is skipped or training fails; the summary cell
# uses that to tell the outcomes apart.
forest_results = None

if not MODEL_SELECTION['random_forest']:
    print("⏭️  Random Forest training SKIPPED (disabled in config)")
    print("=" * 50)
else:
    print("🚀 Training Random Forest Model...")
    print("=" * 50)

    try:
        forest_results = trainer.train_random_forest()

        # Metric lookups stay inside the try: a missing key is reported as a
        # training failure and the result is discarded.
        print("✅ Random Forest Training Completed!")
        print(f"📊 Training Accuracy: {forest_results['train_accuracy']:.4f}")
        print(f"📊 Validation Accuracy: {forest_results['test_accuracy']:.4f}")
        print(f"📊 F1 Score: {forest_results['f1_score']:.4f}")
        print(f"💾 Model saved to: {forest_results['model_path']}")

    except Exception as exc:
        print(f"❌ Error training Random Forest: {exc}")
        forest_results = None

    print("\n" + "=" * 50)

In [None]:
# =============================================================================
# ⚡ MODEL TRAINING - GRADIENT BOOSTING
# =============================================================================

# Stays None when the model is skipped or training fails; the summary cell
# uses that to tell the outcomes apart.
gb_results = None

if not MODEL_SELECTION['gradient_boosting']:
    print("⏭️  Gradient Boosting training SKIPPED (disabled in config)")
    print("=" * 50)
else:
    print("🚀 Training Gradient Boosting Model...")
    print("=" * 50)

    try:
        gb_results = trainer.train_gradient_boosting()

        # Metric lookups stay inside the try: a missing key is reported as a
        # training failure and the result is discarded.
        print("✅ Gradient Boosting Training Completed!")
        print(f"📊 Training Accuracy: {gb_results['train_accuracy']:.4f}")
        print(f"📊 Validation Accuracy: {gb_results['test_accuracy']:.4f}")
        print(f"📊 F1 Score: {gb_results['f1_score']:.4f}")
        print(f"💾 Model saved to: {gb_results['model_path']}")

    except Exception as exc:
        print(f"❌ Error training Gradient Boosting: {exc}")
        gb_results = None

    print("\n" + "=" * 50)

In [None]:
# =============================================================================
# 📈 TRAINING SUMMARY & RESULTS
# =============================================================================

print("📈 TRAINING SUMMARY")
print("=" * 60)

results_summary = []
trained_models = 0
skipped_models = 0

# (display name, MODEL_SELECTION key, results dict or None), in report order
model_runs = [
    ('Logistic Regression', 'logistic_regression', logistic_results),
    ('Random Forest', 'random_forest', forest_results),
    ('Gradient Boosting', 'gradient_boosting', gb_results),
]

# Classify every model as trained (has results), failed (selected but no
# results) or skipped (not selected).
for display_name, selection_key, run_results in model_runs:
    if run_results:
        results_summary.append({
            'Model': display_name,
            'Train Accuracy': run_results['train_accuracy'],
            'Test Accuracy': run_results['test_accuracy'],
            'F1 Score': run_results['f1_score'],
            'Model Path': run_results['model_path']
        })
        trained_models += 1
    elif MODEL_SELECTION[selection_key]:
        print(f"❌ {display_name}: Failed to train")
    else:
        skipped_models += 1

failed_models = len(model_runs) - trained_models - skipped_models

print("\n📊 Training Statistics:")
print(f"   ✅ Successfully trained: {trained_models} models")
print(f"   ⏭️  Skipped: {skipped_models} models")
print(f"   ❌ Failed: {failed_models} models")

if not results_summary:
    print("\n❌ No models were successfully trained!")
    print("💡 Check your model selection configuration and try again.")
else:
    # Tabulate the collected results; summary_df is reused by the save cell.
    import pandas as pd
    summary_df = pd.DataFrame(results_summary)
    print("\n📊 Model Performance Comparison:")
    print(summary_df.to_string(index=False, float_format='%.4f'))

    if len(results_summary) == 1:
        only_result = results_summary[0]
        print(f"\n✅ Single model trained: {only_result['Model']}")
        print(f"   Test Accuracy: {only_result['Test Accuracy']:.4f}")
        print(f"   F1 Score: {only_result['F1 Score']:.4f}")
    else:
        # Several models: pick the row with the highest test accuracy.
        best_model = summary_df.loc[summary_df['Test Accuracy'].idxmax()]
        print(f"\n🏆 Best Model: {best_model['Model']}")
        print(f"   Test Accuracy: {best_model['Test Accuracy']:.4f}")
        print(f"   F1 Score: {best_model['F1 Score']:.4f}")

print("\n✅ Training pipeline completed!")
print("=" * 60)

In [None]:
# Optional: persist the training summary under /kaggle/working (Kaggle output).
# Nothing is written when no model produced results.
if results_summary:
    import json
    from datetime import datetime

    # Snapshot of the data the models were trained and evaluated on
    dataset_info = {
        'train_size': len(train_df),
        'test_size': len(test_df),
        'train_distribution': train_df['label'].value_counts().to_dict(),
    }

    training_summary = {
        'timestamp': datetime.now().isoformat(),
        'dataset_info': dataset_info,
        'models': results_summary,
        'config_used': CONFIG_PATH
    }

    # JSON copy; default=str stringifies any value json cannot encode natively
    summary_file = '/kaggle/working/training_summary.json'
    with open(summary_file, 'w') as f:
        json.dump(training_summary, f, indent=2, default=str)

    print(f"📄 Training summary saved to: {summary_file}")

    # CSV copy of the comparison table (summary_df comes from the summary cell)
    csv_file = '/kaggle/working/model_comparison.csv'
    summary_df.to_csv(csv_file, index=False)
    print(f"📊 Model comparison saved to: {csv_file}")

print("\n🎉 All tasks completed successfully!")