In [None]:
! git clone https://github.com/Sakshamtomar2004/ML_OPS_EMOTION_PIPELINE.git


In [None]:
%pip install -q dagshub 'mlflow>=2,<3'
pip install tensorflow librosa tqdm


In [1]:
import os

# Artifact directories created relative to the current working directory.
folders = [
    "artifacts/data/raw",
    "artifacts/data/processed",
    "artifacts/data/clips_5sec",
    "artifacts/features"
]

# exist_ok=True makes this cell safe to run more than once.
for artifact_dir in folders:
    os.makedirs(artifact_dir, exist_ok=True)

print("Folder structure created successfully.")


Folder structure created successfully.


In [None]:
pip install -r /content/ML_OPS_EMOTION_PIPELINE/requirements.txt

In [None]:
# Quiet install of opendatasets — presumably needed by the project's data
# ingestion to download the Kaggle dataset; TODO confirm it is not already
# covered by requirements.txt.
%pip install -q opendatasets

In [None]:
import yaml
import os

config_path = "/content/ML_OPS_EMOTION_PIPELINE/configs/config.yaml"

# Print the config currently on disk inside a ```yaml fence so it is easy to copy.
# To change it, edit the `modified_config_content` string defined in a later cell
# and run the cell that writes that string back to this path.
try:
    with open(config_path, 'r') as config_file:
        config_content = config_file.read()
    for output_line in ("Current content of config.yaml:", "```yaml", config_content, "```"):
        print(output_line)
except FileNotFoundError:
    print(f"Error: config.yaml not found at {config_path}")
except Exception as err:
    # Any other failure is reported, not raised; this cell is informational only.
    print(f"An error occurred while reading the file: {err}")

In [None]:
# Replacement text for configs/config.yaml: paste the edited YAML between the
# triple quotes. Everything inside the quotes is written to disk as-is.
# NOTE(review): patience and reduce_lr_patience (60) are larger than epochs (50);
# if they count epochs for early stopping / LR reduction, neither can ever
# trigger — confirm this is intended.
# NOTE(review): sample_rate is 44000; 44100 is the more common audio rate —
# confirm this value is intended.
modified_config_content = """
# Data Configuration
data:
  raw_data_url: "https://www.kaggle.com/datasets/shivendratomar/emotions-audio-clips"
  raw_data_dir: "artifacts/data/raw"
  processed_data_dir: "artifacts/data/processed"
  clips_data_dir: "artifacts/data/clips_5sec"
  feat_dir: "artifacts/features"
  data_dir: "artifacts"

# Audio Processing Parameters
audio:
  sample_rate: 44000
  clip_duration: 5

# Feature Extraction Parameters
features:
  n_mfcc: 40
  n_mels: 128
  n_fft: 2048
  hop_length: 512
  max_feature_bytes: 2147483648  # 2GB

# Model Parameters
model:
  model_type: "Transfer_Learning"
  validation_split: 0.2
  batch_size: 32
  epochs: 50
  learning_rate: 0.001
  dropout_rate: 0.2
  patience: 60
  reduce_lr_patience: 60
  min_lr: 0.000005

# MLflow Configuration
mlflow:
  experiment_name: "emotion_detection"
  run_name: "emotion_classification_run"

# Logging
logging:
  level: "INFO"

# Random State
random_state: 42

"""

In [None]:
import os

import yaml

config_path = "/content/ML_OPS_EMOTION_PIPELINE/configs/config.yaml"

# Validate the pasted text BEFORE opening the file: mode 'w' truncates the file
# immediately, so a malformed paste would otherwise destroy the existing config.
# yaml.safe_load raises yaml.YAMLError on syntax errors.
parsed_config = yaml.safe_load(modified_config_content)
if not isinstance(parsed_config, dict):
    raise ValueError(
        "modified_config_content must be a YAML mapping; config.yaml was left unchanged"
    )

try:
    with open(config_path, 'w') as f:
        f.write(modified_config_content)
except OSError as e:
    # Report the failure and re-raise so that "Run All" stops here instead of
    # continuing into the pipeline with a stale configuration.
    print(f"An error occurred while writing to the file: {e}")
    raise
else:
    print(f"Successfully updated {config_path}")

In [None]:
import argparse
import hashlib
import json
import os
import sys
from typing import Optional

import mlflow
import pandas as pd
import yaml

# Add the project directory to the Python path so the `src` package can be imported.
# The membership check keeps sys.path free of duplicate entries when this cell is re-run.
project_dir = "/content/ML_OPS_EMOTION_PIPELINE"
if project_dir not in sys.path:
    sys.path.append(project_dir)


from src.logger import logging
from src.exception import MyException
from src.data_ingestion import DataIngestion
from src.feature_extraction import FeatureExtraction
from src.model_training import ModelTraining


# ---------------- Helper Functions ---------------- #

def get_config_hash(config_section: dict) -> str:
    """
    Return the MD5 hex digest of a config section.

    The section is serialized to JSON with sorted keys before hashing, so two
    dicts with the same contents give the same digest regardless of key order.
    Comparing digests is how a parameter change is detected.
    """
    canonical_bytes = json.dumps(config_section, sort_keys=True).encode()
    digest = hashlib.md5()
    digest.update(canonical_bytes)
    return digest.hexdigest()


def save_hash(hash_val: str, hash_file: str):
    """
    Write ``hash_val`` to ``hash_file``, replacing any existing content.

    The parent directory is created when it does not exist, so the function
    does not depend on the caller having prepared it.

    Args:
        hash_val: Hash string to store.
        hash_file: Destination file path.
    """
    parent_dir = os.path.dirname(hash_file)
    if parent_dir:  # empty when hash_file is a bare filename in the current directory
        os.makedirs(parent_dir, exist_ok=True)
    with open(hash_file, "w") as f:
        f.write(hash_val)


def load_hash(hash_file: str) -> Optional[str]:
    """
    Load a hash string from a file.

    Args:
        hash_file: Path of the file to read.

    Returns:
        The file content with surrounding whitespace stripped, or ``None``
        when the file does not exist.
    """
    # Open directly instead of checking os.path.exists first: this avoids a
    # crash if the file disappears between the check and the open.
    try:
        with open(hash_file, "r") as f:
            return f.read().strip()
    except FileNotFoundError:
        return None


# ---------------- Pipeline ---------------- #

def run_pipeline(config_path, force=False):
    """
    Run the complete MLflow pipeline inside a single MLflow run.

    Steps:
        1. Data ingestion - skipped when ``clips_metadata.csv`` already exists
           in the configured clips directory.
        2. Feature extraction - skipped when the feature chunk file exists and
           the stored hash of the ``features`` config section matches the
           current one.
        3. Model training - always runs.

    Args:
        config_path: Path to the YAML configuration file.
        force: When True, steps 1 and 2 are re-run even if their cached
            artifacts exist.

    Raises:
        MyException: Wraps any exception raised while the pipeline runs.
    """
    try:
        # Load config; the file is closed before any tracking setup starts.
        with open(config_path, 'r') as f:
            config = yaml.safe_load(f)

        # Point MLflow tracking at the DagsHub-hosted server.
        import dagshub
        dagshub.init(repo_owner='tomarsaksham2006', repo_name='ML_OPS_EMOTION_PIPELINE', mlflow=True)
        mlflow.set_tracking_uri("https://dagshub.com/tomarsaksham2006/ML_OPS_EMOTION_PIPELINE.mlflow/")

        # Set MLflow experiment
        mlflow.set_experiment(config['mlflow']['experiment_name'])

        with mlflow.start_run(run_name=config['mlflow']['run_name']):
            logging.info("Starting emotion detection pipeline")

            # ---------------- STEP 1: DATA INGESTION ---------------- #
            clips_csv = os.path.join(config['data']['clips_data_dir'], "clips_metadata.csv")

            if os.path.exists(clips_csv) and not force:
                logging.info(f"Step 1 Skipped: Clips metadata found at {clips_csv}")
                df_clips = pd.read_csv(clips_csv)
            else:
                logging.info("Step 1: Data Ingestion started...")
                data_ingestion = DataIngestion(config_path)
                data_ingestion.download_data()
                df_clips = data_ingestion.clip_5sec_segments()
                logging.info("Step 1: Data Ingestion completed")

            # ---------------- STEP 2: FEATURE EXTRACTION ---------------- #
            # NOTE(review): this path is hard-coded; if the config's data.feat_dir
            # is meant to control it, the two can drift apart - confirm.
            feat_dir = "artifacts/features"
            os.makedirs(feat_dir, exist_ok=True)

            chunks_pkl = os.path.join(feat_dir, "features_chunks.pkl")
            hash_file = os.path.join(feat_dir, "features_config_hash.txt")

            # Compute current config hash for feature extraction
            current_hash = get_config_hash(config['features'])
            saved_hash = load_hash(hash_file)

            trainer = ModelTraining(config_path)  # Needed for feature loading

            features_cached = (
                os.path.exists(chunks_pkl) and saved_hash == current_hash and not force
            )
            if features_cached:
                logging.info("Step 2 Skipped: Features already extracted and config unchanged.")
            else:
                logging.info("Step 2: Feature Extraction started...")
                feature_extractor = FeatureExtraction(config_path)
                # The returned chunks are not used here; step 3 obtains the
                # features through trainer.load_features().
                feature_extractor.extract_all_features(df_clips)
                # Save the hash only after extraction finished, so a failed
                # extraction is retried on the next run.
                save_hash(current_hash, hash_file)
                logging.info("Step 2: Feature Extraction completed")

            # ---------------- STEP 3: MODEL TRAINING ---------------- #
            logging.info("Step 3: Model Training started...")

            # Single load point for both the cached and the freshly extracted case.
            features, labels_cat, labels_subcat = trainer.load_features()

            # Prepare data
            prepared_data = trainer.prepare_data(features, labels_cat, labels_subcat)

            # Build model
            input_shape = prepared_data['X_train'].shape[1:]
            num_categories = prepared_data['y_train_cat'].shape[1]
            num_subcategories = prepared_data['y_train_subcat'].shape[1]

            model = trainer.build_model(input_shape, num_categories, num_subcategories)

            # Train model (the returned training history is not used here)
            model, _ = trainer.train_model(model, prepared_data)

            # Save model
            trainer.save_model(model, prepared_data)

            logging.info("Step 3: Model Training completed")
            logging.info("Pipeline completed successfully!")

    except Exception as e:
        logging.error(f"Pipeline failed: {e}")
        raise MyException(e, sys) from e


if __name__ == "__main__":
    # Command-line interface: config location plus an opt-in switch that
    # ignores cached artifacts.
    cli = argparse.ArgumentParser()
    cli.add_argument("--config_path", type=str, default="/content/ML_OPS_EMOTION_PIPELINE/configs/config.yaml")
    cli.add_argument("--force", action="store_true", help="Force recomputation even if artifacts exist")

    # In Colab, parse an empty list so Colab's own arguments are not parsed and
    # the defaults apply. Passing None makes argparse read sys.argv[1:] as usual.
    # NOTE(review): only Colab is special-cased; other notebook front-ends may
    # need the same treatment - confirm.
    running_in_colab = 'google.colab' in sys.modules
    args = cli.parse_args([] if running_in_colab else None)

    run_pipeline(args.config_path, args.force)