# Complete Data Pipeline

This notebook demonstrates a complete end-to-end pipeline:
1. **HFDataIndexer**: Lazy loading of Hugging Face datasets
2. **TextToAudioPipeline**: Convert text to MP3 audio files
3. **AudioPermutationPipeline**: Apply audio transformations (pitch, speed, reverb, background-audio overlay)
4. **End-to-End Examples**: Chain all pipelines together

## Install Dependencies

In [66]:
%pip install datasets gtts librosa soundfile pydub scipy numpy -q

## Import Required Libraries

In [67]:
import os
import sys
import librosa
import soundfile as sf
import numpy as np
from scipy.signal import fftconvolve
from pydub import AudioSegment
import shutil
from pathlib import Path
from typing import List, Optional
from datasets import load_dataset
from gtts import gTTS

---
## 1. HFDataIndexer Class
Enables lazy indexing over Hugging Face datasets

In [68]:
class HFDataIndexer:
    """
    Lazy indexer for Hugging Face datasets.

    The dataset is opened with ``streaming=True`` and rows are reached with
    ``skip()`` / ``take()``, so only the requested rows are iterated.

    Attributes set by the constructor:
        - path: dataset identifier passed to ``load_dataset``
        - split: split name passed to ``load_dataset``
        - dataset: the streaming dataset object
        - columns: column names, read from the first row of the stream
    """

    def __init__(self, path, name=None, split='train'):
        """
        Open the dataset and discover its column names.

        Args:
            - path: dataset identifier passed to ``load_dataset``
            - name: optional configuration name passed to ``load_dataset``
            - split: split to stream (default 'train')

        Raises:
            RuntimeError: if the dataset cannot be opened or yields no rows.
                The original error is attached as ``__cause__``.
        """
        self.path = path
        self.split = split
        try:
            self.dataset = load_dataset(path, name, split=split, streaming=True)

            # Peek at one row to get column names without loading the rest
            first_row = next(iter(self.dataset))
            self.columns = list(first_row.keys())
        except StopIteration as e:
            raise RuntimeError(
                f"Connection Error: dataset '{path}' ({split}) returned no rows."
            ) from e
        except Exception as e:
            # Raise instead of calling sys.exit(): exiting the interpreter from a
            # constructor hides the cause and is hostile inside a notebook kernel.
            raise RuntimeError(f"Connection Error: {e}") from e

        print(f"--- Connected to {path} ({split}) ---")
        print(f"Columns available: {self.columns}")

    def get_rows(self, start=0, count=1, target_columns=None):
        """
        Stream a specific range of rows. Returns a generator to
        minimize memory footprint.

        Args:
            - start: number of rows to skip before the first yielded row
            - count: maximum number of rows to yield
            - target_columns: column names to keep; all known columns when
              omitted or empty

        Yields:
            One dict per row mapping each requested column to its value
            (``None`` for a column the row does not contain).
        """
        cols = target_columns if target_columns else self.columns

        # "lazy" indexing in streaming mode uses skip() and take()
        subset = self.dataset.skip(start).take(count)

        for row in subset:
            yield {col: row.get(col) for col in cols}

    def get_cell(self, row_idx, col_name):
        """
        Grab a single specific 'cell'.

        Args:
            - row_idx: number of rows to skip before the wanted row
            - col_name: column to read

        Returns:
            The cell value, or ``None`` when the column is unknown or the
            stream ends before ``row_idx``.
        """
        if col_name not in self.columns:
            return None

        # skip to row_idx, take 1 row, get the first item
        row = next(iter(self.dataset.skip(row_idx).take(1)), None)

        # Compare against None explicitly so an empty row is still a row.
        return row.get(col_name) if row is not None else None

print("✓ HFDataIndexer class loaded successfully!")

✓ HFDataIndexer class loaded successfully!


---
## 2. TextToAudioPipeline Class
Converts text into MP3 audio files using Google Text-to-Speech

In [69]:
class TextToAudioPipeline:
    """Pipeline for converting text to MP3 audio files.

    Every successful conversion writes two files into ``output_dir``: the
    audio (``<stem>.mp3``) and a transcript of the input (``<stem>.txt``).
    """

    def __init__(self, output_dir: str = "./outputs/"):
        """
        Args:
            - output_dir: directory that receives the audio and transcript files;
              created if it does not exist
        """
        self.output_dir = Path(output_dir)
        self._init_output_dir()

    def _init_output_dir(self):
        """Ensures the output directory exists."""
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _get_unique_filename(self, base_name: str) -> str:
        """
        Checks for file existence and returns a unique filename stem.
        Example: 'test' -> 'test_v1' if 'test.mp3' or 'test.txt' exists.

        Any extension on ``base_name`` is dropped before the check.
        """
        stem = Path(base_name).stem
        candidate_stem = stem
        counter = 1

        while (self.output_dir / f"{candidate_stem}.mp3").exists() or \
              (self.output_dir / f"{candidate_stem}.txt").exists():
            candidate_stem = f"{stem}_v{counter}"
            counter += 1

        return candidate_stem

    def convert(self, input_text: str, output_filename: str = "file_save") -> Optional[Path]:
        """
        Converts text to an MP3 and saves a transcript copy.

        Args:
            - input_text: text to synthesize
            - output_filename: preferred file stem; a '_vN' suffix is added when
              files with that stem already exist

        Returns:
            Path of the generated MP3, or ``None`` when the input is not a
            non-blank string or the conversion fails.
        """
        # Upstream lookups can hand over None; report it instead of raising
        # AttributeError on .strip().
        if not isinstance(input_text, str):
            print(f"Error: Input text must be a string, got {type(input_text).__name__}.")
            return None
        if not input_text.strip():
            print("Error: Input text is empty.")
            return None

        unique_stem = self._get_unique_filename(output_filename)
        audio_path = self.output_dir / f"{unique_stem}.mp3"
        text_path = self.output_dir / f"{unique_stem}.txt"

        print(f"Processing: '{unique_stem}'...")
        try:
            # Generate and save audio
            tts = gTTS(text=input_text, lang='en', slow=False)
            tts.save(str(audio_path))

            # Save transcript
            text_path.write_text(input_text, encoding="utf-8")
        except Exception as e:
            # Neither path existed before this call (the stem is unique), so
            # anything present now is a partial result of the failed attempt.
            for partial in (audio_path, text_path):
                try:
                    partial.unlink()
                except OSError:
                    # Best effort: the file was never created or cannot be removed.
                    pass
            print(f"Failed to process {unique_stem}: {e}")
            return None

        print(f"\t[✓] Transcript: {text_path.absolute()}")
        print(f"\t[✓] Audio:      {audio_path.absolute()}\n")

        return audio_path

    def batch_convert(self, input_texts: List[str], file_names: Optional[List[str]] = None) -> List[Path]:
        """
        Batch convert list of texts to audio files.

        Args:
            - input_texts: texts to convert, in order
            - file_names: optional file stems, one per text; when omitted every
              text uses the default stem of ``convert``

        Returns:
            Paths of the conversions that succeeded (failed items are dropped).

        Raises:
            AssertionError: if ``file_names`` is given with a different length
                than ``input_texts``.
        """
        if file_names:
            assert len(input_texts) == len(file_names), "Texts and filenames must have same length"
            outcomes = [self.convert(text, name) for text, name in zip(input_texts, file_names)]
        else:
            outcomes = [self.convert(text) for text in input_texts]

        return [path for path in outcomes if path is not None]

print("✓ TextToAudioPipeline class loaded successfully!")

✓ TextToAudioPipeline class loaded successfully!


---
## 3. AudioPermutationPipeline Class
Apply various audio transformations: pitch shifting, speed changes, reverb, and background-audio overlay

In [70]:
class AudioPermutationPipeline:
    """Pipeline for applying audio transformations and effects.

    Each public method copies the input file into ``<output_dir>/<input stem>/``
    and writes the generated MP3 variants next to that copy.
    """

    def __init__(self, output_dir: str = "./outputs/"):
        """
        Args:
            - output_dir: root directory under which one subdirectory per input
              file is created
        """
        self.output_dir = Path(output_dir)
        self._init_output_dir()


    def _init_output_dir(self):
        """Ensures the output directory exists."""
        self.output_dir.mkdir(parents=True, exist_ok=True)


    @staticmethod
    def _shift_pitch(data, sr, n_steps=0):
        """
        Shifts pitch (without changing duration).

        Args:
            - positive 'n_steps' increases pitch
            - negative 'n_steps' decreases pitch
        """
        return librosa.effects.pitch_shift(y=data, sr=sr, n_steps=n_steps)


    @staticmethod
    def _stretch_time(data, rate=1.0):
        """
        Changes speed (without changing pitch).

        Args:
            - 'rate' > 1.0 increases speed
            - 'rate' < 1.0 decreases speed
        """
        return librosa.effects.time_stretch(y=data, rate=rate)


    @staticmethod
    def _apply_reverb(data, sr, room_size=0.5, wet_dry=0.3):
        """
        Adds reverb effect to audio.

        The impulse response is exponentially decaying random noise, so the
        result differs between calls unless NumPy's global random state is
        seeded.

        Args:
            - 'data': audio samples as a 1-D numpy array
            - 'sr': sample rate of 'data'
            - 'room_size' controls the reverb length (0.0 to 1.0)
            - 'wet_dry' mixes between original and reverb (0.0 = dry, 1.0 = wet)

        Returns:
            numpy array of the same length as 'data', peak-normalized to 1.0
            (left as is when the mix is completely silent)
        """
        reverb_duration = room_size * 2.0
        ir_length = int(reverb_duration * sr)

        if ir_length < 1:
            # No impulse response to convolve with: there is no reverb tail.
            return np.array(data, copy=True)

        t = np.linspace(0, reverb_duration, ir_length)
        decay = np.exp(-3.0 * t / reverb_duration)
        impulse = decay * np.random.randn(ir_length) * 0.1

        # Take the leading part of the full convolution so the tail starts
        # at each sound. mode='same' returns the centred part, which shifts
        # the tail ahead of the sound that caused it and cuts off its end.
        reverb_signal = fftconvolve(data, impulse, mode='full')[:len(data)]
        output = (1 - wet_dry) * data + wet_dry * reverb_signal

        # Normalize to a peak of 1.0, but never divide by zero on silence.
        peak = np.max(np.abs(output)) if output.size else 0.0
        if peak > 0:
            output = output / peak

        return output


    @staticmethod
    def _overlay_audio(data, sr, overlay_path, volume_ratio=1.0):
        """
        Overlays a secondary audio file onto the original audio.

        Args:
            - data: original audio data (numpy array)
            - sr: sample rate of original audio
            - overlay_path: path to the secondary audio file to overlay
            - volume_ratio: volume of overlay relative to original (1.0 = equal volume, 0.5 = half volume)

        Returns:
            numpy array with overlaid audio

        Raises:
            ValueError: if the overlay file contains no samples
        """
        # Load the overlay audio file at the original's sample rate
        overlay_data, _ = librosa.load(overlay_path, sr=sr)

        # Get lengths
        original_length = len(data)
        overlay_length = len(overlay_data)

        if overlay_length == 0:
            # Looping an empty overlay would divide by zero below.
            raise ValueError(f"Overlay file '{overlay_path}' contains no audio samples.")

        # Handle length differences
        if overlay_length < original_length:
            # Loop the overlay to match original length
            num_repeats = int(np.ceil(original_length / overlay_length))
            overlay_data = np.tile(overlay_data, num_repeats)[:original_length]
        elif overlay_length > original_length:
            # Trim the overlay to match original length
            overlay_data = overlay_data[:original_length]

        # Apply volume ratio to overlay
        overlay_data = overlay_data * volume_ratio

        # Mix the two audio signals
        mixed = data + overlay_data

        # Normalize to prevent clipping
        max_val = np.max(np.abs(mixed))
        if max_val > 1.0:
            mixed = mixed / max_val

        return mixed


    @staticmethod
    def _save_as_mp3(data, sr, output_path):
        """
        Save audio data as MP3 file.

        The samples are first written to a temporary WAV file next to the
        target, converted to MP3, and the temporary file is removed again,
        even when writing or converting fails.
        """
        output_path = Path(output_path)
        # Derive the temp name from the stem only. Replacing '.mp3' in the whole
        # path string would also rewrite directory names, and for a target
        # without '.mp3' it would make temp file and target the same path.
        temp_wav = output_path.with_name(f"{output_path.stem}_temp.wav")

        try:
            sf.write(str(temp_wav), data, sr)

            audio = AudioSegment.from_wav(str(temp_wav))
            audio.export(str(output_path), format='mp3', bitrate='192k')
        finally:
            if temp_wav.exists():
                temp_wav.unlink()


    def _stage_input(self, input_path):
        """
        Creates the per-input output directory, copies the input into it and
        loads the audio at its native sample rate.

        Returns:
            tuple (base_name, output_subdir, data, sr)
        """
        source = Path(input_path)
        base_name = source.stem
        output_subdir = self.output_dir / base_name
        output_subdir.mkdir(parents=True, exist_ok=True)

        # Copy input file
        print(f"Copying input file to output directory...")
        original_copy = output_subdir / source.name
        try:
            shutil.copy2(input_path, original_copy)
            print(f"\tCopied to: {original_copy}\n")
        except shutil.SameFileError:
            # The input already sits in its own output directory.
            print(f"\tInput file is already in the output directory: {original_copy}\n")

        # Load audio
        print(f"Loading audio file ('{input_path}')...")
        data, sr = librosa.load(input_path, sr=None)

        return base_name, output_subdir, data, sr


    def process(self,
                input_path: str,
                pitch_increase: Optional[float] = None,
                pitch_decrease: Optional[float] = None,
                speed_increase: Optional[float] = None,
                speed_decrease: Optional[float] = None,
                reverb_room_size: Optional[float] = None):
        """
        Process audio file with various transformations.

        Each requested transformation is applied to the original audio on its
        own (they are not chained) and saved as a separate MP3. Options left as
        ``None`` are skipped.

        Args:
            - input_path: path to input audio file
            - pitch_increase: positive number of semitones to increase pitch
            - pitch_decrease: negative number of semitones to decrease pitch
            - speed_increase: speed factor > 1.0 to increase speed
            - speed_decrease: speed factor < 1.0 to decrease speed
            - reverb_room_size: reverb room size (0.0 to 1.0)

        Raises:
            AssertionError: if an option is outside its documented range
        """
        if not os.path.exists(input_path):
            print(f"Error: File '{input_path}' not found.")
            return

        # Validate inputs
        assert pitch_increase is None or pitch_increase > 0, "pitch_increase must be positive"
        assert pitch_decrease is None or pitch_decrease < 0, "pitch_decrease must be negative"
        assert speed_increase is None or speed_increase > 1.0, "speed_increase must be > 1.0"
        assert speed_decrease is None or (0 < speed_decrease < 1.0), "speed_decrease must be between 0 and 1"
        assert reverb_room_size is None or (0 < reverb_room_size <= 1.0), "reverb_room_size must be between 0 and 1"

        base_name, output_subdir, data, sr = self._stage_input(input_path)

        # (requested value, file suffix, progress description, transformation)
        variants = [
            (pitch_increase, "pitch_up",
             f"pitch increase by {pitch_increase} semitones",
             lambda: self._shift_pitch(data, sr, n_steps=pitch_increase)),
            (pitch_decrease, "pitch_down",
             f"pitch decrease by {pitch_decrease} semitones",
             lambda: self._shift_pitch(data, sr, n_steps=pitch_decrease)),
            (speed_increase, "speed_up",
             f"increased speed by factor of {speed_increase}",
             lambda: self._stretch_time(data, rate=speed_increase)),
            (speed_decrease, "speed_down",
             f"decreased speed by factor of {speed_decrease}",
             lambda: self._stretch_time(data, rate=speed_decrease)),
            (reverb_room_size, "reverb",
             f"reverb (room size: {reverb_room_size})",
             lambda: self._apply_reverb(data, sr, room_size=reverb_room_size)),
        ]

        # Apply transformations
        for requested, suffix, description, transform in variants:
            if not requested:
                continue
            print(f"\tGenerating {description}...")
            output_path = output_subdir / f"{base_name}_{suffix}.mp3"
            self._save_as_mp3(transform(), sr, output_path)
            print(f"\t\tSaved: {output_path}\n")

        print(f"Done! All files saved to: {output_subdir}\n")


    def apply_overlay(self,
                      original_audio_path: str,
                      overlay_audio_path: str,
                      overlay_volume: float):
        """
        Apply audio overlay effect to an audio file.

        The result is saved as ``<original stem>_overlay_<overlay stem>.mp3``.

        Args:
            - original_audio_path: path to the original audio file
            - overlay_audio_path: path to the overlay audio file
            - overlay_volume: volume of overlay relative to original (1.0 = equal, 0.5 = half)
        """
        # Check if original file exists
        if not os.path.exists(original_audio_path):
            print(f"Error: File '{original_audio_path}' not found.")
            return

        # Check if overlay file exists
        if not os.path.exists(overlay_audio_path):
            print(f"Error: Overlay file '{overlay_audio_path}' not found.")
            return

        base_name, output_subdir, data, sr = self._stage_input(original_audio_path)

        # Apply overlay
        print(f"\tApplying overlay from '{overlay_audio_path}' at relative volume {overlay_volume}...")
        overlaid = self._overlay_audio(data, sr, overlay_audio_path, volume_ratio=overlay_volume)

        # Save output
        overlay_base_name = Path(overlay_audio_path).stem
        output_path = output_subdir / f"{base_name}_overlay_{overlay_base_name}.mp3"
        self._save_as_mp3(overlaid, sr, output_path)
        print(f"\t\tSaved: {output_path}\n")

        print(f"Done! File saved to: {output_subdir}\n")


print("✓ AudioPermutationPipeline class loaded successfully!")

✓ AudioPermutationPipeline class loaded successfully!


---
# Usage Examples

## Initialize All Pipelines

In [71]:
# Build one instance of each pipeline stage; both audio stages write to the same directory
shared_output_dir = "./outputs/"

indexer = HFDataIndexer(path="TIGER-Lab/MMLU-Pro", split="validation")
text_pipeline = TextToAudioPipeline(output_dir=shared_output_dir)
audio_pipeline = AudioPermutationPipeline(output_dir=shared_output_dir)

print("\nAll pipelines initialized!")

--- Connected to TIGER-Lab/MMLU-Pro (validation) ---
Columns available: ['question_id', 'question', 'options', 'answer', 'answer_index', 'cot_content', 'category', 'src']

All pipelines initialized!


---
## Example 1: HFDataIndexer Single 'Cell' Indexing

In [72]:
# Grab a specific cell (Row 5, Column 'question')
question_5 = indexer.get_cell(5, "question")

# get_cell returns None for an unknown column or a row past the end of the
# stream, so check before slicing the text for the preview.
if question_5 is None:
    print("Cell Index [5, 'question'] is not available.\n")
else:
    print(f"Cell Index [5, 'question']:\n{question_5[:100]}...\n")

Cell Index [5, 'question']:
Which of the following is the body cavity that contains the pituitary gland?...



---
## Example 2: HFDataIndexer Data-Set Iteration

In [73]:
# Iterate over a range of rows. The printed label is derived from the same
# start/count values that are passed to get_rows, so it always names the rows
# that are actually streamed (start=10, count=3 -> rows 10 to 12).
start_row, row_count = 10, 3
last_row = start_row + row_count - 1
print(f"Streaming Rows {start_row} - {last_row} ...\n\n")

fields = ['question', 'options', 'answer']
data_stream = indexer.get_rows(start=start_row, count=row_count, target_columns=fields)

for i, entry in enumerate(data_stream, start=start_row):
    extracted_fields = [f"{f}:\n{entry.get(f, None)}\n\n" for f in fields]
    print(f"Row {i}:\n\n" + "\n".join(extracted_fields) + "\n" + "-" * 25 + "\n\n")

Streaming Rows 10 - 13 ...


Row 10:

question:
Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye?


options:
['1000 times more', '50 times more', '5000 times more', '500 times more', '10000 times more', '20000 times more', '2000 times more', '100 times more', '10 times more', 'N/A']


answer:
E


-------------------------


Row 11:

question:
Where do most short-period comets come from and how do we know?


options:
['The Kuiper belt; short period comets tend to be in the plane of the solar system just like the Kuiper belt.', 'The asteroid belt; short period comets tend to come from random directions indicating a spherical distribution of comets called the asteroid belt.', 'The asteroid belt; short period comets tend to be in the plane of the solar system just like the asteroid belt.', 'The Oort cloud; short period comets have orbital periods similar to asteroids like Vesta an

---
## Example 3: Single Text-to-Audio Conversion

In [74]:
# Convert one multi-line passage into a single MP3 (plus its transcript)
sample_text = """Hello, World! This is an audio file. Now I will read some random text.
Paragraphs are the building blocks of papers. Many students define
paragraphs in terms of length: a paragraph is a group of at least five
sentences, a paragraph is half a page long, etc. In reality, though, the
unity and coherence of ideas among sentences is what constitutes a paragraph."""

# convert() returns the path of the MP3, or None if nothing was generated
audio_file = text_pipeline.convert(
    input_text=sample_text,
    output_filename="sample_output",
)
print(f"Generated audio file: {audio_file}")

Processing: 'sample_output'...
	[✓] Transcript: /content/outputs/sample_output.txt
	[✓] Audio:      /content/outputs/sample_output.mp3

Generated audio file: outputs/sample_output.mp3


---
## Example 4: Batch Text-to-Audio Conversion

In [75]:
# Convert several short messages in one call; names and texts are paired by position
filenames = ["message_1", "message_2", "message_3"]

texts = [
    "This is the first audio message.",
    "This is the second audio message.",
    "This is the third audio message."
]

# Only successful conversions are returned, so the count reflects real files
audio_files = text_pipeline.batch_convert(input_texts=texts, file_names=filenames)
print(f"\nGenerated {len(audio_files)} audio files.")

Processing: 'message_1'...
	[✓] Transcript: /content/outputs/message_1.txt
	[✓] Audio:      /content/outputs/message_1.mp3

Processing: 'message_2'...
	[✓] Transcript: /content/outputs/message_2.txt
	[✓] Audio:      /content/outputs/message_2.mp3

Processing: 'message_3'...
	[✓] Transcript: /content/outputs/message_3.txt
	[✓] Audio:      /content/outputs/message_3.mp3


Generated 3 audio files.


---
## Example 5: Audio Permutations

In [76]:
# Source clip for every variant below (the file created in Example 3)
sample_audio_path = "./outputs/sample_output.mp3"

# Apply basic transformations to an existing audio file
basic_effects = {
    "pitch_increase": 4,
    "pitch_decrease": -4,
    "speed_increase": 1.5,
    "speed_decrease": 0.5,
    "reverb_room_size": 0.5,
}
audio_pipeline.process(sample_audio_path, **basic_effects)

# Adding background noise effects: (overlay file, volume relative to the speech)
background_noise_effects = [
    # https://www.youtube.com/watch?v=5jlUVr6gkos
    ("./saved_effects/wind.mp3", 0.75),

    # https://www.youtube.com/watch?v=C4pJ6Hi4MU4
    ("./saved_effects/rain.mp3", 1.25),

    # https://www.youtube.com/watch?v=wyzgbdI6x24
    ("./saved_effects/coffee_shop.mp3", 1.4),

    # https://www.youtube.com/watch?v=FeOrG8FrNko
    ("./saved_effects/busy_street.mp3", 0.6),

    # https://www.youtube.com/watch?v=cNWxqMx69WI
    ("./saved_effects/song1.mp3", 0.35),
]

# One overlay output per background effect, all mixed onto the same source clip
for background_audio_path, relative_volume in background_noise_effects:
    audio_pipeline.apply_overlay(
        original_audio_path=sample_audio_path,
        overlay_audio_path=background_audio_path,
        overlay_volume=relative_volume,
    )

Copying input file to output directory...
	Copied to: outputs/sample_output/sample_output.mp3

Loading audio file ('./outputs/sample_output.mp3')...
	Generating pitch increase by 4 semitones...
		Saved: outputs/sample_output/sample_output_pitch_up.mp3

	Generating pitch decrease by -4 semitones...
		Saved: outputs/sample_output/sample_output_pitch_down.mp3

	Generating increased speed by factor of 1.5...
		Saved: outputs/sample_output/sample_output_speed_up.mp3

	Generating decreased speed by factor of 0.5...
		Saved: outputs/sample_output/sample_output_speed_down.mp3

	Generating reverb (room size: 0.5)...
		Saved: outputs/sample_output/sample_output_reverb.mp3

Done! All files saved to: outputs/sample_output

Copying input file to output directory...
	Copied to: outputs/sample_output/sample_output.mp3

Loading audio file ('./outputs/sample_output.mp3')...
	Applying overlay from './saved_effects/wind.mp3' at relative volume 0.75...
		Saved: outputs/sample_output/sample_output_overlay_

---
## Example 6: Linked Audio Pipelines
**Text → Audio Generation → Audio Permutation Generation**

In [77]:
# Chain both pipelines together: synthesize the speech, then derive its variants
print("STEP 1: Converting text to audio...")

text = "The quick brown fox jumps over the lazy dog. This is a test of the audio processing pipeline."
audio_path = text_pipeline.convert(text, "complete_pipeline_test")

print("\nSTEP 2: Applying audio effects...")

# Effects requested for the generated clip
effect_settings = dict(
    pitch_increase=5,
    pitch_decrease=-5,
    speed_increase=2.0,
    reverb_room_size=0.7,
)

# convert() yields None on failure, in which case there is nothing to process
if not audio_path:
    print("Failed to generate audio file.")
else:
    audio_pipeline.process(str(audio_path), **effect_settings)
    print("\n✓ Complete pipeline finished successfully!")

STEP 1: Converting text to audio...
Processing: 'complete_pipeline_test'...
	[✓] Transcript: /content/outputs/complete_pipeline_test.txt
	[✓] Audio:      /content/outputs/complete_pipeline_test.mp3


STEP 2: Applying audio effects...
Copying input file to output directory...
	Copied to: outputs/complete_pipeline_test/complete_pipeline_test.mp3

Loading audio file ('outputs/complete_pipeline_test.mp3')...
	Generating pitch increase by 5 semitones...
		Saved: outputs/complete_pipeline_test/complete_pipeline_test_pitch_up.mp3

	Generating pitch decrease by -5 semitones...
		Saved: outputs/complete_pipeline_test/complete_pipeline_test_pitch_down.mp3

	Generating increased speed by factor of 2.0...
		Saved: outputs/complete_pipeline_test/complete_pipeline_test_speed_up.mp3

	Generating reverb (room size: 0.7)...
		Saved: outputs/complete_pipeline_test/complete_pipeline_test_reverb.mp3

Done! All files saved to: outputs/complete_pipeline_test


✓ Complete pipeline finished successfully!


---
## Example 7: End-to-End Pipeline
**HuggingFace Text Data → Audio Generation → Audio Permutation Generation**

In [78]:
print("STEP 1: Retrieve Text Data ...")

question_5 = indexer.get_cell(5, "question")

# Reset so a path left over from an earlier cell is never processed by mistake.
audio_path = None

# get_cell returns None for an unknown column or a row past the end of the
# stream; stop here instead of failing while slicing or synthesizing it.
if question_5 is None:
    print("Failed to retrieve text data for cell [5, 'question'].")
else:
    print(f"Cell Index [5, 'question']:\n{question_5[:100]}...\n")

    print("STEP 2: Generate Base Audio ...")

    audio_path = text_pipeline.convert(question_5, "e2e_output")

    print("STEP 3: Applying Audio Effects ...")

    if audio_path:
        audio_pipeline.process(
            str(audio_path),
            pitch_increase=5,
            pitch_decrease=-5,
            speed_increase=2.0,
            reverb_room_size=0.7
        )
        print("\n✓ End-to-end pipeline finished successfully!")
    else:
        print("Failed to generate audio file.")

STEP 1: Retrieve Text Data ...
Cell Index [5, 'question']:
Which of the following is the body cavity that contains the pituitary gland?...

STEP 2: Generate Base Audio ...
Processing: 'e2e_output'...
	[✓] Transcript: /content/outputs/e2e_output.txt
	[✓] Audio:      /content/outputs/e2e_output.mp3

STEP 3: Applying Audio Effects ...
Copying input file to output directory...
	Copied to: outputs/e2e_output/e2e_output.mp3

Loading audio file ('outputs/e2e_output.mp3')...
	Generating pitch increase by 5 semitones...
		Saved: outputs/e2e_output/e2e_output_pitch_up.mp3

	Generating pitch decrease by -5 semitones...
		Saved: outputs/e2e_output/e2e_output_pitch_down.mp3

	Generating increased speed by factor of 2.0...
		Saved: outputs/e2e_output/e2e_output_speed_up.mp3

	Generating reverb (room size: 0.7)...
		Saved: outputs/e2e_output/e2e_output_reverb.mp3

Done! All files saved to: outputs/e2e_output


✓ End-to-end pipeline finished successfully!


---
## Summary

Complete audio processing pipeline:

**Available Classes:**
- `HFDataIndexer`: Lazy-load data from HuggingFace datasets
- `TextToAudioPipeline`: Convert text to MP3 files with transcripts
- `AudioPermutationPipeline`: Apply audio effects (pitch, speed, reverb)

**Key Features:**
- Streaming HuggingFace datasets
- Create audio files + add audio effects (pitch, speed, reverb)
- Full end-to-end pipeline integration

All generated files are saved to `./outputs/` by default.

---

Prototype: saving the generated audio files to Google Drive and logging their links in a Google Sheet.

In [85]:
# import gspread
# from google.colab import auth
# from google.auth import default
# from googleapiclient.discovery import build
# from googleapiclient.http import MediaFileUpload

# def export_ml_data(sheet_link, drive_parent_folder_id, data_array):
#     """
#     Processes an array of data, creates public Drive folders/files, and logs to Sheets.
#     data_array format: [[page_name, question_id, file_name, local_file_path], ...]
#     """

#     # 1. Authenticate the Colab user (This will trigger a popup in Colab)
#     print("Authenticating user...")
#     auth.authenticate_user()
#     creds, _ = default()

#     # 2. Initialize Google Drive and Sheets APIs
#     gc = gspread.authorize(creds)
#     drive_service = build('drive', 'v3', credentials=creds)

#     # 3. Open the Google Sheet
#     try:
#         sheet = gc.open_by_url(sheet_link)
#     except Exception as e:
#         print(f"Error opening Google Sheet: {e}")
#         return

#     # 4. Loop through the array items
#     for item in data_array:
#         page_name, question_id, file_name, local_file_path = item
#         print(f"\nProcessing Question ID: {question_id}...")

#         # --- GOOGLE DRIVE OPERATIONS ---

#         # Step 1: Create a new folder named after the question_id
#         folder_metadata = {
#             'name': str(question_id),
#             'mimeType': 'application/vnd.google-apps.folder',
#             'parents': [drive_parent_folder_id]
#         }
#         folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
#         folder_id = folder.get('id')

#         # Make the folder publicly viewable
#         permission = {'type': 'anyone', 'role': 'reader'}
#         drive_service.permissions().create(fileId=folder_id, body=permission).execute()

#         # Step 2: Upload the local file into the new folder
#         file_metadata = {
#             'name': file_name,
#             'parents': [folder_id]
#         }
#         media = MediaFileUpload(local_file_path, resumable=True)
#         uploaded_file = drive_service.files().create(
#             body=file_metadata,
#             media_body=media,
#             fields='id, webViewLink'
#         ).execute()

#         file_id = uploaded_file.get('id')
#         file_url = uploaded_file.get('webViewLink')

#         # Make the uploaded file publicly viewable
#         drive_service.permissions().create(fileId=file_id, body=permission).execute()

#         # --- GOOGLE SHEETS OPERATIONS ---

#         # Step 3: Navigate to the specified page (worksheet)
#         try:
#             worksheet = sheet.worksheet(page_name)
#         except gspread.exceptions.WorksheetNotFound:
#             print(f"Worksheet '{page_name}' not found. Skipping {question_id}.")
#             continue

#         # Step 4: Calculate the new incremented row ID and append the data
#         col_1_values = worksheet.col_values(1)

#         # Grab the last row's ID. If it's text (like a header) or empty, default to 0.
#         try:
#             last_id = int(col_1_values[-1])
#         except (ValueError, IndexError):
#             last_id = 0

#         new_id = last_id + 1

#         # Append the new row to the first empty row automatically
#         # Column 1: ID | Column 2: Question ID | Column 3: Public File Link
#         worksheet.append_row([new_id, question_id, file_url])

#         print(f"Successfully uploaded and logged {file_name} to sheet '{page_name}'.")

# # Example Usage:
# upload_details_1 = [
#     ["original", "0", "0_original.mp3", "/content/outputs/sample_output/sample_output.mp3"],
#     ["busy_street", "0", "0_busy_street.mp3", "/content/outputs/sample_output/sample_output_overlay_busy_street.mp3"],
# ]
# upload_details_2 = [
#     ["original", "1", "1_original.mp3", "/content/outputs/sample_output/sample_output_speed_up.mp3"],
#     ["busy_street", "1", "1_busy_street.mp3", "/content/outputs/sample_output/sample_output_speed_down.mp3"],
# ]
# upload_details_3 = [
#     ["original", "2", "2_original.mp3", "/content/outputs/sample_output/sample_output_pitch_up.mp3"],
#     ["busy_street", "2", "2_busy_street.mp3", "/content/outputs/sample_output/sample_output_pitch_down.mp3"]
# ]

# sheet_link = "https://docs.google.com/spreadsheets/d/1tZ-eaQqtb-tbzn9YVdqo7cEg80UxD_DP737HNdAJfbI/edit?gid=293340750#gid=293340750"
# drive_parent_id = "1FQjh0-DBKC0iettweDAs9bfLI2B3NoDe"
# for upload_detail in [upload_details_1, upload_details_2, upload_details_3]:
#   export_ml_data(sheet_link, drive_parent_id, upload_detail)








#
#
#







import gspread
from google.colab import auth
from google.auth import default
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

def export_ml_data(sheet_link, drive_parent_folder_id, data_array):
    """
    Uploads local files to Google Drive and logs their links in a Google Sheet.

    One Drive folder is created per call, named after the question ID of the
    first entry, and every uploaded file is placed inside it. No sharing
    permissions are changed on the folder or the files. For each entry a row
    ``[question_id, file link]`` is appended to the worksheet named by the entry.

    Args:
        - sheet_link: URL of the Google Sheet to log to
        - drive_parent_folder_id: ID of the Drive folder that receives the new folder
        - data_array: list of entries ``[page_name, question_id, file_name, local_file_path]``

    Returns:
        None. Progress and problems are reported with ``print``; entries whose
        worksheet or local file is missing are skipped.
    """
    # Nothing to do for an empty batch: avoid authenticating, and avoid the
    # IndexError from reading the folder name off the first entry.
    if not data_array:
        print("No upload entries provided; nothing to export.")
        return

    print("Authenticating user...")
    auth.authenticate_user()
    creds, _ = default()

    gc = gspread.authorize(creds)
    drive_service = build('drive', 'v3', credentials=creds)

    try:
        sheet = gc.open_by_url(sheet_link)
    except Exception as e:
        print(f"Error opening Google Sheet: {e}")
        return

    # Step 1: Create Folder (No permission changes here!)
    folder_metadata = {
        'name': str(data_array[0][1]),
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [drive_parent_folder_id]
    }
    folder = drive_service.files().create(body=folder_metadata, fields='id').execute()
    folder_id = folder.get('id')

    for page_name, question_id, file_name, local_file_path in data_array:
        print(f"\nProcessing Question ID: {question_id}...")

        # --- PRE-CHECKS ---

        # Step 2: Connect to Worksheet before uploading, so a skipped entry
        # does not leave an uploaded file that is logged nowhere.
        try:
            worksheet = sheet.worksheet(page_name)
        except gspread.exceptions.WorksheetNotFound:
            print(f"Worksheet '{page_name}' not found. Skipping {question_id}.")
            continue

        # A missing local file skips this entry instead of aborting the batch.
        if not os.path.exists(local_file_path):
            print(f"Local file '{local_file_path}' not found. Skipping {question_id}.")
            continue

        # --- GOOGLE DRIVE OPERATIONS ---

        # Step 3: Upload File (No permission changes here either!)
        file_metadata = {
            'name': file_name,
            'parents': [folder_id]
        }
        media = MediaFileUpload(local_file_path, resumable=True)
        uploaded_file = drive_service.files().create(
            body=file_metadata,
            media_body=media,
            fields='id, webViewLink'
        ).execute()

        file_url = uploaded_file.get('webViewLink')

        # --- GOOGLE SHEETS OPERATIONS ---

        # Step 4: Append Row (Column 1: Question ID | Column 2: File Link)
        worksheet.append_row([question_id, file_url])

        print(f"Successfully uploaded and logged {file_name} to sheet '{page_name}'.")

# Example usage.
# Every entry is [worksheet name, question ID, Drive file name, local file path];
# each upload_details_N list is one batch, i.e. one export_ml_data call.
upload_details_1 = [
    ["original", "0", "0_original.mp3", "/content/outputs/sample_output/sample_output.mp3"],
    ["busy_street", "0", "0_busy_street.mp3", "/content/outputs/sample_output/sample_output_overlay_busy_street.mp3"],
]
upload_details_2 = [
    ["original", "1", "1_original.mp3", "/content/outputs/sample_output/sample_output_speed_up.mp3"],
    ["busy_street", "1", "1_busy_street.mp3", "/content/outputs/sample_output/sample_output_speed_down.mp3"],
]
upload_details_3 = [
    ["original", "2", "2_original.mp3", "/content/outputs/sample_output/sample_output_pitch_up.mp3"],
    ["busy_street", "2", "2_busy_street.mp3", "/content/outputs/sample_output/sample_output_pitch_down.mp3"]
]

# Destination spreadsheet and Drive parent folder shared by all batches
sheet_link = "https://docs.google.com/spreadsheets/d/1tZ-eaQqtb-tbzn9YVdqo7cEg80UxD_DP737HNdAJfbI/edit?gid=293340750#gid=293340750"
drive_parent_id = "1FQjh0-DBKC0iettweDAs9bfLI2B3NoDe"

# Export the batches in order
upload_batches = [upload_details_1, upload_details_2, upload_details_3]
for upload_detail in upload_batches:
    export_ml_data(sheet_link, drive_parent_id, upload_detail)

Authenticating user...





Processing Question ID: 0...
Successfully uploaded and logged 0_original.mp3 to sheet 'original'.

Processing Question ID: 0...
Successfully uploaded and logged 0_busy_street.mp3 to sheet 'busy_street'.
Authenticating user...





Processing Question ID: 1...
Successfully uploaded and logged 1_original.mp3 to sheet 'original'.

Processing Question ID: 1...
Successfully uploaded and logged 1_busy_street.mp3 to sheet 'busy_street'.
Authenticating user...





Processing Question ID: 2...
Successfully uploaded and logged 2_original.mp3 to sheet 'original'.

Processing Question ID: 2...
Successfully uploaded and logged 2_busy_street.mp3 to sheet 'busy_street'.
