### Completed Pipeline

#### 1: Imports


In [17]:
import os
import re
import pandas as pd
import requests
import assemblyai as aai
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import torch
import pickle


#### 2: API Key Setup & File Paths

In [None]:
# Set AssemblyAI API key
# Prefer the ASSEMBLYAI_API_KEY environment variable so the real key never has
# to be written into the notebook; the "xxx" placeholder stays as the fallback.
API_KEY = os.environ.get("ASSEMBLYAI_API_KEY", "xxx")
aai.settings.api_key = API_KEY

# Polish audio MP3 files, normalised with os.path.normpath as they are listed.
file_paths = list(map(os.path.normpath, (
    r"C:\Users\legak\Documents\GitHub\Y2 repos\2024-25c-fai2-adsai-group-group22_y2c\task 2\Sondy uliczne, wywiady z ludźmi. Jak jest pod koniec wakacji w Ustce  Studio Promenada 25.08.2022.mp3",
    r"C:\Users\legak\Documents\GitHub\Y2 repos\2024-25c-fai2-adsai-group-group22_y2c\task 2\TE TEKSTY PRZEJDĄ DO HISTORII! Najlepsze sondy uliczne z emerytami.mp3",
)))


#### 3: Upload & Transcription Request Functions

In [19]:
def upload_file(filepath):
    """Upload a local file to the AssemblyAI upload endpoint.

    Args:
        filepath: Path of the file to send; it is opened in binary mode.

    Returns:
        The ``upload_url`` value from the JSON response.

    Raises:
        Exception: If the endpoint answers with any status other than 200.
    """
    auth_headers = {'authorization': API_KEY}
    with open(filepath, "rb") as audio:
        resp = requests.post(
            "https://api.assemblyai.com/v2/upload",
            headers=auth_headers,
            files={"file": audio},
        )

    # Guard clause: anything but HTTP 200 is treated as a failed upload.
    if resp.status_code != 200:
        raise Exception(f"❌ Upload failed: {resp.text}")

    print(f"✅ Uploaded: {filepath}")
    return resp.json()["upload_url"]

def request_transcription(upload_url):
    """Transcribe previously uploaded audio as Polish and return the text.

    Args:
        upload_url: URL passed to the AssemblyAI transcriber.

    Returns:
        The ``text`` attribute of the finished transcript.

    Raises:
        Exception: If the transcript comes back with the error status.
    """
    print("📝 Requesting transcription...")
    polish_config = aai.TranscriptionConfig(language_code="pl", punctuate=True)
    result = aai.Transcriber().transcribe(upload_url, config=polish_config)

    if result.status != aai.TranscriptStatus.error:
        return result.text
    raise Exception(f"❌ Transcription failed: {result.error}")


#### 4: Save Transcription to CSV Function

In [20]:
def save_transcription_to_csv(text, output_file):
    """Split a transcript into sentences and save them as a one-column CSV.

    Args:
        text: Full transcript text. ``None`` or an empty string produces an
            empty table instead of raising.
        output_file: Destination path of the CSV file (written as UTF-8).

    Returns:
        DataFrame with a single ``pl_text`` column, one sentence per row.
    """
    # Split on whitespace that follows sentence-ending punctuation. Inside a
    # raw string the whitespace class needs a single backslash; a doubled one
    # matches a literal backslash followed by "s" and never splits anything.
    fragments = re.split(r'(?<=[.!?])\s+', text or "")
    # Drop empty pieces (e.g. from trailing whitespace or empty input).
    sentences = [piece.strip() for piece in fragments if piece.strip()]
    df = pd.DataFrame({"pl_text": sentences})
    df.to_csv(output_file, index=False, encoding="utf-8")
    print(f"💾 Saved transcription: {output_file}")
    return df


#### 5: Main Processing Loop

In [21]:
def process_files_and_return_dfs(paths=None):
    """Upload, transcribe and save every audio file, returning all sentences.

    Each file is handled independently: a failure is reported and the loop
    moves on to the next file (deliberate best-effort behaviour).

    Args:
        paths: Iterable of audio file paths. Defaults to the module-level
            ``file_paths`` list when omitted.

    Returns:
        A DataFrame concatenating the per-file sentence tables, or an empty
        DataFrame with a ``pl_text`` column when no file was transcribed.
    """
    if paths is None:
        paths = file_paths

    all_dfs = []
    for path in paths:
        try:
            print(f"\n🚀 Processing: {path}")
            upload_url = upload_file(path)
            transcription = request_transcription(upload_url)
            # Swap only the real extension. str.replace(".mp3", ...) leaves the
            # path unchanged for e.g. ".MP3" or ".wav" inputs, which would make
            # the CSV overwrite the audio file itself.
            csv_path = os.path.splitext(path)[0] + "_assembly_pl.csv"
            df = save_transcription_to_csv(transcription, csv_path)
            all_dfs.append(df)
        except Exception as e:
            print(f"⚠️ Error with {path}: {e}")

    if not all_dfs:
        print("❌ No valid audio transcribed. Please check your files.")
        return pd.DataFrame(columns=["pl_text"])

    return pd.concat(all_dfs, ignore_index=True)


#### 6: Translation Model Setup

In [22]:

# Polish -> English translation checkpoint shared by the model and tokenizer.
pl_en_model_name = "Helsinki-NLP/opus-mt-pl-en"
pl_en_model = AutoModelForSeq2SeqLM.from_pretrained(pl_en_model_name)
pl_en_tokenizer = AutoTokenizer.from_pretrained(pl_en_model_name)

# Run on GPU 0 when CUDA is available; otherwise pass -1 for CPU.
translator = pipeline(
    task="translation",
    framework="pt",
    tokenizer=pl_en_tokenizer,
    model=pl_en_model,
    device=-1 if not torch.cuda.is_available() else 0,
)

def translate_texts(texts):
    """Translate each text with the module-level ``translator`` pipeline.

    Args:
        texts: Iterable of strings; each one is translated separately while a
            tqdm progress bar tracks the loop.

    Returns:
        List of ``translation_text`` strings, in input order.
    """
    translations = []
    for text in tqdm(texts):
        outputs = translator(text)
        # The pipeline result is indexed as a list; take its first entry.
        translations.append(outputs[0]['translation_text'])
    return translations


Device set to use cpu


In [None]:
# --- Configuration ---
# Paths are relative to the working directory the notebook runs in.
checkpoint_path = "./checkpoint-3906"
label_encoder_path = "label_encoder.pkl"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Load LabelEncoder ---
# NOTE: unpickling runs arbitrary code; only load a label encoder file you trust.
with open(label_encoder_path, "rb") as f:
    label_encoder = pickle.load(f)

# --- Load Model and Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
model = model.to(device)
# Switch to evaluation mode before inference.
model.eval()

# --- Emotion Classification Function ---
def classify_emotions_transformer(texts):
    """Predict one emotion label per text with the fine-tuned classifier.

    Uses the module-level ``tokenizer``, ``model``, ``label_encoder`` and
    ``device``. Texts are classified one at a time.

    Args:
        texts: Iterable of strings to classify.

    Returns:
        List of labels, in input order, as decoded by ``label_encoder``.
    """
    predicted_labels = []
    for sentence in texts:
        encoded = tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding=True,
        ).to(device)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            scores = model(**encoded).logits
        class_index = scores.argmax(dim=1).item()
        # Map the class index back to its label via the label encoder.
        predicted_labels.append(label_encoder.inverse_transform([class_index])[0])
    return predicted_labels

# --- Run Transcription and Translation ---
# pl_df and its "en_translation" column were never created before being used
# below; build them here so the pipeline runs end to end.
pl_df = process_files_and_return_dfs()
pl_df["en_translation"] = translate_texts(pl_df["pl_text"].tolist())

# --- Apply Classification ---
# The classifier reads the English translations, not the Polish originals.
pl_df["emotion"] = classify_emotions_transformer(pl_df["en_translation"].tolist())



In [None]:
# --- Final Output DataFrame ---
# Keep the three pipeline columns and give them presentation-friendly names.
final_df = pl_df[["pl_text", "en_translation", "emotion"]].rename(
    columns={
        "pl_text": "Sentence",
        "en_translation": "Translation",
        "emotion": "Emotion",
    }
)

# --- Save Result ---
final_df.to_csv("final_output.csv", index=False)
# A bare expression on the last line makes the notebook display the table.
final_df