# YouTube Transcript Processing for Die Linke

1. This code retrieves German-language transcripts for YouTube videos of the political party "Die Linke", using video IDs stored in a JSON file
2. Processes and cleans the transcript data by:
    - Marking videos with no available transcript and excluding them from the cleaned dataset
    - Removing line breaks from text
    - Adding metadata: party name "Die Linke" and initializing a score of 0
3. Creates a cleaned dataset saved as JSON for further analysis

In [None]:
pip install youtube-transcript-api

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
import json

In [None]:
# Fetch the German ("de") transcript of one hard-coded video to try out the API.
video_id = "0g8nnp7tB70"
ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch(video_id, languages=['de'])

In [None]:
# Show the raw transcript object for inspection.
print(transcript)

In [None]:
# Collect the text of every snippet in the transcript, in order.
transcript_list = [snippet.text for snippet in transcript]
print("Ready to use ")

In [None]:
# Peek at the text of the second snippet (index 1) of the transcript.
transcript[1].text

In [None]:
# Preview the first ten collected snippet texts.
transcript_list[:10]

In [None]:
# Merge all snippet texts into one space-separated string.
joined_transcript = " ".join(transcript_list)

In [None]:
# Print the full joined transcript for a visual check.
print(joined_transcript)

In [None]:
# Load the video entries from JSON.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII (e.g. German) text.
# NOTE(review): despite its name, linke_df is whatever json.load returns
# (later cells index and iterate it like a list of dicts), not a DataFrame.
PATH = "/data/linkeData/LinkeYouTubeVids.json"
with open(PATH, "r", encoding="utf-8") as f:
    linke_df = json.load(f)

In [None]:
# Sanity-check the loaded data: access the "id" of the second entry, then print
# the number of entries.
# NOTE(review): the value of the first expression is neither printed nor stored.
linke_df[1]["id"]
print(len(linke_df))

In [None]:
import copy

# Work on an independent copy so that adding fields to the entries does not
# alter linke_df. A deep copy is needed: list.copy() is shallow, so the entry
# dicts would still be shared with linke_df and mutated along with it.
df_transcripts = copy.deepcopy(linke_df)

In [None]:
# Inspect the second entry (index 1) of df_transcripts.
df_transcripts[1]

In [None]:
# Attach a German transcript to every entry of df_transcripts, in place.
# Each entry must carry an "id" key (the YouTube video ID). On success the
# joined snippet text is stored under "transcript"; on any failure the
# NO_TRANSCRIPT sentinel is stored instead so the run continues.
NO_TRANSCRIPT = "no transcript available"

# Total items to know where the loop currently is
total_items = len(df_transcripts)

for index, item in enumerate(df_transcripts, start=1):
    # Named video_id (not id) so the builtin id() is not shadowed afterwards.
    video_id = item["id"]
    print(f"Proccesing Item {video_id}: {index} / {total_items}")
    try:
        # Request the German ("de") transcript only.
        transcript = ytt_api.fetch(video_id, languages=['de'])
        joined_transcript = " ".join(snippet.text for snippet in transcript)
    except (TranscriptsDisabled, NoTranscriptFound) as e:
        # Expected case: the video offers no usable German transcript.
        print(f"No transcript available for video ID: {video_id}, Error: {str(e)}")
        item["transcript"] = NO_TRANSCRIPT
    except Exception as e:
        # Best effort: any other failure must not abort the whole run, so the
        # entry still gets the sentinel. It is reported with its exception type
        # so it can be told apart from a genuinely missing transcript.
        print(f"Unexpected {type(e).__name__} for video ID: {video_id}, Error: {e}")
        item["transcript"] = NO_TRANSCRIPT
    else:
        item["transcript"] = joined_transcript

In [None]:
# Display df_transcripts for inspection.
df_transcripts

In [77]:
# Write df_transcripts to disk as indented UTF-8 JSON, keeping non-ASCII
# characters unescaped.
file_path = "/Users/noursafadi/Documents/Uni/Parsons-Spring-25/MajorStudio02/Thesis/political_ai/data/linkeData/Transcribed_linke.json"
with open(file_path, mode='w', encoding='utf-8') as file:
    json.dump(df_transcripts, fp=file, indent=2, ensure_ascii=False)

In [97]:
# Split the entries into those with a transcript (df_cleaned) and those that
# carry the "no transcript available" marker (not_transcribed).
df_cleaned, not_transcribed = [], []
for entry in df_transcripts:
    is_missing = entry["transcript"] == "no transcript available"
    target = not_transcribed if is_missing else df_cleaned
    target.append(entry)

In [96]:
# Number of entries in df_cleaned.
len(df_cleaned)

897

In [None]:
# Display df_cleaned for inspection.
df_cleaned

In [None]:
import re

# Replace every run of line breaks, together with any whitespace around it,
# by a single space in each entry's transcript.
for entry in df_cleaned:
    raw_text = entry["transcript"]
    entry["transcript"] = re.sub(r'\s*[\n\r]+\s*', ' ', raw_text)


In [117]:
# Tag every entry in df_cleaned with its party name and a starting score of 0.
for entry in df_cleaned:
    entry["party"], entry["score"] = "Die Linke", 0

In [None]:
# Save df_cleaned as indented UTF-8 JSON, keeping non-ASCII characters unescaped.
# TODO: set file_path to the desired output location before running this cell.
file_path = ""

# open("") fails with an unhelpful "No such file or directory: ''" error, so
# stop early with an actionable message while the path is still unset.
if not file_path:
    raise ValueError(
        "file_path is empty: set it to the output JSON path before saving df_cleaned"
    )

with open(file_path, "w", encoding="utf-8") as file:
    json.dump(df_cleaned, file, ensure_ascii=False, indent=2)