# Transcribing Video Files From TikTok

#### Installing dependencies
We need moviepy to convert the videos to mp3.
We need openai-whisper to transcribe the audio files to text.

In [None]:
%pip install moviepy
%pip install -U openai-whisper
%pip install faster-whisper 
%pip install pandas 
%pip install requests
%pip install moviepy
%pip install -U spacy
%pip install langdetect

Set your notebook to the right directory

In [None]:

# Change the kernel's working directory so the relative paths used below
# (e.g. "tiktok_vids.json", "data/...") resolve against the project folder.
# NOTE(review): this is a machine-specific absolute path -- adjust it to your
# own checkout before running.
%cd '/Users/noursafadi/Documents/Uni/Parsons-Spring-25/Major Studio 02/Thesis/tiktok-scraper'

Importing

In [1]:
# Standard library
import csv
import json
import os
import re
from os import listdir, path

# Third-party
# The star import stays ahead of the explicit imports so that any explicitly
# imported name wins over a same-named moviepy export.
from moviepy import *
import httpx
import pandas as pd
import requests
import spacy
import whisper
from faster_whisper import WhisperModel
from langdetect import detect
from langdetect import DetectorFactory

Downloading video files

In [None]:
# Download every video referenced in the scraped metadata.
# Each post's position in the JSON list becomes the file name (<idx>.mp4).
with open("tiktok_vids.json", "r", encoding="utf-8") as file:
    data = json.load(file)

video_dir = "/videos"
# Create the target folder up front so the first write cannot fail on a missing directory.
os.makedirs(video_dir, exist_ok=True)

for idx, post in enumerate(data):
    if "videoMeta" in post and "downloadAddr" in post["videoMeta"]:
        url = post["videoMeta"]["downloadAddr"]
        if url:
            try:
                video_response = httpx.get(url, follow_redirects=True, timeout=60)
                # Without this check an HTTP error page would be saved as a
                # ".mp4" file and reported as a success.
                video_response.raise_for_status()
            except httpx.HTTPError as exc:
                # One unreachable video must not abort the remaining downloads.
                print(f"{idx}: Download failed ({exc})")
                continue

            file_path = path.join(video_dir, f"{idx}.mp4")
            with open(file_path, "wb") as f:
                f.write(video_response.content)
            print(f"{idx}: Seccessful")
        else:
            print(f"{idx} is not Available")
    else:
        print(f"{idx} is Not Available")

Converting from video to audio

In [None]:
# Extract the audio track of every downloaded .mp4 into an .mp3 with the same stem.
video_path = "/videos"
# NOTE(review): audio is written to "/audio" while the transcription cell reads
# from "audio/" -- confirm both resolve to the same folder.
audio_output_path = "/audio"
os.makedirs(audio_output_path, exist_ok=True)

files = [f for f in listdir(video_path) if f.endswith(".mp4")]

for fname in files:
    video_location = path.join(video_path, fname)
    video = None

    try:
        video = VideoFileClip(video_location)
        audio = video.audio
        if audio is None:
            # Silent clips have no audio track; report it instead of failing
            # with an opaque attribute error.
            print(f"Error Processing {fname} : no audio track")
            continue

        audio_file_name = path.splitext(fname)[0] + ".mp3"
        audio_file_path = path.join(audio_output_path, audio_file_name)
        audio.write_audiofile(audio_file_path, codec="mp3")
    except Exception as e:
        # Best effort: log the failure and move on to the next file.
        print(f"Error Processing {fname} : {e}")
    finally:
        # Always release the clip, including when the conversion fails.
        if video is not None:
            video.close()

Loading the Whisper model from OpenAI


In [None]:
# OpenAI Whisper, basic transcription: load the "turbo" model once so the
# transcription loop below can reuse the same `model` object for every file.
model = whisper.load_model("turbo")

Transcribing audio files with Whisper

In [None]:

# Transcribe every .mp3 in the audio folder and collect the plain-text results.
audio_folder = "audio/"


def _audio_sort_key(fname):
    """Sort numerically named files (e.g. "10.mp3") by number, then any others by name."""
    stem = path.splitext(fname)[0]
    if stem.isdecimal():
        return (0, int(stem), fname)
    return (1, 0, fname)


# listdir() returns entries in arbitrary order. Sorting makes the position of
# each transcript in `scripts` (and the ids derived from it later) reproducible.
audio_files = sorted(
    (f for f in listdir(audio_folder) if f.endswith(".mp3")),
    key=_audio_sort_key,
)

scripts = []
for fname in audio_files:
    audio_file = path.join(audio_folder, fname)
    result = model.transcribe(audio_file)
    scripts.append(result["text"])

Creating a JSON file 

In [None]:
# Save the transcripts as JSON.
# By default json escapes non-ASCII letters (ä, ö, ...) as \uXXXX sequences.
# ensure_ascii=False together with a UTF-8 file keeps them readable as written.
json_path = "data/scripts.json"
with open(json_path, "w", encoding="utf-8") as file:
    json.dump(scripts, file, indent=4, ensure_ascii=False)

Create a CSV file

In [None]:
# Write the transcripts to a UTF-8 CSV with one row per transcript.
csv_path = "data/scripts.csv"

with open(csv_path, mode="w", newline="", encoding="utf-8") as out_file:
    csv_writer = csv.writer(out_file)
    # Header row, then (position, transcript) pairs.
    csv_writer.writerow(["id", "script"])
    csv_writer.writerows(enumerate(scripts))

print(f"Transcription and translation saved to {csv_path}")

In [None]:
# Read the transcript CSV ("id", "script" columns) back in as a DataFrame.
df = pd.read_csv("data/scripts.csv")

Translating the texts with DEEPL API

In [11]:
# DeepL API key: read from the environment so the secret never lives in the notebook.
# Set it before starting Jupyter, e.g. `export DEEPL_API_KEY=...`.
# NOTE(review): any key that was ever stored in this notebook should be rotated.
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY", "")
if not DEEPL_API_KEY:
    print("Warning: DEEPL_API_KEY is not set; translation requests will fail.")


def translate_text(text, source_lang="DE", target_lang="EN"):
    """Translate `text` with the DeepL API, falling back to the input on failure.

    Parameters
    ----------
    text : str
        Text to translate. Non-string or blank values (for example the NaN
        that pandas yields for an empty CSV cell) are returned unchanged
        without calling the API.
    source_lang : str
        DeepL source language code.
    target_lang : str
        DeepL target language code.

    Returns
    -------
    The translated string on HTTP 200; otherwise the original `text`, after
    printing the error.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    url = "https://api-free.deepl.com/v2/translate"  # Use "api.deepl.com" for Pro accounts
    # The key is sent in the Authorization header rather than in the request body.
    headers = {"Authorization": f"DeepL-Auth-Key {DEEPL_API_KEY}"}
    params = {
        "text": text,
        "source_lang": source_lang,
        "target_lang": target_lang,
    }

    try:
        response = requests.post(url, headers=headers, data=params, timeout=30)
    except requests.RequestException as exc:
        # A network problem on one row must not abort a whole translation run.
        print("Translation Error:", exc)
        return text

    if response.status_code == 200:
        return response.json()["translations"][0]["text"]

    print("Translation Error:", response.text)
    return text  # Return original text in case of failure

In [None]:
# Translate every transcript, keeping the results in row order.
translated_list = [translate_text(script) for script in df["script"]]

In [None]:
# Store the translated transcripts as a CSV, using list position as the id.
csv_path = "data/translated_scripts.csv"

with open(csv_path, mode="w", newline="", encoding="utf-8") as out_file:
    csv_writer = csv.writer(out_file)
    # Header row, then (position, translated text) pairs.
    csv_writer.writerow(["id", "script"])
    csv_writer.writerows(enumerate(translated_list))

print(f"Transcription and translation saved to {csv_path}")

Checking for rows that did not get translated, so that the DeepL function can be applied to them again.

In [None]:
# langdetect is non-deterministic by default; a fixed seed makes repeated runs
# flag the same rows.
DetectorFactory.seed = 0

df_translated = pd.read_csv("data/translated_scripts.csv")


def detect_language(text):
    """Return the language code langdetect assigns to `text`, or "Unknown" if detection fails."""
    try:
        return detect(text)
    except Exception:
        # Detection raises on input it cannot classify. Catching Exception
        # (not a bare except) still lets KeyboardInterrupt stop the cell.
        return "Unknown"


# Detect the language of each row
df_translated["language"] = df_translated["script"].astype(str).apply(detect_language)

# Rows still detected as German were not translated in the first pass.
german_index = df_translated[df_translated["language"] == "de"].index.tolist()

print(f"index with Germany Text: {german_index}")

In [None]:
# Second pass: translate only the rows that were not translated (still detected
# as German) and carry every other row over unchanged.
translated_list_second = [
    translate_text(row["script"]) if row["language"] == "de" else row["script"]
    for _, row in df_translated.iterrows()
]

In [None]:
# Persist the second-pass translations as a CSV.
csv_path = "data/translated_scripts_checked.csv"

with open(csv_path, mode="w", newline="", encoding="utf-8") as out_file:
    csv_writer = csv.writer(out_file)
    # Header row, then one (position, text) row per script.
    csv_writer.writerow(["id", "script"])
    csv_writer.writerows(enumerate(translated_list_second))

In [None]:
# Re-run language detection on the second-pass output to see whether any row
# is still German.
df_checked = pd.read_csv("data/translated_scripts_checked.csv")

checked_languages = df_checked["script"].astype(str).map(detect_language)
df_checked["language_checked"] = checked_languages

still_german = checked_languages == "de"
german_index = df_checked.index[still_german].tolist()

print(f"index with Germany Text: {german_index}")

spaCy framework for efficient category and keyword handling

In [27]:
# Print named entities, noun phrases and verb lemmas for every translated script.
df_translated = pd.read_csv("data/translated_scripts_checked.csv")
nlp = spacy.load("en_core_web_sm")

# astype(str) guards against empty transcripts, which pandas loads as float NaN
# and which nlp() cannot process.
for index, text in df_translated["script"].astype(str).items():
    print(index)
    doc = nlp(text)
    for entity in doc.ents:
        print("This is an Entity", entity.text, entity.label_)
    print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
    print("Verbs: ", [token.lemma_ for token in doc if token.pos_ == "VERB"])

0
This is an Entity AfD PERSON
This is an Entity Germany GPE
This is an Entity February 23 DATE
This is an Entity Germany GPE
Noun phrases: [' A country', 'engineers', 'inventors', 'bans', 'No combustion engines', 'no oil and gas heating', 'We', 'no more ideological experiments', 'The AfD', 'genuine technological openness', 'paternalism', 'Progress', 'a diversity', 'ideas', 'green regulations', 'bans', 'We', 'Germany', 'a country', 'innovation', 'February', 'Alternative', 'Germany']
Verbs:  ['slow', 'say', 'commit', 'research', 'come', 'turn', 'vote']
1
This is an Entity the hundreds of thousands CARDINAL
This is an Entity AfD PERSON
This is an Entity February 23 DATE
Noun phrases: [' The completely uncontrolled, illegal, mass immigration', 'People', 'our country', 'the hundreds', 'thousands', 'We', 'that', 'That', 'we', 'AfD', 'February']
Verbs:  ['uncontrolle', 'pour', 'cope', 'vote']
2
This is an Entity Friedrich Merz PERSON
This is an Entity December DATE
This is an Entity one CARD

In [31]:
# Categories and the keywords that assign a text to each of them
categories = {
    "Anti-Immigration": ["immigration", "refugees", "borders", "asylum", "illegal", "invasion"],
    "Nationalism & Identity": ["homeland", "patriotism", "people", "sovereignty", "Germany first", "national pride"],
    "Anti-Elite & Populism": ["fake news", "establishment", "mainstream media", "deep state", "political elite"],
    "Climate Change Skepticism": ["climate hoax", "CO2 tax", "green agenda", "climate change scam"],
    "Law & Order / Authoritarianism": ["crime", "police", "law", "security", "strict laws", "criminals", "justice"],
    "Anti-EU & Anti-Globalization": ["EU dictatorship", "Brussels", "sovereignty", "globalists", "great reset"]
}


def classify_text(text):
    """Return the comma-separated categories whose keywords occur in `text`.

    Matching is case-insensitive and anchored at the start of a word, so "law"
    matches "law", "laws" and "lawful" but not "flawless" or "outlaw".
    Categories are reported in the order they appear in `categories`, each at
    most once. Returns "Other" when nothing matches. Non-string input is
    converted with str() first.
    """
    text = str(text).lower()

    matched_categories = [
        category
        for category, keywords in categories.items()
        # any() stops at the first hit, so a category is never listed twice.
        if any(re.search(r"\b" + re.escape(keyword.lower()), text) for keyword in keywords)
    ]

    return ", ".join(matched_categories) if matched_categories else "Other"

# Classify every script and attach the result as a new "Category" column
script_categories = df_translated["script"].astype(str).map(classify_text)
df_translated["Category"] = script_categories

# Write the categorized table to disk
df_translated.to_csv("categorized_speeches.csv", index=False, encoding="utf-8")

# Display the scripts next to their categories
df_translated.loc[:, ["script", "Category"]]



Unnamed: 0,script,Category
0,A country of engineers and inventors is being...,Other
1,"The completely uncontrolled, illegal, mass im...","Anti-Immigration, Nationalism & Identity"
2,Friedrich Merz took the influx limitation law...,Law & Order / Authoritarianism
3,Can you still remember that? Agnes Strack-Zim...,Other
4,Welcome to the big AfD clique calendar. Today...,Nationalism & Identity
...,...,...
185,"Wuh! Isn't that great? Actually, that's what ...",Nationalism & Identity
186,ставeeocoolcomjpi Anchorkee Access字幕пис,Other
187,Then the big farmers' protests. Brandenburg G...,Nationalism & Identity
188,We'll be right back.,Other
