In [1]:
import os
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK data (only needs to be run once; re-running is harmless,
# the recorded output reports both packages as already up-to-date):
#   - 'stopwords': the corpus read later via stopwords.words('english')
#   - 'punkt': presumably for word_tokenize, which is imported above but not
#     called in the cells visible here — TODO confirm it is still needed
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to C:\Users\Pranay
[nltk_data]     Malhotra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Pranay
[nltk_data]     Malhotra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Folder where transcripts are stored. The path is relative, so it is resolved
# against the current working directory; the loading loop reads every ".txt"
# file found directly inside it.
data_folder = "data"


In [3]:
# Load NLTK's list of common English stop words once and keep it as a set,
# which gives clean_text fast membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))


In [4]:
def clean_text(text, stopword_set=None):
    """Lowercase ``text`` and drop stop words and non-alphabetic tokens.

    Stop words are matched *before* punctuation is removed, so contractions
    such as "don't" or "we'll" are recognised as stop words instead of leaking
    through as "dont" / "well". Typographic quotes are mapped to their ASCII
    forms first so that curly apostrophes behave the same as straight ones and
    quoted words are not discarded by the alphabetic check.

    Args:
        text: Raw transcript text.
        stopword_set: Optional collection of lowercase stop words to remove.
            Defaults to the module-level ``stop_words``.

    Returns:
        A tuple ``(cleaned_text, cleaned_tokens)``: ``cleaned_tokens`` is the
        list of kept words in their original order, and ``cleaned_text`` is
        those words joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Curly single/double quotes (U+2018, U+2019, U+201C, U+201D) -> ASCII,
    # because string.punctuation only covers ASCII characters.
    ascii_quotes = str.maketrans("\u2018\u2019\u201c\u201d", "''\"\"")
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned_tokens = []
    # Simple tokenization (split by whitespace)
    for raw_token in text.lower().translate(ascii_quotes).split():
        # Check contractions against the stop list while the apostrophe is
        # still present; only surrounding punctuation is trimmed here.
        if raw_token.strip(string.punctuation) in stopword_set:
            continue

        # Remove the remaining punctuation, then keep purely alphabetic words
        word = raw_token.translate(strip_punctuation)
        if word.isalpha() and word not in stopword_set:
            cleaned_tokens.append(word)

    return " ".join(cleaned_tokens), cleaned_tokens




In [5]:
import nltk

# Clean download of 'punkt' into the current user's nltk_data directory.
# The location is derived from the home directory rather than a hard-coded
# absolute path, so the cell also works on other machines and accounts.
nltk.download('punkt', download_dir=os.path.join(os.path.expanduser('~'), 'nltk_data'))


[nltk_data] Downloading package punkt to C:/Users/Pranay
[nltk_data]     Malhotra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Initialize list to store one record per transcript
all_data = []

# Loop over all .txt files in data folder. os.listdir returns entries in
# arbitrary order, so sort them to get the same row order on every run.
for file_name in sorted(os.listdir(data_folder)):
    if not file_name.endswith(".txt"):
        continue

    # splitext removes only the trailing extension; str.replace would also
    # strip any ".txt" that happens to occur earlier in the file name.
    comedian_name = os.path.splitext(file_name)[0]

    with open(os.path.join(data_folder, file_name), "r", encoding="utf-8") as f:
        content = f.read()

    # The file is already closed here; cleaning only needs the in-memory text
    cleaned_text, tokens = clean_text(content)
    all_data.append({
        "comedian": comedian_name,
        "original_text": content,
        "cleaned_text": cleaned_text,
        "tokens": tokens
    })


In [7]:
# Convert list of dictionaries to pandas DataFrame: one row per transcript,
# with columns taken from the dict keys (comedian, original_text,
# cleaned_text, tokens).
df = pd.DataFrame(all_data)


In [8]:
# Preview the first rows of the cleaned data as a quick sanity check
df.head()


Unnamed: 0,comedian,original_text,cleaned_text,tokens
0,kenny,You guys are the positive audience. Okay? Ther...,guys positive audience okay positive audience ...,"[guys, positive, audience, okay, positive, aud..."
1,urooj,Are you guys aware of the cameras? Are you fee...,guys aware cameras feeling conscious cameras c...,"[guys, aware, cameras, feeling, conscious, cam..."
2,varun,Are you ready?\n\nAre you excited?\n\nSo put y...,ready excited put hands together welcome stage...,"[ready, excited, put, hands, together, welcome..."
3,virdas1,I lost 80% of my mind. It’s very freeing. You ...,lost mind freeing see look faces right way oh ...,"[lost, mind, freeing, see, look, faces, right,..."
4,virdas2,[Vir Das] What you’re about to watch wasn’t su...,vir das watch supposed happen completely unscr...,"[vir, das, watch, supposed, happen, completely..."


In [9]:
# Save the cleaned transcripts to CSV without writing the DataFrame index.
# NOTE(review): the "tokens" column holds Python lists, which CSV stores as
# their string representation — reading the file back gives strings, not
# lists. Confirm downstream code re-parses them or uses "cleaned_text".
df.to_csv("cleaned_transcripts.csv", index=False)
