<a href="https://colab.research.google.com/github/Sabaudian/Conversational_Agents_project/blob/main/load_and_process_movie_corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [None]:
# === Install === #
!pip install -q wordcloud
!pip install -q contractions


In [None]:
# === Import === #
import os
import re
import sys
import html
import json
import nltk
import string
import zipfile
import requests
import warnings
import contractions

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from google.colab import drive
from collections import Counter
from wordcloud import WordCloud
from nltk.corpus import stopwords
from scipy.stats import gaussian_kde
from typing import Any, Dict, List, Tuple
from sklearn.model_selection import train_test_split


In [None]:
# === Constants & Set-Up === #
# Everything in this cell runs at execution time: it mounts Google Drive,
# silences warnings and downloads the NLTK stopword list.

# Google Drive Paths
drive.mount("/content/drive", force_remount=True)                                            # Mount Google Drive
DRIVE_BASE_DIR = "/content/drive/MyDrive/Colab_Notebooks"                                    # Path to Drive main directory

# URL
URL = "https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip"   # URL of the dataset

# Directory & Paths
# The raw download (ZIP_PATH, UTTERANCES_PATH) lives under the current working
# directory, while processed outputs (DATA_DIR, CORNELL_DIR) live on Drive.
ROOT_DIR = os.getcwd()                                                                       # Get current working directory (Root)
DATA_DIR = os.path.join(DRIVE_BASE_DIR, "data")                                              # Path to save data (general directory)
CORNELL_DIR = os.path.join(DATA_DIR, "cornell")                                              # Path to save Cornell movie-corpus processed data
ZIP_PATH = os.path.join(ROOT_DIR, "movie-corpus.zip")                                        # Path to the zipped dataset
UTTERANCES_PATH = os.path.join(ROOT_DIR, "movie-corpus", "utterances.jsonl")                 # Path to the utterances file (JSON Lines format)
PROCESSED_DATA_PATH = os.path.join(CORNELL_DIR, "processed_data.csv")                        # Path to the processed data file (CSV format)

# Warnings Set-up
# NOTE: this hides *all* warnings for the rest of the session, not only noisy ones.
warnings.filterwarnings("ignore")                                                            # Suppress warnings for cleaner output during execution

# NLTK Downloader
# The stopword list is needed by the WordCloud cell (nltk.corpus.stopwords).
nltk.download("stopwords")


In [None]:
# === Define Working environment === #
def makedirs(dir_path: str) -> None:
    """
    Create a directory (including missing parents) if it doesn't exist.

    Args:
        dir_path: Path to the directory to be created.

    Raises:
        FileExistsError: If 'dir_path' exists but is not a directory.
    """
    if os.path.isdir(dir_path):
        print(f"> Directory '{dir_path}' already exists.")
        return

    # exist_ok=True tolerates the directory appearing between the check above
    # and the creation; a regular file at this path still raises.
    os.makedirs(dir_path, exist_ok=True)
    print(f"> Directory '{dir_path}' created.")


# Create the output directories: the general data directory and the Cornell sub-directory
for dir_path in [DATA_DIR, CORNELL_DIR]:
    makedirs(dir_path)


# Download the dataset

In [None]:
# Download dataset
def download_and_extract_dataset(source_url: str, zip_file: str, destination: str) -> None:
    """
    Download the dataset archive (unless it is already on disk), extract it,
    then delete the archive.

    An archive that already exists at 'zip_file' (e.g. left over from an
    interrupted run) is extracted instead of being deleted unused.

    Args:
        source_url: url to the data file.
        zip_file: path to the compressed file.
        destination: where to extract the data.

    Raises:
        SystemExit: If the server does not answer with HTTP 200.
    """
    if not os.path.exists(zip_file):
        print(f"\n> Downloading data from '{source_url}' to '{destination}'")
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        response = requests.get(url=source_url, timeout=60)
        successful_code = 200

        if response.status_code != successful_code:
            print(f"\n> Failed to download data. Status code: {response.status_code}")
            sys.exit(1)

        with open(zip_file, mode="wb") as f:
            f.write(response.content)

    # Extracts the content of a compressed '.zip' file
    with zipfile.ZipFile(zip_file, "r") as archive:
        print(f"\n> Unzipping data '{zip_file}' to '{destination}'")
        archive.extractall(path=destination)

    # Delete the zip file only after a successful extraction
    os.remove(zip_file)


In [None]:
# Download the archive to ZIP_PATH and unzip it into ROOT_DIR (the archive is deleted afterwards)
download_and_extract_dataset(source_url=URL, zip_file=ZIP_PATH, destination=ROOT_DIR)


## Brief Data Analysis

In [None]:
# 'utterance.jsonl' file structure
def analyze_utterances(file_path: str) -> None:
    """
    Prints the keys, value types and values of the first record in 'utterances.jsonl'.

    A missing file or an undecodable first line is reported on stdout
    instead of raising.

    Args:
        file_path: Path to 'utterances.jsonl' file.
    """
    print(f"\n> Analyzing '{file_path}' file...")

    # Only the first line is needed to inspect the record structure.
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            header = handle.readline()
    except FileNotFoundError:
        print(f"\n> Error: File not found at '{file_path}'")
        return

    # Empty file: nothing to describe.
    if not header:
        return

    try:
        record = json.loads(header)
    except json.JSONDecodeError as e:
        print(f"\n> Error decoding JSON: {e}")
        return

    # Detailed insight
    for key, value in record.items():
        print(f"-- Key: '{key}', Value Type: {type(value)}; First entry: {value}")


# Give a quick look at the data
def show_data(file_path: str, n: int = 2) -> None:
    """
    Displays the first lines of the dataset as raw bytes.

    Only the requested lines are read, so the whole file is never loaded
    into memory.

    Args:
        file_path: Path to 'utterances.jsonl' file.
        n: Number of lines to display. Values <= 0 display nothing.
    """
    print(f"\n> First {n} lines of the dataset:")
    with open(file_path, "rb") as datafile:
        # range(n) comes first so zip() stops before pulling an extra line
        # from the file once n lines have been printed.
        for _, line in zip(range(n), datafile):
            print(line)


In [None]:
# Dataset view: print the structure of the first record, then the first raw lines
analyze_utterances(file_path=UTTERANCES_PATH)
show_data(file_path=UTTERANCES_PATH)

In [None]:
# Plot text vs word count
def plot_text_length_distribution(file_path: str, max_words: int = 100) -> None:
    """
    Plots the distribution of word counts in the utterances.

    A histogram with one-word-wide bins is overlaid with a Gaussian KDE curve
    scaled to the same units as the histogram counts.

    Args:
        file_path: Path to the utterances.jsonl file.
        max_words: Maximum number of words to display on the x-axis.
    """
    word_counts = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            text = data.get("text")
            # A missing or non-string "text" counts as an empty utterance.
            words = text.split() if isinstance(text, str) else []
            word_counts.append(len(words))

    if not word_counts:
        print(f"\n> No utterances found in '{file_path}'; nothing to plot.")
        return

    word_counts = np.array(word_counts)
    # Computed once with NumPy instead of repeatedly iterating the array in Python.
    longest = int(word_counts.max())

    # Compute KDE
    kde = gaussian_kde(word_counts)
    x_kde = np.linspace(0, longest, 1000)
    y_kde = kde(x_kde)

    # Histogram
    hist_trace = go.Histogram(
        x=word_counts,
        xbins=dict(start=0, end=longest, size=1),
        name="Histogram",
        marker=dict(
            color="cornflowerblue",
            line=dict(
                color="black",
                width=1.2
            )
        ),
        opacity=0.85
    )

    # KDE line
    # The density is multiplied by the sample size so that, with bins of
    # width 1, the curve is on the same scale as the histogram counts.
    kde_trace = go.Scatter(
        x=x_kde,
        y=y_kde * len(word_counts),
        mode="lines",
        name="KDE",
        line=dict(color="crimson", width=2)
    )

    # Plot
    fig = go.Figure(data=[hist_trace, kde_trace])
    fig.update_layout(
        title={
            "text": "<b>Words Count Distribution Across Utterances in Dataset</b>",
            "x": 0.5,
            "xanchor": "center",
            "font": dict(size=18, family="Arial", color="black", weight="bold"),
        },
        xaxis_title="Number of Words per Utterance",
        yaxis_title="Frequency",
        template="plotly",
        width=1000,
        height=700,
        legend=dict(
            title="Legend", font=dict(size=12),
            bordercolor="black", borderwidth=1
        ),
    )
    # Only the visible range is limited; longer utterances are still counted.
    fig.update_xaxes(range=[0, max_words])
    fig.show()


In [None]:
# Text length distribution
plot_text_length_distribution(file_path=UTTERANCES_PATH)


In [None]:
# Conversation length analysis
def plot_conversation_lenght_analysis(file_path: str) -> None:
    """
    Generates a scatter plot to analyze the lengths of conversations in the dataset.

    Each point is one conversation; conversations are sorted by their number
    of utterances, so the curve rises from the shortest to the longest.

    NOTE: the misspelling in the function name is kept so existing callers
    continue to work.

    Args:
        file_path: Path to the 'utterances.jsonl' file.
    """
    # Count utterances per conversation while streaming the file
    with open(file_path, "r", encoding="utf-8") as f:
        conv_length_counter = Counter(
            json.loads(line)["conversation_id"] for line in f
        )

    conversation_lengths = sorted(conv_length_counter.values())
    indices = list(range(len(conversation_lengths)))

    # Create DataFrame for plotting
    cdf = pd.DataFrame({
        "conversation_index": indices,
        "conversation_length": conversation_lengths
    })

    # Plot
    fig = px.scatter(
        cdf,
        x="conversation_index",
        y="conversation_length",
        labels={
            "conversation_index": "Count",
            "conversation_length": "Length"
        },
    )
    # Layout
    fig.update_layout(
        title={
            "text": "<b>Conversation Length Analysis</b>",
            "x": 0.5,
            "xanchor": "center"
        },
        title_font=dict(size=18, family="Arial", color="black"),
        xaxis_title_font=dict(size=16, family="Arial", color="black"),
        yaxis_title_font=dict(size=16, family="Arial", color="black"),
        font=dict(size=14),
        width=1000,
        height=700,
        template="plotly"
    )

    fig.update_yaxes(matches=None)
    fig.show()


In [None]:
# Plot conversation length
plot_conversation_lenght_analysis(file_path=UTTERANCES_PATH)


# Preprocessing

In [None]:
# Parse individual utterance lines
def parse_line(line_json: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract relevant fields.

    Args:
        line_json: A dictionary containing the JSON line data.
    Returns:
        A dictionary containing the extracted fields.
    Raises:
        KeyError: If 'id', 'speaker' or 'text' is missing from the record.
    """
    return {
        "line_id": line_json["id"],             # Unique identifier for each utterance
        "character_id": line_json["speaker"],   # Character who speaks the line
        "text": line_json["text"],              # Raw text of the utterance
    }


def _line_sort_key(line_id: Any) -> Tuple[int, int, str]:
    """Return a key that orders IDs by their numeric part ('L999' before 'L1000')."""
    # NOTE(review): assumes IDs look like a prefix followed by a number
    # (e.g. 'L1045') — confirm against the corpus. IDs without digits sort
    # last, alphabetically.
    as_text = str(line_id)
    digits = re.sub(r"\D", "", as_text)
    if digits:
        return (0, int(digits), as_text)
    return (1, 0, as_text)


# Parse the entire dataset into structured format
def parse_conversations(file_path: str) -> Dict[str, Any]:
    """
    Parses a JSONL file of conversations and groups the utterances by conversation.

    Args:
        file_path: The path to the 'utterances.jsonl' file.

    Returns:
        A dictionary keyed by conversation id. Each value holds the
        'conversation_id', the 'movie_id' and 'lines', the list of parsed
        utterances sorted by the numeric part of their line id.

    Raises:
        FileNotFoundError: If 'file_path' does not exist.
    """
    # ensure the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    conversations = {}

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            line_json = json.loads(line)
            line_obj = parse_line(line_json)
            conv_id = line_json["conversation_id"]

            # Initialize conversation if not seen before
            if conv_id not in conversations:
                conversations[conv_id] = {
                    "conversation_id": conv_id,
                    "movie_id": line_json["meta"]["movie_id"],
                    "lines": []
                }
            conversations[conv_id]["lines"].append(line_obj)

    # Ensure each conversation's lines are sorted in order of appearance.
    # A plain string sort would put 'L1000' before 'L999', so sort numerically.
    for conv in conversations.values():
        conv["lines"].sort(key=lambda x: _line_sort_key(x["line_id"]))

    return conversations


# Clean and normalize dialogue text
def clean_dialogue(text: str) -> str:
    """
    Cleans and normalizes dialogue text.

    - Decodes HTML entities and strips markup tags.
    - Expands contractions.
    - Turns dashes/hyphens into spaces so the joined words are not fused.
    - Removes every character other than letters, digits, whitespace and . , ! ? '
    - Collapses repeated punctuation and whitespace.
    - Lowercases the result.

    Args:
        text: Raw dialogue string.

    Returns:
        Cleaned and normalized text string (may be empty).
    """
    # NOTE(review): assumes raw script text can carry entities such as '&amp;'
    # and tags such as '<u>...</u>' — confirm against the corpus. Without this
    # step the character filter below would leave residue like 'amp' or 'uyesu'.
    text = html.unescape(text)
    text = re.sub(r"</?[a-zA-Z][^<>]*>", " ", text)

    # Expand contractions (e.g., can't -> cannot)
    text = contractions.fix(text)

    # Dashes separate words ('wait--what', 'twenty-one'); replace them with a
    # space before filtering, otherwise the words would be glued together.
    text = re.sub(r"[-\u2010-\u2015]+", " ", text)

    # Remove unwanted characters (keep letters, numbers, and basic punctuation)
    text = re.sub(r"[^a-zA-Z0-9.,!?'\s]", "", text)

    # Normalize repeated punctuation and spaces
    text = re.sub(r"\.{2,}", ".", text)         # Convert '...' -> '.'
    text = re.sub(r"(\!|\?){2,}", r"\1", text)  # Convert '!!!' -> '!'
    text = re.sub(r"\s+", " ", text)            # Collapse any whitespace run (incl. tabs/newlines)

    # Lowercase
    return text.strip().lower()


# Build context-response pairs for model training
def create_context_response_dataframe(conversations: dict, context_num: int = 5) -> pd.DataFrame:
    """
      Extract context-response pairs from conversation data and create a DataFrame.
      Each row will have a specified number of context columns followed by a response column.

      Columns are 'context_0' ... 'context_{context_num - 1}' (oldest to most
      recent utterance) followed by 'response' (the utterance that comes next).
      The resulting DataFrame is also written to PROCESSED_DATA_PATH.

      Args:
          conversations: A dictionary containing conversations.
          context_num: The number of context columns to include in each row.

      Returns:
          A cleaned DataFrame containing context-response pairs.
    """
    context_response_pairs = []
    skipped_lines = 0
    total_lines = 0

    for conversation in conversations.values():
        # Filter and clean valid lines
        cleaned_lines = []
        for entry in conversation["lines"]:
            total_lines += 1
            text = entry.get("text")

            cleaned = ""
            if text and isinstance(text, str) and text.strip().lower() != "none":
                cleaned = clean_dialogue(text)

            # Utterances that are empty after cleaning would be written as
            # empty CSV cells and read back as NaN, so they are skipped too.
            if cleaned:
                cleaned_lines.append(cleaned)
            else:
                skipped_lines += 1

        if len(cleaned_lines) > context_num:
            # Slide a window of 'context_num' utterances plus the following one
            for i in range(context_num, len(cleaned_lines)):
                context = cleaned_lines[i - context_num:i]
                response = cleaned_lines[i]
                context_response_pairs.append(context + [response])

    # Build context-response df.
    # The labels follow the row layout above (context first, response last);
    # labelling the first column 'response' would mark the oldest context
    # utterance as the response.
    columns = [f"context_{i}" for i in range(context_num)] + ["response"]
    df = pd.DataFrame(context_response_pairs, columns=columns)
    cleaned_df = df.dropna().astype(str)  # Drop any lingering NaNs

    # Logging
    print(f"\n - Skipped {skipped_lines} invalid utterances out of {total_lines} total.")
    print(f"- Dropped {len(df) - len(cleaned_df)} rows with NaNs out of {len(df)} generated pairs.")
    print(f"- Final dataset size: {len(cleaned_df)} rows")

    # Save to csv
    cleaned_df.to_csv(path_or_buf=PROCESSED_DATA_PATH, index=False)

    return cleaned_df


In [None]:
# Split Dataset
def split_dataset(df: pd.DataFrame, seed: int = 42) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Shuffle the dataset and cut it into train / validation / test parts (80/10/10).

    Args:
        df: Input DataFrame.
        seed: Random seed for reproducibility.

    Returns:
        Tuple containing train, validation, and test DataFrames.
    """
    holdout_fraction = 0.2        # share of all rows kept out of training
    test_share_of_holdout = 0.5   # the held-out rows are halved into validation and test

    # Step 1: carve the held-out rows away from the training rows
    train_set, holdout = train_test_split(
        df,
        test_size=holdout_fraction,
        random_state=seed,
        shuffle=True,
    )

    # Step 2: divide the held-out rows between validation and test
    valid_set, test_set = train_test_split(
        holdout,
        test_size=test_share_of_holdout,
        random_state=seed,
        shuffle=True,
    )

    return train_set, valid_set, test_set


# Save All Splits to Disk
def save_splits(train_set: pd.DataFrame, valid_set: pd.DataFrame, test_set: pd.DataFrame, base_dir: str) -> None:
    """
    Write the three dataset splits as CSV files (without the index) into 'base_dir'.

    Args:
        train_set: Train set DataFrame.
        valid_set: Validation set DataFrame.
        test_set: Test set DataFrame.
        base_dir: Base directory to save the data.
    """
    makedirs(dir_path=base_dir)

    # File name -> split, written in train / valid / test order
    outputs = (
        ("cornell_train.csv", train_set),
        ("cornell_valid.csv", valid_set),
        ("cornell_test.csv", test_set),
    )
    for file_name, split in outputs:
        split.to_csv(os.path.join(base_dir, file_name), index=False)

    print(f"\n> Data saved to {base_dir}")


# Pre-processing Steps
def preprocess_pipeline(utterances_path: str, output_dir: str, context_num: int = 5) -> None:
    """
    Run the full preprocessing chain: parse, clean, pair, split and save.

    Args:
        utterances_path: Path to the 'utterances.jsonl' file.
        output_dir: Directory to save the preprocessed data.
        context_num: The number of context columns to include in each row.
    """
    # 1) Group raw utterances by conversation
    print("\n> Parsing conversations...")
    parsed = parse_conversations(utterances_path)

    # 2) Clean the text and build the context/response table
    print("\n> Cleaning and building context-response pairs...")
    pairs_df = create_context_response_dataframe(parsed, context_num=context_num)

    # 3) Split into train / validation / test
    print(f"\n> Dataset size: {len(pairs_df)} rows")
    splits = split_dataset(pairs_df)
    train_part, valid_part, test_part = splits

    # 4) Persist the splits
    print(f"\n> Train: {len(train_part)} | Val: {len(valid_part)} | Test: {len(test_part)}")
    save_splits(train_part, valid_part, test_part, base_dir=output_dir)


In [None]:
# === Execute Pre-processing === #
# Reads the utterances file and writes the processed CSV and the three split CSVs under CORNELL_DIR
preprocess_pipeline(utterances_path=UTTERANCES_PATH, output_dir=CORNELL_DIR)


In [None]:
# Load the processed CSV back from disk and preview its first rows
processed_df = pd.read_csv(filepath_or_buffer=PROCESSED_DATA_PATH)
processed_df.head()


In [None]:
# Plot WordCloud
def plot_wordcloud(csv_path: str, min_sentence_length: int = 3) -> None:
    """
    Processes the 'response' column from a CSV file and generates a WordCloud.

    Responses are stripped of everything except letters; a response is kept
    only if it has at least 'min_sentence_length' words and at least one word
    that is not an English stopword.

    Args:
        csv_path: Path to the CSV file containing response-context data.
        min_sentence_length: Minimum length of a valid sentence. Defaults to 3.

    Raises:
        FileNotFoundError: If 'csv_path' does not exist.
        ValueError: If the CSV has no 'response' column.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"The file '{csv_path}' does not exist.")

    df = pd.read_csv(csv_path)
    if "response" not in df.columns:
        raise ValueError("The CSV file must contain a 'response' column.")

    # Loaded once and reused for both the filtering and the WordCloud itself
    stopwords_set = set(stopwords.words("english"))
    non_letters = re.compile(r"[^A-Za-z\s]")
    whitespace_run = re.compile(r"\s+")
    processed_lines = []

    # astype(str): purely numeric cells may be parsed as numbers by read_csv,
    # which would make the regex substitution below fail.
    for response in df["response"].dropna().astype(str):
        # Remove punctuation and multiple spaces
        response = non_letters.sub(" ", response)
        response = whitespace_run.sub(" ", response).strip()

        # Check sentence length and relevance
        words = response.split()
        if len(words) >= min_sentence_length and any(word.lower() not in stopwords_set for word in words):
            processed_lines.append(response)

    if not processed_lines:
        # WordCloud cannot be generated from an empty text.
        print("\n> No responses left after filtering; skipping WordCloud.")
        return

    # Generate and plot WordCloud
    text = " ".join(processed_lines)
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
        colormap="viridis",
        stopwords=stopwords_set,
        random_state=42,
    ).generate(text.lower())

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("WordCloud of Cornell Movie-Corpus")
    plt.show()


In [None]:
# Plot the WordCloud built from the 'response' column of the processed CSV
plot_wordcloud(csv_path=PROCESSED_DATA_PATH)
