# Detecting BDM in Super Bowl Commercials

## Notebook Settings

In [None]:

%pip install -r requirements.txt

In [2]:
import pandas as pd
import os

In [3]:
import logging
from datetime import datetime

# Route all notebook log records (INFO and above) to a file next to the notebook.
_LOG_SETTINGS = {
    'datefmt': '%Y-%m-%d %H:%M:%S',
    'format': '%(asctime)s - %(levelname)s - %(message)s',
    'level': logging.INFO,
    'filename': 'log.log',
}
logging.basicConfig(**_LOG_SETTINGS)

In [4]:
# Optional domain knowledge: keyword lists per industry (product category) and per brand.
# These keyword lists only exist for some brands and products, so each source can be
# switched on or off independently. The flags are read again by the feature and
# modelling cells further down.
INDUSTRY_SPECIFIC_AWARENESS = True
BRAND_SPECIFIC_AWARENESS = True

# Debugging aid: when True, the baseline table is cut down to its first 20 commercials.
# NOTE(review): this is currently enabled, so a freshly built baseline (and everything
# derived from it) only covers 20 rows - switch it off for a full run.
REDUCED_SELECTION = True



In [5]:
# Checkpoints
# Feature extraction is slow, so every stage writes its result to csvs/<stage>.csv.
# A stage is only recomputed when its CSV "checkpoint" is missing.

def _checkpoint_exists(label, csv_name):
    """Report (and log) whether the checkpoint file ``csvs/<csv_name>.csv`` exists."""
    found = os.path.exists(f'csvs/{csv_name}.csv')
    logging.info(f"{label} checkpoint: {found}")
    return found


BASELINE_CHECKPOINT = _checkpoint_exists('Baseline', 'baseline')
TRANSCRIPT_CHECKPOINT = _checkpoint_exists('Transcript', 'transcript')
OCR_CHECKPOINT = _checkpoint_exists('OCR', 'ocr')
BDM_WORDS_CHECKPOINT = _checkpoint_exists('BDM words', 'bdm_words')
ADJ_NOUN_PAIRS_CHECKPOINT = _checkpoint_exists('Adj noun pairs', 'adj_noun_pairs')
SEMANTIC_SIMILARITY_CHECKPOINT = _checkpoint_exists('Semantic similarity', 'semantic_similarity')
PERSONAL_PRONOUNS_CHECKPOINT = _checkpoint_exists('Personal pronouns', 'personal_pronouns')

# Set to True to wipe every saved CSV under csvs/ and rerun the whole notebook from scratch.
RESTART_FROM_SCRATCH = False


In [6]:
import shutil

# Optional hard reset: remove every cached CSV so all stages are recomputed.
csv_dir = "./csvs"
if RESTART_FROM_SCRATCH:
    if os.path.exists(csv_dir):
        shutil.rmtree(csv_dir)
        logging.info(f"Directory '{csv_dir}' has been deleted.")

    # The *_CHECKPOINT flags were evaluated before the directory was removed.
    # Reset them here; otherwise the stages below would skip their work and then
    # fail when trying to read CSV files that no longer exist.
    BASELINE_CHECKPOINT = False
    TRANSCRIPT_CHECKPOINT = False
    OCR_CHECKPOINT = False
    BDM_WORDS_CHECKPOINT = False
    ADJ_NOUN_PAIRS_CHECKPOINT = False
    SEMANTIC_SIMILARITY_CHECKPOINT = False
    PERSONAL_PRONOUNS_CHECKPOINT = False

# CRISP-DM 3: Data Preparation

In [7]:

def download_videos():
    """Download the commercials archive and unpack it into the ``ADs`` directory.

    Uses the external ``wget`` and ``unzip`` tools. The temporary archive is
    removed afterwards, also when one of the steps fails.

    Raises:
        subprocess.CalledProcessError: if ``wget`` fails, or ``unzip`` reports
            anything worse than a warning.
    """
    import subprocess  # local import keeps this cell self-contained

    url = 'https://box.fu-berlin.de/s/zwxKp8PXkCwAwGe/download'
    download_filename = 'downloaded_archive.zip'
    target_directory = 'ADs'

    try:
        # Argument lists avoid shell string interpolation; check=True surfaces
        # download failures instead of silently continuing with an empty file.
        subprocess.run(['wget', '-O', download_filename, url], check=True)
        os.makedirs(target_directory, exist_ok=True)

        unzip_cmd = ['unzip', '-o', download_filename, '-d', target_directory]
        unzip_result = subprocess.run(unzip_cmd)
        # unzip exits with 1 for warnings even though extraction completed.
        if unzip_result.returncode not in (0, 1):
            raise subprocess.CalledProcessError(unzip_result.returncode, unzip_cmd)
    finally:
        if os.path.exists(download_filename):
            os.remove(download_filename)

    logging.info(f"Archive extracted to {target_directory} and {download_filename} removed.")


# The videos are only fetched when the baseline table has to be (re)built.
if not BASELINE_CHECKPOINT:
    download_videos()

In [8]:
if not BASELINE_CHECKPOINT:
    # BDM labels per commercial (AdNumber) and the result sheet they are attached to.
    BDM_excel = pd.read_excel('BDM.xlsx')
    final_excel = pd.read_excel('previous_project_results.xlsx')

    # Attach the BDM label from BDM.xlsx. If the result sheet already carries a
    # BDM column it is suffixed to 'BDM_old' by the merge and dropped right away.
    final_excel = final_excel.merge(
        BDM_excel[['AdNumber', 'BDM']],
        on='AdNumber',
        how='left',
        suffixes=('_old', ''),
    ).drop('BDM_old', axis=1, errors='ignore')

    # One row per distinct (product type, brand, commercial, label) combination.
    # groupby() skips rows with a missing value in any key column (pandas default).
    ad_df = (
        final_excel
        .groupby(['cont_primary_product_type', 'BRAND', 'AdNumber', 'BDM'])
        .size()
        .reset_index(name='count')
        .rename(columns={
            'cont_primary_product_type': 'product_category',
            'BRAND': 'brand',
            'AdNumber': 'commercial_number',
        })
        .drop(columns=['count'])
    )

    # A bare expression inside an `if` block is not rendered by Jupyter, so the
    # preview has to be displayed explicitly.
    display(ad_df.head(10))


## Brand Keywords

In [9]:
if not BASELINE_CHECKPOINT:
    if BRAND_SPECIFIC_AWARENESS:
        product_brand_df = pd.read_csv("product_brands.csv")

        # Brand names are matched case-insensitively and without spaces.
        product_brand_df['brand'] = product_brand_df['brand'].str.replace(' ', '').str.lower()
        brand_keywords = (
            product_brand_df[['brand', 'product_brand_keywords']]
            .rename(columns={'brand': 'brand_clean'})
        )

        ad_df = (
            ad_df
            # Dropping a keyword column left over from an earlier run keeps the
            # cell re-runnable without producing suffixed duplicate columns.
            .drop(columns=['product_brand_keywords'], errors='ignore')
            .assign(brand_clean=lambda frame: frame['brand'].str.replace(' ', '').str.lower())
            # Left join: commercials whose brand has no keyword entry keep NaN.
            .merge(brand_keywords, on='brand_clean', how='left')
            .drop(columns=['brand_clean'])
        )

        # A bare expression inside an `if` block is not rendered by Jupyter.
        display(ad_df.head(10))

## Product Category Keywords

In [10]:
if not BASELINE_CHECKPOINT:
    if INDUSTRY_SPECIFIC_AWARENESS:
        import ast  # safe parsing of the stringified brand lists

        product_brands_df = pd.read_csv("product_categories.csv")
        product_brands_df = product_brands_df.drop('product_cat_id', axis=1)
        # The category columns from product_categories.csv replace the original one.
        ad_df = ad_df.drop('product_category', axis=1, errors='ignore')
        display(product_brands_df)
        display(ad_df)

        # Map every normalised brand name to the category row that lists it.
        brand_to_info = {}
        for _, row in product_brands_df.iterrows():
            category_info = {col: row[col] for col in product_brands_df.columns}
            # 'product_cat_brands' holds a Python list literal as text.
            # literal_eval parses it without executing arbitrary code from the CSV.
            brands = ast.literal_eval(row['product_cat_brands'])
            for brand in brands:
                brand = brand.replace(' ', '').lower()
                brand_to_info[brand] = category_info

        def find_brand_info(brand):
            """Return the category row (as dict) for a brand, or None if unmapped."""
            clean_brand = brand.replace(' ', '').lower()
            return brand_to_info.get(clean_brand)

        # Look each brand up once, then spread the category fields over columns.
        matched_info = ad_df['brand'].apply(find_brand_info)
        for col in product_brands_df.columns:
            ad_df[col] = matched_info.apply(lambda info: info[col] if info else None)

        unmapped_brands = ad_df[ad_df['product_cat_name'].isna()]['brand'].unique()
        if len(unmapped_brands) > 0:
            logging.info("Brands without category mapping:")
            for brand in unmapped_brands:
                logging.info(f"- {brand}")

        # A bare expression inside an `if` block is not rendered by Jupyter.
        display(ad_df.head(10))

In [11]:
if not BASELINE_CHECKPOINT:
    # Debug mode keeps only the first 20 commercials.
    ad_df = ad_df.head(20) if REDUCED_SELECTION else ad_df

    # Make sure the checkpoint directory exists before writing into it.
    directory = 'csvs'
    checkpoint_dir_missing = not os.path.exists(directory)
    if checkpoint_dir_missing:
        os.makedirs(directory)

    ad_df.to_csv('csvs/baseline.csv', index=False)

## Feature Extraction

### Feature: Transcript

In [12]:
# Always continue from the baseline checkpoint on disk, whether it was written just now or in an earlier run.
ad_df = pd.read_csv('csvs/baseline.csv')

In [None]:
import glob
from pathlib import Path
from transcript import transcribe_video
from ocr import ocr

if not TRANSCRIPT_CHECKPOINT:
    ads_dir = "ADs"

    def find_video_file(commercial_number, ads_dir):
        """Find the video file path for a given commercial number.

        Searches ``ads_dir`` recursively for ``<commercial_number>.mp4`` and
        returns the first match, or None when no such file exists.
        """
        pattern = f"{ads_dir}/**/{commercial_number}.mp4"
        matches = glob.glob(pattern, recursive=True)
        return matches[0] if matches else None

    # Commercials without a video keep an empty transcript.
    ad_df['transcript'] = ''

    for idx, row in ad_df.iterrows():
        commercial_number = row['commercial_number']
        video_path = find_video_file(commercial_number, ads_dir)

        if video_path:
            transcript = transcribe_video(video_path)
            ad_df.at[idx, 'transcript'] = transcript
        else:
            logging.info(f"Video not found for commercial {commercial_number}")

    ad_df[['commercial_number', 'transcript']].to_csv('csvs/transcript.csv', index=False)

    # A bare expression inside an `if` block is not rendered by Jupyter.
    display(ad_df.head(10))

### Feature: OCR

In [14]:

# Attach the transcripts from their checkpoint file.
transcript_df = pd.read_csv('csvs/transcript.csv')
# When the transcripts were computed in this session (or this cell is re-run),
# ad_df already has a 'transcript' column. Merging on top of it would create
# 'transcript_x' / 'transcript_y' and break every later row['transcript'] lookup,
# so stale copies are dropped first.
ad_df = (
    ad_df
    .drop(columns=transcript_df.columns.difference(['commercial_number']), errors='ignore')
    .merge(transcript_df, on='commercial_number', how='left')
)

In [15]:
import glob
from pathlib import Path
from transcript import transcribe_video
from ocr import ocr

if not OCR_CHECKPOINT:
    ads_dir = "ADs"

    def find_video_file(commercial_number, ads_dir):
        """Return the first MP4 below ``ads_dir`` named after the commercial, else None."""
        candidates = glob.glob(f"{ads_dir}/**/{commercial_number}.mp4", recursive=True)
        if not candidates:
            return None
        return candidates[0]

    # Start with empty text; rows without a video file keep it.
    ad_df['ocr_text'] = ''

    for row_label, ad_row in ad_df.iterrows():
        commercial_number = ad_row['commercial_number']
        video_path = find_video_file(commercial_number, ads_dir)

        if not video_path:
            logging.info(f"Video not found for commercial {commercial_number}")
            continue

        ad_df.at[row_label, 'ocr_text'] = ocr(video_path)

    ocr_columns = ['commercial_number', 'ocr_text']
    ad_df[ocr_columns].to_csv('csvs/ocr.csv', index=False)

### Feature: Superlatives, Comparatives, Uniqueness

In [16]:
# Attach the OCR text from its checkpoint file.
ocr_df = pd.read_csv('csvs/ocr.csv')
# Drop an 'ocr_text' column left over from computing it in this session (or from
# re-running this cell); otherwise the merge would produce '_x' / '_y' duplicates.
ad_df = (
    ad_df
    .drop(columns=ocr_df.columns.difference(['commercial_number']), errors='ignore')
    .merge(ocr_df, on='commercial_number', how='left')
)

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import spacy
import pandas as pd
from collections import Counter
import text_analysis as ta
if not BDM_WORDS_CHECKPOINT:
    # One entry per term family:
    # (joined-terms column, count column, percentage column, extractor in text_analysis).
    bdm_term_families = [
        ('superlatives', 'superlative_count', 'superlative_pct', ta.get_superlatives),
        ('comparatives', 'comparative_count', 'comparative_pct', ta.get_comparatives),
        ('unique_words', 'uniqueness_count', 'uniqueness_pct', ta.get_unique_words),
    ]
    bdm_feature_columns = [
        'word_count',
        'superlative_count', 'superlative_pct',
        'comparative_count', 'comparative_pct',
        'uniqueness_count', 'uniqueness_pct',
        'total_bdm_terms_count', 'total_bdm_terms_pct',
        'superlatives', 'comparatives', 'unique_words',
    ]

    bdm_feature_rows = []
    for transcript in ad_df['transcript']:
        # NOTE(review): an empty transcript comes back from the CSV as NaN;
        # confirm that the text_analysis helpers accept that.
        word_count = len(ta.get_tokens(transcript))
        features = {'word_count': word_count}
        total_bdm_terms = 0

        for terms_col, count_col, pct_col, extract_terms in bdm_term_families:
            terms = extract_terms(transcript)
            term_count = len(terms) if terms else 0
            features[terms_col] = ', '.join(terms) if terms else ''
            features[count_col] = term_count
            # Percentages are relative to the transcript length; 0.0 when it is empty.
            features[pct_col] = term_count / word_count * 100 if word_count > 0 else 0.0
            total_bdm_terms += term_count

        # As before, the totals stay at zero for transcripts without any tokens.
        features['total_bdm_terms_count'] = total_bdm_terms if word_count > 0 else 0
        features['total_bdm_terms_pct'] = total_bdm_terms / word_count * 100 if word_count > 0 else 0.0
        bdm_feature_rows.append(features)

    bdm_features = pd.DataFrame(bdm_feature_rows, index=ad_df.index, columns=bdm_feature_columns)
    # Replace (rather than duplicate) feature columns from an earlier run of this cell.
    ad_df = ad_df.drop(columns=bdm_feature_columns, errors='ignore').join(bdm_features)

    ad_df = ad_df.sort_values(
        by=['superlative_count', 'comparative_count', 'superlative_pct', 'comparative_pct', 'uniqueness_pct'],
        ascending=False,
    )

    # 'word_count' is persisted as well, so a run that loads this checkpoint ends
    # up with the same columns as the run that computed it.
    ad_df[[
        'commercial_number', 'word_count',
        'superlatives', 'comparatives', 'unique_words',
        'superlative_count', 'comparative_count', 'uniqueness_count',
        'superlative_pct', 'comparative_pct', 'uniqueness_pct',
        'total_bdm_terms_count', 'total_bdm_terms_pct',
    ]].to_csv('csvs/bdm_words.csv', index=False)

### Feature: Adjective–Noun Pairs

In [19]:
# Attach the BDM word features from their checkpoint file.
bdm_words_df = pd.read_csv('csvs/bdm_words.csv')
# If the features were computed in this session (or this cell is re-run), ad_df
# already contains these columns. Drop them first so the merge does not create
# '_x' / '_y' suffixed duplicates.
ad_df = (
    ad_df
    .drop(columns=bdm_words_df.columns.difference(['commercial_number']), errors='ignore')
    .merge(bdm_words_df, on='commercial_number', how='left')
)

In [20]:
if not ADJ_NOUN_PAIRS_CHECKPOINT:
    # Extract the adjective-noun pairs of every transcript and count them.
    pairs_per_ad = ad_df["transcript"].apply(ta.extract_adj_noun_pairs)
    ad_df["adj_noun_pairs"] = pairs_per_ad
    ad_df["num_adj_noun_pairs"] = pairs_per_ad.apply(len)

    adj_noun_columns = ['commercial_number', 'adj_noun_pairs', 'num_adj_noun_pairs']
    ad_df[adj_noun_columns].to_csv('csvs/adj_noun_pairs.csv', index=False)


### Feature: Semantic Similarity

In [21]:
# Attach the adjective-noun pair features from their checkpoint file.
adj_noun_pairs_df = pd.read_csv('csvs/adj_noun_pairs.csv')
# Drop copies of these columns that already exist in ad_df (fresh computation or
# re-run of this cell) so the merge does not create '_x' / '_y' duplicates.
ad_df = (
    ad_df
    .drop(columns=adj_noun_pairs_df.columns.difference(['commercial_number']), errors='ignore')
    .merge(adj_noun_pairs_df, on='commercial_number', how='left')
)

In [22]:

if not SEMANTIC_SIMILARITY_CHECKPOINT:
    import ast
    # Imported here so that it is available to both the industry and the brand
    # pass, whichever combination of flags is active.
    import numpy as np

    def _parse_keywords(raw):
        """Turn a stringified keyword list (e.g. "['a', 'b']") into a list of strings.

        Missing values (NaN / None) yield an empty list. Text that is not a
        valid Python list literal falls back to the simple split on ", " that
        this notebook used previously.
        """
        if not isinstance(raw, str):
            return []
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(keyword) for keyword in parsed]
        return [keyword for keyword in raw[1:-1].replace("'", "").split(", ") if keyword]

    def _add_keyword_similarity(frame, keywords_col, prefix, label):
        """Score each transcript against its keywords and store the top-3 summary.

        Writes ``<prefix>_keyword_similarity`` (mean of the three highest
        similarities, NaN when the row has no keywords) and
        ``<prefix>_top_keywords`` (those keywords, comma separated) into ``frame``.
        ``label`` is inserted into the log messages ('' or 'brand ').
        """
        similarity_col = f'{prefix}_keyword_similarity'
        top_keywords_col = f'{prefix}_top_keywords'
        frame[similarity_col] = np.nan
        frame[top_keywords_col] = ''

        for idx, row in frame.iterrows():
            transcript = row['transcript']
            keyword_similarities = {
                keyword: round(float(ta.get_semantic_similarity(transcript, keyword)), 3)
                for keyword in _parse_keywords(row[keywords_col])
            }

            sorted_keywords = sorted(keyword_similarities.items(), key=lambda item: item[1], reverse=True)
            top_3_keywords = sorted_keywords[:3]
            if top_3_keywords:
                top_3_average = round(float(np.mean([sim for _, sim in top_3_keywords])), 3)
            else:
                # Rows without keyword data (unmapped brand / category) get NaN.
                top_3_average = float('nan')

            logging.info(f"Top 3 {label}keywords for {row['commercial_number']}:")
            for keyword, similarity in top_3_keywords:
                logging.info(f"- {keyword}: {similarity}")
            logging.info(f"Top 3 average {label}similarity: {top_3_average}")

            frame.at[idx, similarity_col] = top_3_average
            frame.at[idx, top_keywords_col] = ', '.join([keyword for keyword, _ in top_3_keywords])

    columns = ['commercial_number']

    if INDUSTRY_SPECIFIC_AWARENESS:
        import nltk
        from nltk.tokenize import word_tokenize
        from nltk.corpus import stopwords
        from nltk.probability import FreqDist
        from collections import defaultdict
        # NOTE(review): downloads every NLTK resource, which is large. Kept as is
        # because it is not visible here which resources text_analysis needs.
        nltk.download('all')

        _add_keyword_similarity(ad_df, 'product_cat_keywords', 'product_cat', '')
        columns.extend(['product_cat_keyword_similarity', 'product_cat_top_keywords'])

    if BRAND_SPECIFIC_AWARENESS:
        _add_keyword_similarity(ad_df, 'product_brand_keywords', 'product_brand', 'brand ')
        columns.extend(['product_brand_keyword_similarity', 'product_brand_top_keywords'])

    ad_df[columns].to_csv('csvs/semantic_similarity.csv', index=False)


### Feature: Personal Pronouns

In [23]:
# Attach the semantic similarity features from their checkpoint file.
semantic_similarity_df = pd.read_csv('csvs/semantic_similarity.csv')
# Drop copies of these columns that already exist in ad_df (fresh computation or
# re-run of this cell) so the merge does not create '_x' / '_y' duplicates.
ad_df = (
    ad_df
    .drop(columns=semantic_similarity_df.columns.difference(['commercial_number']), errors='ignore')
    .merge(semantic_similarity_df, on='commercial_number', how='left')
)


In [24]:
if not PERSONAL_PRONOUNS_CHECKPOINT:
    # For every transcript, store the dominant personal pronoun together with
    # its count and percentage as returned by text_analysis.
    for idx, row in ad_df.iterrows():
        transcript = row['transcript']
        most_common_pronoun, most_common_pronoun_count, most_common_pronoun_pct = ta.get_dominant_pronoun_stats(transcript)
        ad_df.at[idx, 'personal_pronouns'] = most_common_pronoun
        ad_df.at[idx, 'num_personal_pronouns'] = most_common_pronoun_count
        ad_df.at[idx, 'personal_pronoun_pct'] = most_common_pronoun_pct

    # Write the checkpoint once after the loop instead of rewriting the whole
    # file for every single row.
    ad_df[['commercial_number', 'personal_pronouns', 'num_personal_pronouns', 'personal_pronoun_pct']].to_csv('csvs/personal_pronouns.csv', index=False)


In [25]:
# Attach the personal pronoun features from their checkpoint file.
personal_pronouns_df = pd.read_csv('csvs/personal_pronouns.csv')
# Drop copies of these columns that already exist in ad_df (fresh computation or
# re-run of this cell) so the merge does not create '_x' / '_y' duplicates.
ad_df = (
    ad_df
    .drop(columns=personal_pronouns_df.columns.difference(['commercial_number']), errors='ignore')
    .merge(personal_pronouns_df, on='commercial_number', how='left')
)

## Handling Missing Values

In [None]:

# Show every row that has at least one missing value.
# DataFrame.isnull() is an alias of DataFrame.isna(), so a single call is enough;
# calling both only rendered the same table twice.
display(ad_df[ad_df.isna().any(axis=1)])

# Data Transformation

In [27]:
# Log how many commercials fall into each BDM class.
for bdm_value in (1.0, 0.0):
    matching_rows = int((ad_df['BDM'] == bdm_value).sum())
    logging.info(f"Rows with BDM = {bdm_value}: {matching_rows}")

In [28]:
# Keep the commercial identifiers as a separate Series.
# NOTE(review): no later cell shown here reads this name - confirm it is still needed.
commercial_numbers = ad_df['commercial_number']


# CRISP-DM 4: Modeling

In [None]:
import models as m

# Build the model inputs from the feature table; the two flags are passed on to the models module.
# NOTE(review): which columns end up in `data` is decided inside models.prepare_model_data - not visible here.
data = m.prepare_model_data(ad_df, INDUSTRY_SPECIFIC_AWARENESS, BRAND_SPECIFIC_AWARENESS)
# The BDM label is the prediction target.
target = ad_df['BDM']
# Candidate models and their hyper-parameter search spaces, both supplied by the models module.
base_models = m.get_base_models()
param_distributions = m.get_param_distributions()
# Hyper-parameter tuning on (data, target); `tuned_models` is reused by the evaluation cell.
tuned_models = m.tune_models(data, target, base_models, param_distributions)

# Train the tuned models; `trained_models` is consumed by the evaluation cell.
trained_models = m.train_models(data, target, tuned_models, INDUSTRY_SPECIFIC_AWARENESS, BRAND_SPECIFIC_AWARENESS)


# CRISP-DM 5: Evaluation

In [None]:
# Evaluate the trained models and collect their per-commercial predictions.
results_df, predictions = m.evaluate_models(data, target, trained_models)

original_data = ad_df.copy()
display(original_data.head(10))
# Column-wise concat aligns on the index.
# NOTE(review): assumes `predictions` shares ad_df's index - confirm in models.evaluate_models.
original_data = pd.concat([original_data, predictions], axis=1)
m.display_model_results(data, target, trained_models, results_df)

# .copy() gives an independent frame, so adding 'majority_vote' below is not an
# assignment on a slice of original_data (which triggers SettingWithCopyWarning).
predicted_data = original_data[['commercial_number', 'BDM', 'Logistic Regression_result', 'Random Forest_result', 'Support Vector Machine_result']].copy()
# Row-wise mode over the three model outputs; column 0 holds the most frequent value.
predicted_data['majority_vote'] = predicted_data[['Logistic Regression_result', 'Random Forest_result', 'Support Vector Machine_result']].mode(axis=1)[0]
display(predicted_data.head(10))
m.analyze_decision_tree(data, target, tuned_models)