# Detecting BDM In Superbowl Commercials

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
load_dotenv()
%pip install -r requirements.txt

Collecting en_core_web_sm@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
BDM_excel = pd.read_excel(f'{os.getenv("BDM_EXCEL_FILE")}')
final_excel = pd.read_excel(f'{os.getenv("FINAL_EXCEL_FILE")}')

In [3]:
BDM_excel = pd.read_excel(f'{os.getenv("BDM_EXCEL_FILE")}')
final_excel = pd.read_excel(f'{os.getenv("FINAL_EXCEL_FILE")}')



In [4]:
final_excel = final_excel.merge(
    BDM_excel[['AdNumber', 'BDM']], 
    on='AdNumber', 
    how='left',
    suffixes=('_old', '')
).drop('BDM_old', axis=1, errors='ignore')


# print number of rows where BDM is NaN, 0 and 1
print(f"Number of rows where BDM is NaN: {final_excel[final_excel['BDM'].isna()].shape[0]}")
print(f"Number of rows where BDM is 0: {final_excel[final_excel['BDM'] == 0].shape[0]}")
print(f"Number of rows where BDM is 1: {final_excel[final_excel['BDM'] == 1].shape[0]}")


Number of rows where BDM is NaN: 22
Number of rows where BDM is 0: 411
Number of rows where BDM is 1: 139


In [5]:
ad_df = final_excel.groupby(['cont_primary_product_type', 'BRAND', 'AdNumber', "BDM"]).size().reset_index(name='count')
ad_df.rename(columns={'cont_primary_product_type': 'product_category', 'BRAND': 'brand', 'AdNumber': 'commercial_number'}, inplace=True)
ad_df.drop(columns=['count'], inplace=True)
ad_df.head(10)

Unnamed: 0,product_category,brand,commercial_number,BDM
0,1.0,AvocadosfromMexico,AD0357,0.0
1,1.0,AvocadosfromMexico,AD0414,1.0
2,1.0,AvocadosfromMexico,AD0474,0.0
3,1.0,AvocadosfromMexico,AD0525,1.0
4,1.0,AvocadosfromMexico,AD0584,1.0
5,1.0,AvocadosfromMexico,AD0635,1.0
6,1.0,AvocadosfromMexico,AD0745,0.0
7,1.0,BUBLY,AD0586,1.0
8,1.0,Bai,AD0475,0.0
9,1.0,Butterfinger,AD0420,1.0


### Brand Keywords

In [6]:
product_brand_df = pd.read_csv("product_brands.csv")

product_brand_df['brand'] = product_brand_df['brand'].str.replace(' ', '').str.lower()
ad_df['brand_clean'] = ad_df['brand'].str.replace(' ', '').str.lower()

# Merge the dataframes
ad_df = ad_df.merge(
    product_brand_df[['brand', 'product_brand_keywords']], 
    left_on='brand_clean',
    right_on='brand',
    how='left',
    suffixes=('', '_brand')
)

# Clean up columns
ad_df.drop(['brand_clean', 'brand_brand'], axis=1, inplace=True)


ad_df.head(10)

Unnamed: 0,product_category,brand,commercial_number,BDM,product_brand_keywords
0,1.0,AvocadosfromMexico,AD0357,0.0,"['always in season', 'taste', 'delicious', 'go..."
1,1.0,AvocadosfromMexico,AD0414,1.0,"['always in season', 'taste', 'delicious', 'go..."
2,1.0,AvocadosfromMexico,AD0474,0.0,"['always in season', 'taste', 'delicious', 'go..."
3,1.0,AvocadosfromMexico,AD0525,1.0,"['always in season', 'taste', 'delicious', 'go..."
4,1.0,AvocadosfromMexico,AD0584,1.0,"['always in season', 'taste', 'delicious', 'go..."
5,1.0,AvocadosfromMexico,AD0635,1.0,"['always in season', 'taste', 'delicious', 'go..."
6,1.0,AvocadosfromMexico,AD0745,0.0,"['always in season', 'taste', 'delicious', 'go..."
7,1.0,BUBLY,AD0586,1.0,
8,1.0,Bai,AD0475,0.0,
9,1.0,Butterfinger,AD0420,1.0,


### Product Category Keywords

In [7]:
product_brands_df = pd.read_csv("product_categories.csv")
product_brands_df.head(40)
product_brands_df = product_brands_df.drop('product_cat_id', axis=1)
ad_df = ad_df.drop('product_category', axis=1)
display(product_brands_df)
display(ad_df)

# Create a dictionary to map brands to their product categories and other attributes
brand_to_info = {}
for _, row in product_brands_df.iterrows():
    # Convert string representation of list to actual list
    brands = eval(row['product_cat_brands'])
    for brand in brands:
        # Remove spaces and convert to lowercase for more robust matching
        brand = brand.replace(' ', '').lower()
        # Store all columns for this brand
        brand_to_info[brand] = {col: row[col] for col in product_brands_df.columns}

# Function to find category info for a brand
def find_brand_info(brand):
    # Clean brand name for matching
    clean_brand = brand.replace(' ', '').lower()
    return brand_to_info.get(clean_brand)

# Add all product category columns to ad_df
for col in product_brands_df.columns:
    ad_df[col] = ad_df['brand'].apply(lambda x: find_brand_info(x)[col] if find_brand_info(x) else None)

# Print brands that couldn't be mapped
unmapped_brands = ad_df[ad_df['product_cat_name'].isna()]['brand'].unique()
if len(unmapped_brands) > 0:
    print("Brands without category mapping:")
    for brand in unmapped_brands:
        print(f"- {brand}")

# Print number of rows with missing category
print(f"Number of rows where product category is NaN: {ad_df[ad_df['product_cat_name'].isna()].shape[0]}")


print(f"Final number of rows with missing categories: {ad_df[ad_df['product_cat_name'].isna()].shape[0]}")

ad_df.head(10)

Unnamed: 0,product_cat_name,product_cat_keywords,product_cat_brands
0,Alcoholic beverages (Beer Hard Seltzer),"['smooth', 'rich', 'refreshing', 'aromatic', '...","['Anheuser Busch InBev', 'Becks Beer', 'Bud Li..."
1,Banking & Investments,"['secure', 'reliable', 'customized', 'personal...","['BankofAmerica', 'Coinbase', 'Cryptocom', 'Di..."
2,Car Accessories & Related Services,"['safe', 'protect', 'protected', 'protection',...","['Michelin', 'Wallbox', 'Weathertech']"
3,Car Manufacturer,"['Luxurious', 'Efficient', 'Powerful', 'Innova...","['Acura', 'AlfaRomeo', 'Audi', 'BMW', 'Buick',..."
4,Car Sales & Services Platform,"['easy', 'perfect', 'love', 'expert', 'really'...","['Carvana', 'Vroom', 'Carscom', 'Carmax']"
5,"Clothing, shoes and apparel","['favorite', 'designed', 'comfortable', 'uncom...","['CalvinKlein', 'Gildan', 'HANDM', 'Marmot', '..."
6,Consumer Electronics and Appliances,"['Smart', 'Display', 'Control', 'Touchscreen',...","['BlackBerry', 'Google', 'Intel', 'LGEEAudioVi..."
7,Cosmetics and personal care products,"['luxurious', 'nourishing', 'radiant', 'smooth...","['Fitbit', 'PlanetFitness']"
8,Diet and exercise products,"['healthy', 'energizing', 'effective', 'smart'...","['CalvinKlein', 'Gildan', 'Marmot', 'Skechers'..."
9,Food Delivery Services,"['easy', 'delivered', 'local', 'convenience', ...","['DoorDash', 'UberEats']"


Unnamed: 0,brand,commercial_number,BDM,product_brand_keywords
0,AvocadosfromMexico,AD0357,0.0,"['always in season', 'taste', 'delicious', 'go..."
1,AvocadosfromMexico,AD0414,1.0,"['always in season', 'taste', 'delicious', 'go..."
2,AvocadosfromMexico,AD0474,0.0,"['always in season', 'taste', 'delicious', 'go..."
3,AvocadosfromMexico,AD0525,1.0,"['always in season', 'taste', 'delicious', 'go..."
4,AvocadosfromMexico,AD0584,1.0,"['always in season', 'taste', 'delicious', 'go..."
...,...,...,...,...
545,Wixcom,AD0408,0.0,
546,Wixcom,AD0465,0.0,
547,Wixcom,AD0522,0.0,
548,Wixcom,AD0579,0.0,


Brands without category mapping:
- AvocadosfromMexico
- Beck's Beer
- Hyndai
- Volkswagen
- LGCEAudioVisual
- Always
- Axe
- Colgate
- DollarShaveClub
- DoveMenCare
- DrSquatch
- Gillette
- Huggies
- IrishSpring
- Itsa10Haircare
- Olay
- Schick
- Weightwatchers
- UNIVERSALPARKSANDRESORTS
- BassProShops
- THEWASHINGTONPOST
- Mercari
- CaesarsSportsbook
- Century21
- Fiverr
- Groupon
- Indeed
Number of rows where product category is NaN: 38
Final number of rows with missing categories: 38


Unnamed: 0,brand,commercial_number,BDM,product_brand_keywords,product_cat_name,product_cat_keywords,product_cat_brands
0,AvocadosfromMexico,AD0357,0.0,"['always in season', 'taste', 'delicious', 'go...",,,
1,AvocadosfromMexico,AD0414,1.0,"['always in season', 'taste', 'delicious', 'go...",,,
2,AvocadosfromMexico,AD0474,0.0,"['always in season', 'taste', 'delicious', 'go...",,,
3,AvocadosfromMexico,AD0525,1.0,"['always in season', 'taste', 'delicious', 'go...",,,
4,AvocadosfromMexico,AD0584,1.0,"['always in season', 'taste', 'delicious', 'go...",,,
5,AvocadosfromMexico,AD0635,1.0,"['always in season', 'taste', 'delicious', 'go...",,,
6,AvocadosfromMexico,AD0745,0.0,"['always in season', 'taste', 'delicious', 'go...",,,
7,BUBLY,AD0586,1.0,,Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'..."
8,Bai,AD0475,0.0,,Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'..."
9,Butterfinger,AD0420,1.0,,Snacks,"['artificial', 'zero', 'refreshing', 'organic'...","['Butterfinger', 'Cheerios', 'Cheetos', 'Choba..."


In [8]:
ad_df = ad_df.dropna()
# TODO: Implement proper handling of missing values
# TODO: Remove this line
ad_df = ad_df.head(5)
display(ad_df)

Unnamed: 0,brand,commercial_number,BDM,product_brand_keywords,product_cat_name,product_cat_keywords,product_cat_brands
59,MountainDew,AD0442,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'..."
60,MountainDew,AD0533,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'..."
61,MountainDew,AD0659,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'..."
62,MountainDew,AD0717,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'..."
87,Snickers,AD0382,1.0,"['hungry', 'satisfies', 'eat', 'hostile', 'cra...",Snacks,"['artificial', 'zero', 'refreshing', 'organic'...","['Butterfinger', 'Cheerios', 'Cheetos', 'Choba..."


## Retrieving Transcript

In [9]:
import glob
from pathlib import Path

# TODO: Implement this so that you can reuse it in the streamlit app

# Get all txt files recursively from ADS_DIR
ads_dir = Path(os.getenv("ADS_DIR"))
transcript_files = glob.glob(str(ads_dir / "**/*.txt"), recursive=True)
# print transcript_files
print(transcript_files)
# Create a dictionary mapping commercial numbers to file paths
transcript_map = {Path(f).stem: f for f in transcript_files}

# Update transcripts in dataframe
ad_df['transcript'] = ''
for idx, row in ad_df.iterrows():
    commercial_num = row['commercial_number']
    if commercial_num in transcript_map:
        try:
            with open(transcript_map[commercial_num], 'r', encoding='utf-8') as f:
                ad_df.at[idx, 'transcript'] = f.read().strip()
        except FileNotFoundError:
            ad_df.at[idx, 'transcript'] = None
    else:
        ad_df.at[idx, 'transcript'] = None

ad_df[ad_df['transcript'].notna()]
ad_df["audio_only_transcript"] = ad_df["transcript"]
ad_df.head(10)

['/Users/nilst/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2018/AD0536.txt', '/Users/nilst/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2018/AD0537.txt', '/Users/nilst/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2018/AD0535.txt', '/Users/nilst/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2018/AD0534.txt', '/Users/nilst/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2018/AD0524.txt', '/Users/nilst/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2018/AD0530.txt', '/Users/nilst/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2018/AD0531.txt', '/Users/nilst/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2018/AD0525.txt', '/Users/nilst/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2018/AD0527.txt', '/Users/nilst/Development/Commercial-Brand-Differentia

Unnamed: 0,brand,commercial_number,BDM,product_brand_keywords,product_cat_name,product_cat_keywords,product_cat_brands,transcript,audio_only_transcript
59,MountainDew,AD0442,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...","Man, I might just chill tonight. Puppy monkey,...","Man, I might just chill tonight. Puppy monkey,..."
60,MountainDew,AD0533,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",,
61,MountainDew,AD0659,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...","Come out, come out, wherever you are. I've got...","Come out, come out, wherever you are. I've got..."
62,MountainDew,AD0717,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",Be the first to count and tweet the exact numb...,Be the first to count and tweet the exact numb...
87,Snickers,AD0382,1.0,"['hungry', 'satisfies', 'eat', 'hostile', 'cra...",Snacks,"['artificial', 'zero', 'refreshing', 'organic'...","['Butterfinger', 'Cheerios', 'Cheetos', 'Choba...","Marsha, what happened? Peter hit me in the nos...","Marsha, what happened? Peter hit me in the nos..."


## Adding OCR Text

In [10]:

ocr_to_merge = pd.read_csv("./ocr_to_merge.csv")
ad_df = ad_df.merge(ocr_to_merge, left_on='commercial_number', right_on='ad', how='left')
ad_df.drop(columns=['ad', 'recognized_text'], inplace=True)
ad_df.rename(columns={'cleaned_text': 'ocr_text'}, inplace=True)

# merge ocr_text with transcript
# TODO: Rename transcript to transcript_plus_ocr
ad_df['transcript'] = ad_df['ocr_text'] + ' ' + ad_df['transcript']
ad_df.drop(columns=['ocr_text'], inplace=True)

ad_df.head()


Unnamed: 0,brand,commercial_number,BDM,product_brand_keywords,product_cat_name,product_cat_keywords,product_cat_brands,transcript,audio_only_transcript
0,MountainDew,AD0442,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",START THREE AWESOME THINGS COMBINED DEW Kick M...,"Man, I might just chill tonight. Puppy monkey,..."
1,MountainDew,AD0533,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",,
2,MountainDew,AD0659,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",AS GOOD AS THE ORIGINAL DEW DEW and the DEW Lo...,"Come out, come out, wherever you are. I've got..."
3,MountainDew,AD0717,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",SM TR SMACK JEW Mu BOTTLE Kit Ml DEW DEW the D...,Be the first to count and tweet the exact numb...
4,Snickers,AD0382,1.0,"['hungry', 'satisfies', 'eat', 'hostile', 'cra...",Snacks,"['artificial', 'zero', 'refreshing', 'organic'...","['Butterfinger', 'Cheerios', 'Cheetos', 'Choba...",Not You When hungry Snickers TM Mars Incorpora...,"Marsha, what happened? Peter hit me in the nos..."


In [11]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Determining Frequency of Superlatives and Comparative Adjectives

In [12]:
import spacy
import pandas as pd
from collections import Counter
import text_analysis as ta

# Create new columns to store the counts and percentages
ad_df['word_count'] = 0
ad_df['superlative_count'] = 0
ad_df['comparative_count'] = 0
ad_df['uniqueness_count'] = 0
ad_df['superlative_pct'] = 0.0
ad_df['comparative_pct'] = 0.0
ad_df['uniqueness_pct'] = 0.0
ad_df['total_bdm_terms_count'] = 0
ad_df['total_bdm_terms_pct'] = 0.0

# Process each row
for idx, row in ad_df.iterrows():
    word_count = len(ta.get_tokens(row['transcript']))
    ad_df.at[idx, 'word_count'] = word_count

    superlatives = ta.get_superlatives(row['transcript'])
    ad_df.at[idx, 'superlatives'] = ', '.join(superlatives) if superlatives else ''
    superlative_count = len(superlatives) if superlatives else 0
    ad_df.at[idx, 'superlative_count'] = superlative_count

    comparatives = ta.get_comparatives(row['transcript'])
    ad_df.at[idx, 'comparatives'] = ', '.join(comparatives) if comparatives else ''
    comparative_count = len(comparatives) if comparatives else 0
    ad_df.at[idx, 'comparative_count'] = comparative_count
    
    unique_words = ta.get_unique_words(row['transcript'])
    ad_df.at[idx, 'unique_words'] = ', '.join(unique_words) if unique_words else ''
    uniqueness_count = len(unique_words) if unique_words else 0
    ad_df.at[idx, 'uniqueness_count'] = uniqueness_count

    if word_count > 0:
        ad_df.at[idx, 'superlative_pct'] = superlative_count / word_count * 100
        ad_df.at[idx, 'comparative_pct'] = comparative_count / word_count * 100
        ad_df.at[idx, 'uniqueness_pct'] = uniqueness_count / word_count * 100
        
        total_bdm_terms = superlative_count + comparative_count + uniqueness_count
        ad_df.at[idx, 'total_bdm_terms_count'] = total_bdm_terms
        ad_df.at[idx, 'total_bdm_terms_pct'] = total_bdm_terms / word_count * 100

# Sort the DataFrame
ad_df = ad_df.sort_values(
    by=['superlative_count', 'comparative_count', 'superlative_pct', 'comparative_pct', 'uniqueness_pct'],
    ascending=[False, False, False, False, False]
)

# Display top 10 results
ad_df.head(10)

  from .autonotebook import tqdm as notebook_tqdm
python(74154) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Unnamed: 0,brand,commercial_number,BDM,product_brand_keywords,product_cat_name,product_cat_keywords,product_cat_brands,transcript,audio_only_transcript,word_count,...,comparative_count,uniqueness_count,superlative_pct,comparative_pct,uniqueness_pct,total_bdm_terms_count,total_bdm_terms_pct,superlatives,comparatives,unique_words
2,MountainDew,AD0659,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",AS GOOD AS THE ORIGINAL DEW DEW and the DEW Lo...,"Come out, come out, wherever you are. I've got...",67,...,0,2,0.0,0.0,2.985075,2,2.985075,,,"ORIGINAL, original"
3,MountainDew,AD0717,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",SM TR SMACK JEW Mu BOTTLE Kit Ml DEW DEW the D...,Be the first to count and tweet the exact numb...,95,...,0,1,0.0,0.0,1.052632,1,1.052632,,,first
0,MountainDew,AD0442,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",START THREE AWESOME THINGS COMBINED DEW Kick M...,"Man, I might just chill tonight. Puppy monkey,...",81,...,0,0,0.0,0.0,0.0,0,0.0,,,
1,MountainDew,AD0533,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",,,1,...,0,0,0.0,0.0,0.0,0,0.0,,,
4,Snickers,AD0382,1.0,"['hungry', 'satisfies', 'eat', 'hostile', 'cra...",Snacks,"['artificial', 'zero', 'refreshing', 'organic'...","['Butterfinger', 'Cheerios', 'Cheetos', 'Choba...",Not You When hungry Snickers TM Mars Incorpora...,"Marsha, what happened? Peter hit me in the nos...",115,...,0,0,0.0,0.0,0.0,0,0.0,,,


In [13]:
# remove superlatives, comparatives and unique_words from ad_df
# TODO: Comment back in
# ad_df.drop(columns=['superlatives', 'comparatives', 'unique_words', 'bdm_words'], inplace=True)
ad_df.drop(columns=['comparative_pct', 'superlative_pct', 'uniqueness_pct'], inplace=True)
ad_df.head(10)


Unnamed: 0,brand,commercial_number,BDM,product_brand_keywords,product_cat_name,product_cat_keywords,product_cat_brands,transcript,audio_only_transcript,word_count,superlative_count,comparative_count,uniqueness_count,total_bdm_terms_count,total_bdm_terms_pct,superlatives,comparatives,unique_words
2,MountainDew,AD0659,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",AS GOOD AS THE ORIGINAL DEW DEW and the DEW Lo...,"Come out, come out, wherever you are. I've got...",67,0,0,2,2,2.985075,,,"ORIGINAL, original"
3,MountainDew,AD0717,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",SM TR SMACK JEW Mu BOTTLE Kit Ml DEW DEW the D...,Be the first to count and tweet the exact numb...,95,0,0,1,1,1.052632,,,first
0,MountainDew,AD0442,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",START THREE AWESOME THINGS COMBINED DEW Kick M...,"Man, I might just chill tonight. Puppy monkey,...",81,0,0,0,0,0.0,,,
1,MountainDew,AD0533,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",,,1,0,0,0,0,0.0,,,
4,Snickers,AD0382,1.0,"['hungry', 'satisfies', 'eat', 'hostile', 'cra...",Snacks,"['artificial', 'zero', 'refreshing', 'organic'...","['Butterfinger', 'Cheerios', 'Cheetos', 'Choba...",Not You When hungry Snickers TM Mars Incorpora...,"Marsha, what happened? Peter hit me in the nos...",115,0,0,0,0,0.0,,,


## Nomen + Adjektive

In [14]:
# Apply the function to each transcript individually using pandas apply
ad_df["adj_noun_pairs"] = ad_df["transcript"].apply(ta.extract_adj_noun_pairs)

# Add a column for the number of adjective-noun pairs
ad_df["num_adj_noun_pairs"] = ad_df["adj_noun_pairs"].apply(len)

# Display the first 10 rows of the DataFrame
display(ad_df.head(10))

Unnamed: 0,brand,commercial_number,BDM,product_brand_keywords,product_cat_name,product_cat_keywords,product_cat_brands,transcript,audio_only_transcript,word_count,superlative_count,comparative_count,uniqueness_count,total_bdm_terms_count,total_bdm_terms_pct,superlatives,comparatives,unique_words,adj_noun_pairs,num_adj_noun_pairs
2,MountainDew,AD0659,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",AS GOOD AS THE ORIGINAL DEW DEW and the DEW Lo...,"Come out, come out, wherever you are. I've got...",67,0,0,2,2,2.985075,,,"ORIGINAL, original",[refreshing taste],1
3,MountainDew,AD0717,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",SM TR SMACK JEW Mu BOTTLE Kit Ml DEW DEW the D...,Be the first to count and tweet the exact numb...,95,0,0,1,1,1.052632,,,first,"[complete Rules, Potential winner, exact number]",3
0,MountainDew,AD0442,1.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",START THREE AWESOME THINGS COMBINED DEW Kick M...,"Man, I might just chill tonight. Puppy monkey,...",81,0,0,0,0,0.0,,,,[],0
1,MountainDew,AD0533,0.0,"['juice', 'caffeine', 'zero', 'sugar', 'refres...",Softdrinks,"['refreshing', 'organic', 'fizzy', 'sparkling'...","['Bai', 'BUBLY', 'CocaCola', 'DietCoke', 'MiO'...",,,1,0,0,0,0,0.0,,,,[],0
4,Snickers,AD0382,1.0,"['hungry', 'satisfies', 'eat', 'hostile', 'cra...",Snacks,"['artificial', 'zero', 'refreshing', 'organic'...","['Butterfinger', 'Cheerios', 'Cheetos', 'Choba...",Not You When hungry Snickers TM Mars Incorpora...,"Marsha, what happened? Peter hit me in the nos...",115,0,0,0,0,0.0,,,,[],0


In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import defaultdict

nltk.download('all')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/nilst/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/nilst/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/nilst/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/nilst/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/nilst/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_dat

True

In [16]:
# Process each transcript
import numpy as np
for idx, row in ad_df.iterrows():
    transcript = row['transcript']
    product_cat_keyword_similarities = {}
    for keyword in row['product_cat_keywords'][1:-1].replace("'", "").split(", "):
        similarity = round(float(ta.get_semantic_similarity(transcript, keyword)), 3)
        product_cat_keyword_similarities[keyword] = similarity
    
    sorted_keywords = sorted(product_cat_keyword_similarities.items(), key=lambda x: x[1], reverse=True)
    top_3_keywords = sorted_keywords[:3]
    top_3_average = round(float(np.mean([sim for _, sim in top_3_keywords])), 3)
    
    print(f"\nTranscript {row['commercial_number']}:")
    print(f"Top 3 keywords:")
    for keyword, similarity in top_3_keywords:
        print(f"- {keyword}: {similarity}")
    print(f"Top 3 average similarity: {top_3_average}")
    
    ad_df.at[idx, 'product_cat_keyword_similarity'] = top_3_average
    ad_df.at[idx, 'product_cat_top_keywords'] = ', '.join([keyword for keyword, _ in top_3_keywords])

for idx, row in ad_df.iterrows():
    transcript = row['transcript']
    product_brand_keyword_similarities = {}
    
    for keyword in row['product_brand_keywords'][1:-1].replace("'", "").split(", "):
        similarity = round(float(ta.get_semantic_similarity(transcript, keyword)), 3)
        product_brand_keyword_similarities[keyword] = similarity
    
    sorted_keywords = sorted(product_brand_keyword_similarities.items(), key=lambda x: x[1], reverse=True)
    top_3_keywords = sorted_keywords[:3]
    top_3_average = round(float(np.mean([sim for _, sim in top_3_keywords])), 3)
    
    print(f"\nTranscript {row['commercial_number']}:")
    print(f"Top 3 brand keywords:")
    for keyword, similarity in top_3_keywords:
        print(f"- {keyword}: {similarity}")
    print(f"Top 3 average brand similarity: {top_3_average}")
    
    ad_df.at[idx, 'product_brand_keyword_similarity'] = top_3_average
    ad_df.at[idx, 'product_brand_top_keywords'] = ', '.join([keyword for keyword, _ in top_3_keywords])

python(74635) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.



Transcript AD0659:
Top 3 keywords:
- sugar: 0.34
- flavor: 0.305
- fresh: 0.274
Top 3 average similarity: 0.306


: 

: 

In [None]:


ad_df[ad_df.isnull().any(axis=1)].head()
ad_df[ad_df.isna().any(axis=1)].head()

ad_df = ad_df[ad_df['transcript'] != '']
ad_df = ad_df[ad_df['transcript'] != '']

# print all from ad_df with empty values
print(ad_df[ad_df.isnull().any(axis=1)])
print(ad_df[ad_df.isna().any(axis=1)])

In [None]:
ad_df.head(20)

In [None]:

# show all rows ehere bdm is 1.0 and where industry is product_cat_id 4

nice_df = ad_df[ad_df['BDM'] == 1.0]


# Create a function to find shared words
def get_shared_words(row):
    # Convert keywords string to list if it's a string
    keywords = eval(row['product_cat_keywords']) if isinstance(row['product_cat_keywords'], str) else row['product_cat_keywords']
    
    # Convert all keywords to lowercase for better matching
    keywords = [word.lower() for word in keywords]
    
    # Split transcript into words and convert to lowercase
    transcript_words = set(word.lower() for word in str(row['transcript']).split())
    
    # Find intersection between keywords and transcript words
    shared = [word for word in keywords if word in transcript_words]
    
    return shared

# Add new column for shared words
ad_df['shared_keywords'] = ad_df.apply(get_shared_words, axis=1)

# Add column for count of shared words
ad_df['shared_keywords_count'] = ad_df['shared_keywords'].str.len()

# Update the display code
nice_df = ad_df[ad_df['BDM'] == 1.0]
display(nice_df)

In [None]:
# Calculate the minimum number of samples in each group
# 
min_samples = min(len(ad_df[ad_df['BDM'] == 1]), len(ad_df[ad_df['BDM'] == 0]))
ad_df.to_csv('ad_df.csv', index=False)
# Perform undersampling
ad_df_balanced = pd.concat([
    ad_df[ad_df['BDM'] == 1].sample(n=min_samples, random_state=42),
    ad_df[ad_df['BDM'] == 0].sample(n=min_samples, random_state=42)
]).reset_index(drop=True)

# Print the results
print(f"Total rows: {len(ad_df_balanced)}")
print(f"Rows with BDM = 1.0: {len(ad_df_balanced[ad_df_balanced['BDM'] == 1.0])}")
print(f"Rows with BDM = 0.0: {len(ad_df_balanced[ad_df_balanced['BDM'] == 0.0])}")



commercial_numbers = ad_df_balanced['commercial_number']

# TODO: remove the following line after testing
ad_df_balanced = ad_df

ad_df_balanced.head(20)

## Ansatz 1 (Machine learning)

In [None]:
import models as m
data, target = m.prepare_model_data(ad_df_balanced)

base_models = m.get_base_models()
param_distributions = m.get_param_distributions()
tuned_models = m.tune_models(data, target, base_models, param_distributions)

trained_models = m.train_models(data, target, tuned_models)

# Evaluate the trained models
results_df, predictions = m.evaluate_models(data, target, trained_models)

original_data = ad_df_balanced.copy()
original_data = pd.concat([original_data, commercial_numbers], axis=1)

original_data = pd.concat([original_data, predictions], axis=1)

# After your existing model training code
m.display_model_results(data, target, trained_models, results_df)


predicted_data = original_data

# only include the top 3 models prediction results
predicted_data = predicted_data[['commercial_number', 'BDM', 'Logistic Regression_result', 'Random Forest_result', 'Support Vector Machine_result']]
# write the majority result of the colums logistic regression, random forest and support vector machine to a new column majority vote
predicted_data['majority_vote'] = predicted_data[['Logistic Regression_result', 'Random Forest_result', 'Support Vector Machine_result']].mode(axis=1)[0]
display(predicted_data.head(10))


m.analyze_decision_tree(data, target, tuned_models)


## Ansatz 2 - RNN + LSTM

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub

## Ansatz 2 - RNN + LSTM

In [38]:
df = pd.read_csv('df.csv')

In [None]:
train, val, test = np.split(df.sample(frac=1), [int(0.8*len(df)), int(0.9*len(df))])

In [40]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels = df.pop('label')
    
    # Clean the descriptions - replace NaN with empty string and ensure all items are strings
    descriptions = df["description"].fillna("").astype(str).tolist()
    
    # Convert to tensors
    ds = tf.data.Dataset.from_tensor_slices((descriptions, labels))
    
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [41]:
train_data = df_to_dataset(train)
valid_data = df_to_dataset(val)
test_data = df_to_dataset(test)

# Embedding + Model

In [42]:
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(embedding, dtype=tf.string, trainable=True)

In [None]:
hub_layer(list(train_data)[0][0])

In [44]:
import tf_keras

model = tf_keras.Sequential([
    hub_layer,
    tf_keras.layers.Dense(16, activation='relu'),
    tf_keras.layers.Dropout(0.4),
    tf_keras.layers.Dense(16, activation='relu'),
    tf_keras.layers.Dropout(0.4),
    tf_keras.layers.Dense(1, activation='sigmoid')
])

In [45]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [None]:
model.evaluate(train_data)

In [None]:
model.evaluate(valid_data)

In [48]:
# history = model.fit(train_data, epochs=3, validation_data=valid_data)

In [49]:
# 


In [None]:
model.evaluate(test_data)

In [51]:
from sklearn.model_selection import KFold

def cross_validate_confusion_matrix(df, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_true = []
    all_pred = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
        # Split the data
        train = df.iloc[train_idx]
        val = df.iloc[val_idx]
        
        # Create datasets
        train_data = df_to_dataset(train)
        val_data = df_to_dataset(val)
        
        # Train model
        model = tf_keras.Sequential([
            hub_layer,
            tf_keras.layers.Dense(16, activation='relu'),
            tf_keras.layers.Dropout(0.4),
            tf_keras.layers.Dense(16, activation='relu'),
            tf_keras.layers.Dropout(0.4),
            tf_keras.layers.Dense(1, activation='sigmoid')
        ])
        
        model.compile(optimizer='adam',
                     loss=tf.keras.losses.BinaryCrossentropy(),
                     metrics=['accuracy'])
        
        model.fit(train_data, epochs=3, verbose=0)
        
        # Get predictions for this fold's validation set
        val_predictions = model.predict(val_data, verbose=0)
        val_predictions = (val_predictions > 0.5).astype(int)
        
        # Store true labels and predictions
        val_labels = np.concatenate([y for x, y in val_data], axis=0)
        all_true.extend(val_labels)
        all_pred.extend(val_predictions)
    
    return np.array(all_true), np.array(all_pred)

In [52]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(true_labels, predictions, title="Model Confusion Matrix"):
    # Create confusion matrix
    cm = confusion_matrix(true_labels, predictions)

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

    # Calculate and print metrics
    tn, fp, fn, tp = cm.ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"\nModel Metrics:")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")

In [None]:
# Get cross-validation results
true_labels, predictions = cross_validate_confusion_matrix(df)

# Plot confusion matrix for all data
plot_confusion_matrix(true_labels, predictions, "Cross-Validated Model Confusion Matrix")

# LSTM

In [54]:
def cross_validate_lstm_confusion_matrix(df, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_true = []
    all_pred = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
        # Split the data
        train = df.iloc[train_idx]
        val = df.iloc[val_idx]
        
        # Create datasets
        train_data = df_to_dataset(train)
        val_data = df_to_dataset(val)
        
        # Create and compile model
        model = tf.keras.Sequential([
            encoder,
            tf.keras.layers.Embedding(
                input_dim=len(encoder.get_vocabulary()),
                output_dim=32,
                mask_zero=True
            ),
            tf.keras.layers.LSTM(32, use_cudnn=False),  # Disable cuDNN
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dropout(0.4),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                     loss=tf.keras.losses.BinaryCrossentropy(),
                     metrics=['accuracy'])
        
        # Train model
        model.fit(train_data, epochs=5, verbose=0)
        
        # Get predictions for this fold's validation set
        val_predictions = model.predict(val_data, verbose=0)
        val_predictions = (val_predictions > 0.5).astype(int)
        
        # Store true labels and predictions
        val_labels = np.concatenate([y for x, y in val_data], axis=0)
        all_true.extend(val_labels)
        all_pred.extend(val_predictions)
    
    return np.array(all_true), np.array(all_pred)

In [55]:
encoder = tf.keras.layers.TextVectorization(max_tokens=600)
encoder.adapt(train_data.map(lambda text, label: text))

In [None]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

In [57]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=32,
        mask_zero=True
    ),
    tf.keras.layers.LSTM(32, use_cudnn=False),  # Disable cuDNN
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [58]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [None]:
model.evaluate(train_data)
model.evaluate(valid_data)

In [None]:
history = model.fit(train_data, epochs=5, validation_data=valid_data)

In [None]:
model.evaluate(test_data)

In [None]:
# Get cross-validation results for LSTM
true_labels_lstm, predictions_lstm = cross_validate_lstm_confusion_matrix(df)

# Plot confusion matrix for all data
plot_confusion_matrix(true_labels_lstm, predictions_lstm, "Cross-Validated LSTM Model Confusion Matrix")

<div class="alert alert-block alert-info">
<b>Fazit</b>


Warum performen die ML Modelle besser als die Neural Networks

1. Datensatzgröße:
- 250 Werbespots sind ein sehr kleiner Datensatz für neuronale Netze, die typischerweise Tausende oder Zehntausende von Beispielen benötigen, um effektiv zu lernen


- Feature-Extraktionsansatz bietet eine explizite Struktur, die bei der begrenzten Menge schonmal eine Grundlage schaffen, die die NNs nicht haben

2. Featurequalität vs. Rohtext:
- Unsere handgefertigten Features erfassen domänenspezifisches Wissen darüber, was eine BDM effektiv macht

- Neuronale Netze müssen diese Muster von Grund auf nur aus Rohtext lernen, was mit begrenzten Daten viel schwieriger ist

</div>