# Detecting BDM In Superbowl Commercials

## Assigning Brands To Productcategories  

In [89]:
import pandas as pd
import os
from dotenv import load_dotenv
load_dotenv()

True

In [90]:

BDM_excel= pd.read_excel(f"{os.getenv("BDM_EXCEL_FILE")}")
BDM_excel.head(30)
final_excel = pd.read_excel(f"{os.getenv("FINAL_EXCEL_FILE")}")



In [91]:
final_excel = final_excel.merge(
    BDM_excel[['AdNumber', 'BDM']], 
    on='AdNumber', 
    how='left',
    suffixes=('_old', '')
).drop('BDM_old', axis=1, errors='ignore')

In [92]:
ad_df = final_excel.groupby(['cont_primary_product_type', 'BRAND', 'AdNumber', "BDM"]).size().reset_index(name='count')
ad_df.rename(columns={'cont_primary_product_type': 'product_category', 'BRAND': 'brand', 'AdNumber': 'commercial_number'}, inplace=True)
ad_df.drop(columns=['count'], inplace=True)
ad_df.head(30)

Unnamed: 0,product_category,brand,commercial_number,BDM
0,1.0,AvocadosfromMexico,AD0357,0.0
1,1.0,AvocadosfromMexico,AD0414,1.0
2,1.0,AvocadosfromMexico,AD0474,0.0
3,1.0,AvocadosfromMexico,AD0525,1.0
4,1.0,AvocadosfromMexico,AD0584,1.0
5,1.0,AvocadosfromMexico,AD0635,1.0
6,1.0,AvocadosfromMexico,AD0745,0.0
7,1.0,BUBLY,AD0586,1.0
8,1.0,Bai,AD0475,0.0
9,1.0,Butterfinger,AD0420,1.0


## Retrieving Transcript

In [93]:
import glob
from pathlib import Path

# Get all txt files recursively from ADS_DIR
ads_dir = Path(os.getenv("ADS_DIR"))
transcript_files = glob.glob(str(ads_dir / "**/*.txt"), recursive=True)
# print transcript_files
print(transcript_files)
# Create a dictionary mapping commercial numbers to file paths
transcript_map = {Path(f).stem: f for f in transcript_files}

# Update transcripts in dataframe
ad_df['transcript'] = ''
for idx, row in ad_df.iterrows():
    commercial_num = row['commercial_number']
    if commercial_num in transcript_map:
        try:
            with open(transcript_map[commercial_num], 'r', encoding='utf-8') as f:
                ad_df.at[idx, 'transcript'] = f.read().strip()
        except FileNotFoundError:
            ad_df.at[idx, 'transcript'] = None
    else:
        ad_df.at[idx, 'transcript'] = None

ad_df[ad_df['transcript'].notna()]
ad_df.head(30)

['/home/arkastor/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2015/AD0380.txt', '/home/arkastor/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2015/AD0408.txt', '/home/arkastor/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2015/AD0399.txt', '/home/arkastor/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2015/AD0359.txt', '/home/arkastor/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2015/AD0383.txt', '/home/arkastor/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2015/AD0393.txt', '/home/arkastor/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2015/AD0370.txt', '/home/arkastor/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2015/AD0377.txt', '/home/arkastor/Development/Commercial-Brand-Differentiating-Message-Analysis/ADs/ADs_IG_2015/AD0404.txt', '/home/arkastor/Development/Commerci

Unnamed: 0,product_category,brand,commercial_number,BDM,transcript
0,1.0,AvocadosfromMexico,AD0357,0.0,"With the next pick in the first draft ever, Au..."
1,1.0,AvocadosfromMexico,AD0414,1.0,Over here we have the cube of Rubik. This simp...
2,1.0,AvocadosfromMexico,AD0474,0.0,"Guys, how can we be a secret society if we can..."
3,1.0,AvocadosfromMexico,AD0525,1.0,"Well, we did it. We sealed the bad out and kep..."
4,1.0,AvocadosfromMexico,AD0584,1.0,"Pageantry, poise, welcome to the dog show wher..."
5,1.0,AvocadosfromMexico,AD0635,1.0,Welcome to the Avocados from Mexico shopping n...
6,1.0,AvocadosfromMexico,AD0745,0.0,This is officially the worst tailgate I've eve...
7,1.0,BUBLY,AD0586,1.0,"Ooh, Blackberry Bublé, my favorite. You mean B..."
8,1.0,Bai,AD0475,0.0,I don't want to make it tough. I just want to ...
9,1.0,Butterfinger,AD0420,1.0,"A bull rider is bold. Now, a bull rider on a p..."


## Extract most frequent Keywords for each Brand from trasncript

In [94]:
brand_df = ad_df.groupby('brand').agg({
    'transcript': lambda x: '\n\n'.join(str(t) for t in x if pd.notna(t)),
    'commercial_number': lambda x: list(x),  # collect all ad numbers
}).reset_index()

# Add number of ads column
brand_df['number_of_ads'] = brand_df['commercial_number'].str.len()

# Reorder columns and sort by number_of_ads
brand_df = brand_df[['brand', 'number_of_ads', 'commercial_number', 'transcript']].sort_values(
    by='number_of_ads', 
    ascending=False
)
brand_df.head(30)

Unnamed: 0,brand,number_of_ads,commercial_number,transcript
213,TMobile,22,"[AD0347, AD0348, AD0349, AD0399, AD0400, AD045...",\n\nTim Tebow here. Everyone thinks I want a c...
33,BudLight,15,"[AD0306, AD0307, AD0308, AD0359, AD0416, AD047...","Outro Music\n\nJoe, good evening. Hello, Lily...."
35,Budweiser,14,"[AD0257, AD0258, AD0260, AD0309, AD0310, AD036...",We summon the finest of this nation to help us...
218,Toyota,14,"[AD0298, AD0350, AD0401, AD0402, AD0460, AD056...",Hey there Henderson family I'm your rav4 genie...
155,NFL,13,"[AD0285, AD0286, AD0339, AD0390, AD0443, AD050...","What a year. Lock RG3 and Russell Wilson. Now,..."
71,Doritos,13,"[AD0267, AD0268, AD0319, AD0320, AD0367, AD036...","Daddy, can you paint Pinterest with me? Sweeth..."
147,MichelobULTRA,12,"[AD0439, AD0498, AD0546, AD0547, AD0604, AD060...",Ah\n\nHere we go! Get set! Go! Sometimes you w...
216,Tide,12,"[AD0297, AD0509, AD0510, AD0562, AD0563, AD056...","Yes! Oh! Dude, you got Montana on your jersey...."
231,Weathertech,11,"[AD0353, AD0406, AD0464, AD0520, AD0577, AD063...",You want a loan to build a factory in America?...
120,Kia,10,"[AD0278, AD0279, AD0334, AD0377, AD0436, AD049...","\n\nDad, where do babies come from? Oh, well, ..."


In [95]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import defaultdict

nltk.download('all')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/arkastor/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/arkastor/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/arkastor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/arkastor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/arkastor/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]  

True

In [96]:
def extract_top_keywords(transcript, num_keywords=10):
    if pd.isna(transcript):
        return ""
    
    # Tokenize and convert to lowercase
    tokens = word_tokenize(transcript.lower())
    
    # Enhanced stopwords - remove common commercial words that aren't BDM-related
    stop_words = set(stopwords.words('english'))
    commercial_stopwords = {'like', 'get', 'one', 'now', 'see', 'look', 'come', 'go'}
    stop_words.update(commercial_stopwords)
    
    # Keep only meaningful words and potential phrases
    tokens = [word for word in tokens if (
        word.isalnum() and 
        word not in stop_words and
        len(word) > 2  # Remove very short words
    )]
    
    # Get frequency distribution
    fdist = FreqDist(tokens)
    
    # Include frequency to help identify emphasis
    top_keywords = [f"{word} ({freq})" for word, freq in fdist.most_common(num_keywords)]
    return ', '.join(top_keywords)
# TODO: Replace with actual manually selected keywords
brand_df['transcript_keywords'] = brand_df['transcript'].apply(extract_top_keywords)
# add a column for manually selected keywords, make it empty for now
brand_df['manually_selected_keywords'] = ''
brand_df.loc[brand_df['brand'] == 'AvocadosfromMexico', 'manually_selected_keywords'] = 'fresh avocados, authentic Mexican flavor, healthy snack, versatile, green gold, Mexican avocados, avocado recipes, nutrient-rich, healthy fats, premium quality, farm to table, rich in fiber, natural, creamy texture, avocado lovers, guacamole, farm fresh, sustainable farming, avocado health benefits, heart-healthy, clean eating, avocado toast, fresh ingredients, Mexican agriculture, rich taste, sustainable, healthy lifestyle, Mexican heritage, protein-rich, gluten-free, non-GMO, best avocados'

brand_df.loc[brand_df['brand'] == 'CocaCola', 'manually_selected_keywords'] = 'classic cola, refreshing, Coca-Cola taste, soda, iconic drink, carbonated beverage, sweet refreshment, Coca-Cola experience, soda culture, family-friendly, original formula, Coca-Cola Zero, taste of happiness, joyful moments, Coca-Cola taste test, global brand, quench thirst, cola drink, refreshing soda, caffeine boost, nostalgia, tradition, great taste, pop culture, Coca-Cola family, carbonated refreshment, classic flavor, cool drink, all-time favorite, global reach, social moments, refreshment moments'

brand_df.loc[brand_df['brand'] == 'Doritos', 'manually_selected_keywords'] = 'bold flavor, crunchy snack, snack time, Doritos crunch, nacho cheese, tortilla chips, bold taste, snack attack, cheesy, bold chips, extreme flavor, Doritos flavor, signature snack, bold chips, spicy chips, Doritos dip, snack culture, Fiesta, snack lovers, party snack, tortilla chip, cheese lovers, snackable, bagged chips, bold snack, epic flavor, nacho lovers, bold spices, cheesy snack, intense flavor, snack obsession, snack innovation, Doritos taco'

brand_df.loc[brand_df['brand'] == 'Pepsi', 'manually_selected_keywords'] = 'refreshing, Pepsi generation, bold flavors, cola, better taste, Pepsi challenge, carbonated beverage, Pepsi vs Coke, great taste, drink Pepsi, PepsiCo, unique formula, original soda, sugary drink, classic soda, sweet taste, flavor boost, thirst-quenching, family-friendly, fun beverage, Pepsi moments, summer drinks, carbonated refreshment, drink refreshment, youth culture, new flavors, taste test, Pepsi flavors, cool drinks, sports sponsorship, pop culture, refreshing drink, Pepsi Zero'

brand_df.loc[brand_df['brand'] == 'Pringles', 'manually_selected_keywords'] = 'stackable chips, crisp texture, Pringles crunch, fun snack, potato crisps, unique packaging, endless flavor combinations, chip innovation, Pringles variety, perfect crunch, Pringles flavors, satisfying snack, snackable, Pringles moments, can-shaped packaging, thin crispy chips, chips lovers, snack cravings, flavor-packed, family snack, Pringles party size, potato chips, snack time, variety packs, Pringles classic, flavor-packed crisps, irresistibly crunchy, Pringles chips, crispy texture, unique snack, on-the-go snack'

brand_df.loc[brand_df['brand'] == 'Snickers', 'manually_selected_keywords'] = 'hungry, satisfy hunger, chocolate bar, peanuts, caramel, Snickers satisfaction, chocolate lovers, energy boost, satisfying snack, indulgent treat, hunger cure, sweet snack, Snickers candy, full satisfaction, sweet chocolate, peanut-filled, caramel center, peanut snack, Snickers moments, hunger solution, snack break, chocolate cravings, Snickers bites, energizing chocolate, hunger pangs, fun size, treat yourself, snackable chocolate, premium chocolate, candy bar, indulgence, ultimate chocolate'

brand_df.loc[brand_df['brand'] == 'WonderfulPistachios', 'manually_selected_keywords'] = 'healthy snack, pistachios, nut lovers, protein-rich, snackable, roasted pistachios, heart-healthy, natural snack, high in fiber, superfood snack, Wonderful nuts, California pistachios, healthy fats, clean snack, green nuts, sustainable farming, premium pistachios, plant-based protein, guilt-free snack, roasted nuts, fiber-rich, energy boost, snack with benefits, healthy lifestyle, on-the-go snack, crunchy pistachios, Wonderful brand, nut benefits, weight management, heart health, antioxidant-rich, nutty goodness'

brand_df.loc[brand_df['brand'] == 'BudLight', 'manually_selected_keywords'] = 'refreshing beer, light beer, crisp taste, easy-drinking, Bud Light taste, casual beer, beer lovers, refreshing lager, smooth beer, Bud Light experience, perfect for parties, beer with friends, light refreshment, crisp lager, Bud Light flavor, low-calorie beer, beer culture, popular beer, go-to beer, drink responsibly, chill moments, beer variety, low-carb beer, brewmaster, light lager, beer enjoyment, Bud Light moments, thirst-quenching beer, party beer, American lager, easy-going beer'

brand_df.loc[brand_df['brand'] == 'Budweiser', 'manually_selected_keywords'] = 'king of beers, American beer, full-flavored lager, classic beer, Budweiser taste, beer heritage, iconic beer, beer lovers, best lager, bold beer, smooth finish, premium beer, refreshing lager, American brewing, original Budweiser, thirst-quenching, rich taste, brewmasters, full-bodied beer, crisp refreshment, golden beer, beer culture, American-made, beer brand, top beer, Budweiser experience, great beer, beer quality, rich flavors, beer moments, party beer, legendary beer'

brand_df.loc[brand_df['brand'] == 'MichelobULTRA', 'manually_selected_keywords'] = 'low-calorie beer, fitness beer, light lager, refreshing, Michelob ULTRA taste, active lifestyle, ultra-refreshing, premium light beer, healthy beer choice, crisp lager, clean beer, Michelob experience, Michelob ULTRA flavor, superior taste, low-carb beer, alcohol-free, beer for athletes, light refreshment, golden beer, refreshing lager, fit beer, best light beer, healthy drinking, active refreshment, calorie-conscious, fitness-friendly beer, Michelob brand, balanced beer, smooth beer, refreshing beer, sports beer, ultra-refreshing'

brand_df.loc[brand_df['brand'] == 'Audi', 'manually_selected_keywords'] = 'luxury cars, innovative technology, quattro all-wheel drive, precision engineering, performance, premium interiors, Audi design, advanced safety, cutting-edge technology, high-performance, quattro system, Audi TT, stylish cars, eco-friendly Audi, hybrid models, driving experience, high-end vehicles, sophisticated design, iconic cars, luxury sedans, sport cars, luxury SUVs, digital cockpit, precision engineering, top-tier performance, ultimate driving experience, car enthusiasts, sleek design, Audi A4, car luxury, refined craftsmanship, performance engineering'

brand_df.loc[brand_df['brand'] == 'Hyundai', 'manually_selected_keywords'] = 'affordable cars, innovative designs, Hyundai SUV, eco-friendly vehicles, advanced tech, reliable performance, value for money, hybrid cars, fuel-efficient, safety features, compact cars, family-friendly vehicles, clean energy cars, reliable vehicles, smart technology, modern interiors, efficient design, comfort, affordable SUVs, Hyundai Santa Fe, electric cars, futuristic tech, long warranty, connected cars, top safety ratings, cutting-edge designs, urban mobility, efficient driving, fuel economy, Hyundai Elantra, best warranty, future-driven'

brand_df.loc[brand_df['brand'] == 'Jeep', 'manually_selected_keywords'] = 'off-road vehicles, rugged design, adventure-ready, Jeep experience, all-terrain, iconic SUV, 4x4, Jeep Wrangler, tough vehicles, outdoor adventure, Jeep off-roading, reliable 4WD, adventure culture, Jeep power, rugged reliability, tough performance, Jeep Cherokee, adventure seekers, off-road enthusiast, Jeep spirit, SUV performance, outdoor lifestyle, Jeep Trailhawk, all-wheel drive, wilderness exploration, Jeep adventure, off-road dominance, Jeep heritage, off-road thrill, Jeep Gladiator, outdoor freedom'

brand_df.loc[brand_df['brand'] == 'Kia', 'manually_selected_keywords'] = 'affordable cars, modern design, reliable vehicles, eco-friendly cars, Kia SUV, smart tech, budget-friendly, safety features, efficient performance, hybrid models, stylish cars, advanced technology, Kia experience, comfortable driving, family-friendly cars, great warranties, high performance, Kia Sportage, cutting-edge design, fuel-efficient vehicles, fun-to-drive cars, low-cost maintenance, futuristic car models, Kia Forte, well-equipped cars, tech-savvy, sleek vehicles, efficient fuel economy, user-friendly tech, Kia Sorento'

brand_df.loc[brand_df['brand'] == 'Toyota', 'manually_selected_keywords'] = 'reliable cars, durable vehicles, Toyota innovation, hybrid cars, fuel-efficient, eco-friendly cars, Toyota SUVs, advanced safety, Toyota Camry, trusted performance, quality engineering, all-wheel drive, cutting-edge tech, Toyota Corolla, best-selling car, fuel-efficient vehicles, Toyota Prius, hybrid technology, smart safety features, comfort-driven, family-friendly vehicles, rugged trucks, advanced engineering, Toyota Tacoma, dependable cars, Toyota Tundra, high performance, quality craftsmanship, top safety ratings, efficient driving, sustainable transportation'

brand_df = brand_df.head(30)
brand_df.head()

Unnamed: 0,brand,number_of_ads,commercial_number,transcript,transcript_keywords,manually_selected_keywords
213,TMobile,22,"[AD0347, AD0348, AD0349, AD0399, AD0400, AD045...",\n\nTim Tebow here. Everyone thinks I want a c...,"mama (12), got (11), fees (11), unlimited (10)...",
33,BudLight,15,"[AD0306, AD0307, AD0308, AD0359, AD0416, AD047...","Outro Music\n\nJoe, good evening. Hello, Lily....","light (34), bud (33), corn (13), syrup (13), g...","refreshing beer, light beer, crisp taste, easy..."
35,Budweiser,14,"[AD0257, AD0258, AD0260, AD0309, AD0310, AD036...",We summon the finest of this nation to help us...,"typical (9), know (7), beer (6), home (6), sta...","king of beers, American beer, full-flavored la..."
218,Toyota,14,"[AD0298, AD0350, AD0401, AD0402, AD0460, AD056...",Hey there Henderson family I'm your rav4 genie...,"got (10), wish (8), yeah (8), let (7), said (6...","reliable cars, durable vehicles, Toyota innova..."
155,NFL,13,"[AD0285, AD0286, AD0339, AD0390, AD0443, AD050...","What a year. Lock RG3 and Russell Wilson. Now,...","got (16), yes (14), let (14), yeah (12), kid (...",


In [97]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


#TODO: REmove later on, -  drop any rows where manually_selected_keywords is empty or an empty string or whatever
brand_df = brand_df[brand_df['manually_selected_keywords'].notna()]
brand_df = brand_df[brand_df['manually_selected_keywords'] != '']
# TODO: Remove later on, - drop any rows from ad_df where transcript is empty or an empty string or whatever
ad_df = ad_df[ad_df['transcript'].notna()]
ad_df = ad_df[ad_df['transcript'] != '']
# Create a dictionary mapping brands to their keywords for quick lookup
brand_keywords_dict = dict(zip(brand_df['brand'], brand_df['manually_selected_keywords']))

# Initialize the model outside the loop for better performance
model = SentenceTransformer('all-MiniLM-L6-v2')

# Add similarity score column to ad_df
ad_df['keyword_similarity'] = None

# Calculate similarity for each ad
for idx, row in ad_df.iterrows():
    if pd.isna(row['transcript']) or row['brand'] not in brand_keywords_dict:
        continue
        
    keywords = brand_keywords_dict[row['brand']]
    transcript = row['transcript']
    
    # Get embeddings
    transcript_embedding = model.encode([transcript])
    keyword_embedding = model.encode([keywords])
    
    # Calculate similarity
    similarity = cosine_similarity(transcript_embedding, keyword_embedding)[0][0]
    ad_df.at[idx, 'keyword_similarity'] = similarity
    
    # Print details where similarity is found
    print(f"Brand: {row['brand']}")
    print(f"Transcript: {transcript}")
    print(f"Keywords: {keywords}")
    print(f"Similarity: {similarity}\n")

# Display results
#get rid of any rows with missing values in transcript column
ad_df = ad_df.dropna(subset=['transcript'])
# TODO: Remove later on
# drop rows where keyword_similarity is 1.0
ad_df = ad_df[ad_df['keyword_similarity'] != 1.0]
# sort by highest similarity
ad_df = ad_df.sort_values(by='keyword_similarity', ascending=False)
ad_df.head(20)


Brand: AvocadosfromMexico
Transcript: With the next pick in the first draft ever, Australia selects the kangaroo. Yeah, I like that pick. Get up, hops, jump vertical. Brazil selects the sloth. Off the field issues. Mauritius selects the dodo bird. And Mexico selects the avocado. Great pick. Rich volcanic soil. Perfect weather. That'll make avocados from Mexico the ideal year-round snack. Avocados from Mexico.
Keywords: fresh avocados, authentic Mexican flavor, healthy snack, versatile, green gold, Mexican avocados, avocado recipes, nutrient-rich, healthy fats, premium quality, farm to table, rich in fiber, natural, creamy texture, avocado lovers, guacamole, farm fresh, sustainable farming, avocado health benefits, heart-healthy, clean eating, avocado toast, fresh ingredients, Mexican agriculture, rich taste, sustainable, healthy lifestyle, Mexican heritage, protein-rich, gluten-free, non-GMO, best avocados
Similarity: 0.3939376771450043

Brand: AvocadosfromMexico
Transcript: Over here 

Unnamed: 0,product_category,brand,commercial_number,BDM,transcript,keyword_similarity
533,25.0,Squarespace,AD0397,0.0,Oh,0.618068
139,2.0,MichelobULTRA,AD0605,1.0,Let's all experience something together. This ...,0.591464
118,2.0,Budweiser,AD0257,0.0,We summon the finest of this nation to help us...,0.583307
96,1.0,WonderfulPistachios,AD0523,0.0,I recently discovered that pistachios are a go...,0.567073
119,2.0,Budweiser,AD0258,0.0,The loud. The savvy. The famous. It took all o...,0.543326
113,2.0,BudLight,AD0636,0.0,So you got Bud Light? Back there. Bud Light ma...,0.527644
112,2.0,BudLight,AD0590,1.0,"Better light is made with barley, butter, hops...",0.491088
79,1.0,Pringles,AD0667,0.0,Stack Pringles flavors. Make new ones. How muc...,0.489769
137,2.0,MichelobULTRA,AD0547,0.0,"Hey, did you ever hear from Michelob Ultra? I ...",0.484184
78,1.0,Pringles,AD0614,0.0,"I'm stacking cheddar, jalapeno, and sour cream...",0.482724


## Ansatz 1 (Machine learning)

In [98]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from imblearn.under_sampling import RandomUnderSampler

# Prepare features
X = ad_df[['keyword_similarity']].copy()
y = ad_df['BDM'].astype(float)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Balance the dataset using undersampling
undersampler = RandomUnderSampler(random_state=42)
X_balanced, y_balanced = undersampler.fit_resample(X, y)

# Split the balanced data
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42
)

# Train Random Forest (no need for class_weight now since data is balanced)
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

dummy = DummyClassifier(strategy='uniform', random_state=42)

dummy.fit(X_train, y_train)

# Get dummy predictions
dummy_pred = dummy.predict(X_test)
dummy_pred_proba = dummy.predict_proba(X_test)[:, 1]

# Calculate metrics for both models
print("Random Forest Performance:")
print(f"Accuracy: {accuracy:.3f}")
print(f"ROC AUC: {roc_auc:.3f}")
print("\nDetailed Classification Report (Random Forest):")
print(classification_report(y_test, y_pred))

print("\nDummy Classifier Performance:")
print(f"Accuracy: {accuracy_score(y_test, dummy_pred):.3f}")
print(f"ROC AUC: {roc_auc_score(y_test, dummy_pred_proba):.3f}")
print("\nDetailed Classification Report (Dummy):")
print(classification_report(y_test, dummy_pred))

# Feature importance
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(importance_df)
print("\nOriginal BDM Distribution:")
bdm_counts = y.value_counts()
print(f"BDM = 0: {bdm_counts[0]} rows")
print(f"BDM = 1: {bdm_counts[1]} rows")
print(f"Total: {len(y)} rows")
print(f"Percentage of BDM=1: {(bdm_counts[1]/len(y))*100:.1f}%")

print("\nBalanced BDM Distribution:")
balanced_counts = y_balanced.value_counts()
print(f"BDM = 0: {balanced_counts[0]} rows")
print(f"BDM = 1: {balanced_counts[1]} rows")
print(f"Total: {len(y_balanced)} rows")
print(f"Percentage of BDM=1: {(balanced_counts[1]/len(y_balanced))*100:.1f}%")

Random Forest Performance:
Accuracy: 0.500
ROC AUC: 0.525

Detailed Classification Report (Random Forest):
              precision    recall  f1-score   support

         0.0       0.55      0.21      0.31        28
         1.0       0.49      0.81      0.61        26

    accuracy                           0.50        54
   macro avg       0.52      0.51      0.46        54
weighted avg       0.52      0.50      0.45        54


Dummy Classifier Performance:
Accuracy: 0.574
ROC AUC: 0.500

Detailed Classification Report (Dummy):
              precision    recall  f1-score   support

         0.0       0.60      0.54      0.57        28
         1.0       0.55      0.62      0.58        26

    accuracy                           0.57        54
   macro avg       0.58      0.58      0.57        54
weighted avg       0.58      0.57      0.57        54


Feature Importance:
              feature  importance
0  keyword_similarity         1.0

Original BDM Distribution:
BDM = 0: 388 rows
B

## Ansatz 2 (Deep learning) ~ Spam Classification

In [99]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, f1_score


# 2. Dataset class for PyTorch
class AdDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        transcript = self.dataframe.iloc[index]["transcript"]
        label = self.dataframe.iloc[index]["BDM"]
        inputs = self.tokenizer(transcript, padding='max_length', max_length=self.max_len, truncation=True, return_tensors="pt")
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 3. Tokenizer and Dataset Preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = AdDataset(ad_df, tokenizer, max_len=128)

# Split into train and test datasets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# 4. Model Setup
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = AdamW(model.parameters(), lr=2e-5)

# 5. Training Loop
def train_model():
    model.train()
    for epoch in range(3):  # Training for 3 epochs
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Train the model
train_model()

# 6. Evaluation Function
def evaluate_model():
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask']
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            labels = batch['labels'].detach().cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels)

    acc = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    print(f"Accuracy: {acc:.2f}")
    print(f"F1 Score: {f1:.2f}")
    # roc auc score
    roc_auc = roc_auc_score(true_labels, predictions)
    print(f"ROC AUC Score: {roc_auc:.2f}")

# Evaluate the model
evaluate_model()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.4285528063774109
Epoch 2, Loss: 0.4771921634674072
Epoch 3, Loss: 0.11486964672803879
Accuracy: 0.76
F1 Score: 0.32
ROC AUC Score: 0.58
