# Detecting BDM in Super Bowl Commercials

## Notebook Settings

In [None]:

%pip install -r requirements.txt

In [2]:
import pandas as pd
import os

In [None]:
import logging
from datetime import datetime

# Send every log record of level INFO and above to log.log,
# each line prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp and the level name.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='log.log',
)

def update_df_with_csv(df, csv_filepath, merge_key):
    """
    Left-join the contents of a CSV file onto ``df``.

    Every row of ``df`` is kept. For a column that exists on both sides
    (other than ``merge_key``) the CSV's values replace the ones in ``df``;
    rows whose key is absent from the CSV therefore end up with NaN there.
    ``df`` itself is not modified.

    Parameters:
    - df: DataFrame to update.
    - csv_filepath: Path to the CSV file to merge with.
    - merge_key: Column name to merge on.

    Returns:
    - A new DataFrame with the CSV's columns merged in.
    """
    incoming = pd.read_csv(csv_filepath)

    # Columns present on both sides; the incoming version wins for these.
    overlapping = [
        name for name in incoming.columns
        if name != merge_key and name in df.columns
    ]

    # Incoming duplicates arrive as "<name>_incoming"; originals keep their name.
    merged = df.merge(
        incoming,
        how='left',
        on=merge_key,
        suffixes=('', '_incoming'),
    )

    # Discard the originals, then let the incoming columns take over their names.
    merged = merged.drop(columns=overlapping)
    return merged.rename(columns={f'{name}_incoming': name for name in overlapping})

In [4]:
# We have some specific knowledge about the industry and brand, so we can use that to improve the model.
# This data only exists for a few brands and products. Activate or deactivate as needed.

# Set DEBUGGING to True to restrict the run to the first 20 commercials and skip the OCR call (for debugging).

DEBUGGING = False

# Directory in which the per-feature CSV checkpoints are stored.
CSV_PATH = 'csvs'

In [5]:
# Set RESTART_FROM_SCRATCH to True to rerun the whole notebook from scratch.
# WARNING: this deletes the checkpoint directory together with every CSV saved in it.
import shutil

RESTART_FROM_SCRATCH = False

# Derived from CSV_PATH so that the directory being wiped is always the one the
# checkpoints are written to, even if CSV_PATH is changed.
csv_dir = f"./{CSV_PATH}"
if RESTART_FROM_SCRATCH and os.path.exists(csv_dir):
    shutil.rmtree(csv_dir)
    logging.info(f"Directory '{csv_dir}' has been deleted.")

In [6]:
# Checkpoints
# A full run takes very long, so each stage saves its extracted features as a CSV.
# A stage is only recomputed when its CSV "checkpoint" does not exist yet.

def _checkpoint_exists(csv_name, label):
    """Return whether CSV_PATH/<csv_name>.csv exists, logging the result under ``label``."""
    found = os.path.exists(f'{CSV_PATH}/{csv_name}.csv')
    logging.info(f"{label} checkpoint: {found}")
    return found


BASELINE_CHECKPOINT = _checkpoint_exists('baseline', 'Baseline')
TRANSCRIPT_CHECKPOINT = _checkpoint_exists('transcript', 'Transcript')
OCR_CHECKPOINT = _checkpoint_exists('ocr', 'OCR')
BDM_WORDS_CHECKPOINT = _checkpoint_exists('bdm_words', 'BDM words')
ADJ_NOUN_PAIRS_CHECKPOINT = _checkpoint_exists('adj_noun_pairs', 'Adj noun pairs')
PRODUCT_SEMANTIC_SIMILARITY_CHECKPOINT = _checkpoint_exists(
    'product_semantic_similarity', 'Product semantic similarity'
)
BRAND_SEMANTIC_SIMILARITY_CHECKPOINT = _checkpoint_exists(
    'brand_semantic_similarity', 'Brand semantic similarity'
)
PERSONAL_PRONOUNS_CHECKPOINT = _checkpoint_exists('personal_pronouns', 'Personal pronouns')
COMPARISONS_CHECKPOINT = _checkpoint_exists('comparisons', 'Comparisons')



# CRISP-DM 3: Data Preparation

In [7]:

def download_videos():
    """Download the commercials archive and unpack it into the ``ADs`` directory.

    Uses the external ``wget`` and ``unzip`` programs, which must be on PATH.
    The commands are passed as argument lists (no shell), and their exit
    status is checked so that a failed download or a broken archive is
    reported instead of being silently ignored. The downloaded archive is
    removed afterwards, whether or not extraction succeeded.

    Raises:
        subprocess.CalledProcessError: if ``wget`` fails, or ``unzip`` exits
            with a status above 1.
        FileNotFoundError: if ``wget`` or ``unzip`` is not installed.
    """
    import subprocess

    url = 'https://box.fu-berlin.de/s/zwxKp8PXkCwAwGe/download'
    download_filename = 'downloaded_archive.zip'
    target_directory = 'ADs'

    try:
        subprocess.run(['wget', '-O', download_filename, url], check=True)
        os.makedirs(target_directory, exist_ok=True)

        unzip_cmd = ['unzip', '-o', download_filename, '-d', target_directory]
        unzip_result = subprocess.run(unzip_cmd)
        # unzip exit status 1 means "warnings, but processing completed";
        # only higher values indicate a failed extraction.
        if unzip_result.returncode > 1:
            raise subprocess.CalledProcessError(unzip_result.returncode, unzip_cmd)
    finally:
        # wget -O creates the output file even when the download fails.
        if os.path.exists(download_filename):
            os.remove(download_filename)

    logging.info(f"Archive extracted to {target_directory} and {download_filename} removed.")


# The download is currently disabled; uncomment the call to fetch the videos
# when no baseline checkpoint exists.
if not BASELINE_CHECKPOINT:
    pass
    # download_videos()

In [8]:
# Build the baseline table from the two Excel sources (skipped when a baseline checkpoint exists).
ad_df = pd.DataFrame()
if not BASELINE_CHECKPOINT:
    BDM_excel = pd.read_excel('BDM.xlsx')
    final_excel = pd.read_excel('previous_project_results.xlsx')

    # Take the BDM label from BDM.xlsx. If the previous results already carry a
    # BDM column, the merge renames it to BDM_old and it is discarded here.
    bdm_labels = BDM_excel[['AdNumber', 'BDM']]
    final_excel = final_excel.merge(
        bdm_labels,
        how='left',
        on='AdNumber',
        suffixes=('_old', ''),
    )
    final_excel = final_excel.drop('BDM_old', axis=1, errors='ignore')

    # Keep only the columns used in this notebook and rename them.
    column_names = {
        'cont_primary_product_type': 'product_category',
        'BRAND': 'brand',
        'AdNumber': 'commercial_number',
        'cont_com_appeal': 'commercial_appeal',
        'cont_csr_type': 'csr_type',
        'Emotion_from_Dialogue': 'emotion_from_dialogue',
    }
    ad_df = final_excel[list(column_names) + ['BDM']].rename(columns=column_names)

    # commercial_appeal: manual coding by the marketing team
    # (1 = rational, 2 = balanced, 3 = emotional). It is dropped below.
    #
    # emotion_from_dialogue: sentiment analysis of the audio transcription from
    # the last project group (per the original note: p < 0.8 = neutral,
    # p > 0.8 = emotional). It is reduced to a binary flag:
    # 1 if the label is one of the target emotions, otherwise 0.
    target_emotions = ['love', 'joy', 'surprise', 'sadness', 'anger', 'fear']
    is_emotional = ad_df['emotion_from_dialogue'].isin(target_emotions)
    ad_df['encoded_emotion'] = is_emotional.astype('int64')
    ad_df = ad_df.drop(columns=['emotion_from_dialogue', 'commercial_appeal'])

    # Commercials whose BDM label is manually set to 1.0.
    bdm_override_numbers = ['AD0262', 'AD0284', 'AD0332', 'AD0348', 'AD0370', 'AD0375', 'AD0399', 'AD0482', 'AD0539', 'AD0749']
    ad_df.loc[ad_df['commercial_number'].isin(bdm_override_numbers), 'BDM'] = 1.0

    # Rows without a commercial number cannot be matched to a video; drop them.
    ad_df = ad_df.dropna(subset=['commercial_number'])
    display(ad_df.head(10))

## Brand Keywords

In [9]:
# Attach the brand keyword list from product_brands.csv to every commercial.
if not BASELINE_CHECKPOINT:
    product_brand_df = pd.read_csv("product_brands.csv")

    # Compare brand names with spaces removed and in lower case on both sides.
    product_brand_df['brand'] = product_brand_df['brand'].str.replace(' ', '').str.lower()
    normalized_brand = ad_df['brand'].str.replace(' ', '').str.lower()

    # The keyword table's own 'brand' column collides with ad_df's and comes
    # out of the merge as 'brand_brand'; both helper columns are removed again.
    ad_df = (
        ad_df.assign(brand_clean=normalized_brand)
        .merge(
            product_brand_df[['brand', 'product_brand_keywords']],
            how='left',
            left_on='brand_clean',
            right_on='brand',
            suffixes=('', '_brand'),
        )
        .drop(columns=['brand_clean', 'brand_brand'])
    )


## Product Category Keywords

In [10]:
import ast

# Map every commercial's brand to its product category row from product_categories.csv.
if not BASELINE_CHECKPOINT:
    product_brands_df = pd.read_csv("product_categories.csv")
    product_brands_df = product_brands_df.drop('product_cat_id', axis=1)
    ad_df = ad_df.drop('product_category', axis=1)
    display(product_brands_df)
    display(ad_df)

    # Normalised brand name (no spaces, lower case) -> all columns of its category row.
    brand_to_info = {}
    for _, row in product_brands_df.iterrows():
        category_info = {col: row[col] for col in product_brands_df.columns}
        # 'product_cat_brands' holds a Python list literal as text. It is parsed
        # with ast.literal_eval instead of eval so that content of the CSV file
        # can never be executed as code.
        # NOTE(review): assumes every cell is a plain list literal - confirm against the CSV.
        for brand in ast.literal_eval(row['product_cat_brands']):
            brand_to_info[brand.replace(' ', '').lower()] = category_info

    def find_brand_info(brand):
        """Return the category row (as a dict) for ``brand``, or None if it is missing or unmapped."""
        if pd.isna(brand):
            return None
        clean_brand = brand.replace(' ', '').lower()
        return brand_to_info.get(clean_brand)

    def _category_value(brand, col):
        """Return column ``col`` of the brand's category row, or None when the brand is unmapped."""
        info = find_brand_info(brand)
        return info[col] if info else None

    for col in product_brands_df.columns:
        ad_df[col] = ad_df['brand'].apply(_category_value, args=(col,))

    unmapped_brands = ad_df[ad_df['product_cat_name'].isna()]['brand'].unique()
    if len(unmapped_brands) > 0:
        logging.info("Brands without category mapping:")
        for brand in unmapped_brands:
            logging.info(f"- {brand}")

    # NOTE: rows whose product_cat_name is NaN are only logged above; they are not dropped here.


In [11]:
# Debugging: when building a fresh baseline in debug mode, keep only the first 20 commercials.
if not BASELINE_CHECKPOINT and DEBUGGING:
    ad_df = ad_df.head(20)

In [12]:
# Persist the freshly built baseline table as the first checkpoint.
BASELINE_COLUMNS = ad_df.columns
if not BASELINE_CHECKPOINT:
    os.makedirs(CSV_PATH, exist_ok=True)
    baseline_table = ad_df[BASELINE_COLUMNS]
    baseline_table.to_csv(f'{CSV_PATH}/baseline.csv', index=False)

## Feature Extraction

In [13]:
# Always continue from the baseline CSV, whether it was just written or already existed.
ad_df = pd.read_csv(f'{CSV_PATH}/baseline.csv')

### Feature: Transcript

In [None]:
import glob
from pathlib import Path
from transcript import transcribe_video
from ocr import ocr
TRANSCRIPT_COLUMNS = ['transcript']
if not TRANSCRIPT_CHECKPOINT:
    ads_dir = "ADs"

    def find_video_file(commercial_number, ads_dir):
        """Return the first <commercial_number>.mp4 found anywhere below ads_dir, or None."""
        hits = glob.glob(f"{ads_dir}/**/{commercial_number}.mp4", recursive=True)
        if not hits:
            return None
        return hits[0]

    # Index labels of commercials for which no video file exists.
    missing_video_rows = []

    for idx, commercial_number in ad_df['commercial_number'].items():
        video_path = find_video_file(commercial_number, ads_dir)
        if not video_path:
            logging.info(f"Video not found for commercial {commercial_number}")
            missing_video_rows.append(idx)
            continue
        ad_df.at[idx, 'transcript'] = transcribe_video(video_path)

    # Commercials without a video are removed from the table before saving.
    ad_df = ad_df.drop(missing_video_rows)
    transcript_table = ad_df[TRANSCRIPT_COLUMNS + ['commercial_number']]
    transcript_table.to_csv(f'{CSV_PATH}/transcript.csv', index=False)

In [15]:
# Merge the transcript checkpoint into the table; the CSV's values take precedence.
ad_df = update_df_with_csv(ad_df, f'{CSV_PATH}/transcript.csv', 'commercial_number')
# Commercials without a transcript get an empty string instead of NaN.
ad_df['transcript'] = ad_df['transcript'].fillna('')


### Feature: OCR

In [None]:
import glob
from pathlib import Path
from transcript import transcribe_video
from ocr import ocr
OCR_COLUMNS = ['ocr_text']
if not OCR_CHECKPOINT:
    ads_dir = "ADs"

    def find_video_file(commercial_number, ads_dir):
        """Return the first <commercial_number>.mp4 found anywhere below ads_dir, or None."""
        hits = glob.glob(f"{ads_dir}/**/{commercial_number}.mp4", recursive=True)
        if not hits:
            return None
        return hits[0]

    for idx, commercial_number in ad_df['commercial_number'].items():
        video_path = find_video_file(commercial_number, ads_dir)

        if not video_path:
            logging.info(f"Video not found for commercial {commercial_number}")
            ocr_text = ' '
        elif DEBUGGING:
            # OCR is skipped in debug mode; a single-space placeholder is stored instead.
            ocr_text = ' '
        else:
            ocr_text = ocr(video_path)

        ad_df.at[idx, 'ocr_text'] = ocr_text

    ocr_table = ad_df[OCR_COLUMNS + ['commercial_number']]
    ocr_table.to_csv(f'{CSV_PATH}/ocr.csv', index=False)

In [17]:
# Merge the OCR checkpoint into the table; the CSV's values take precedence.
ad_df = update_df_with_csv(ad_df, f'{CSV_PATH}/ocr.csv', 'commercial_number')

# Commercials without OCR text get an empty string instead of NaN.
ad_df['ocr_text'] = ad_df['ocr_text'].fillna('')

### Feature: Superlatives, Comparatives, Uniqueness

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import spacy
import pandas as pd
from collections import Counter
import text_analysis as ta

# Columns stored in the bdm_words checkpoint: the same eleven superlative /
# comparative / uniqueness features, once for the transcript and once for the OCR text.
BDM_WORDS_COLUMNS = [
    # transcript
    'transcript_superlatives',
    'transcript_comparatives',
    'transcript_unique_words',
    'transcript_superlative_count',
    'transcript_comparative_count',
    'transcript_uniqueness_count',
    'transcript_superlative_pct',
    'transcript_comparative_pct',
    'transcript_uniqueness_pct',
    'transcript_total_bdm_terms_count',
    'transcript_total_bdm_terms_pct',
    # OCR text
    'ocr_text_superlatives',
    'ocr_text_comparatives',
    'ocr_text_unique_words',
    'ocr_text_superlative_count',
    'ocr_text_comparative_count',
    'ocr_text_uniqueness_count',
    'ocr_text_superlative_pct',
    'ocr_text_comparative_pct',
    'ocr_text_uniqueness_pct',
    'ocr_text_total_bdm_terms_count',
    'ocr_text_total_bdm_terms_pct',
]


# Compute the BDM word features for both text sources and save them as a checkpoint.
if not BDM_WORDS_CHECKPOINT:
    for text_column in ('transcript', 'ocr_text'):
        ad_df = ta.process_text_data(ad_df, text_column)

    bdm_words_table = ad_df[['commercial_number'] + BDM_WORDS_COLUMNS]
    bdm_words_table.to_csv(f'{CSV_PATH}/bdm_words.csv', index=False)

In [20]:
# Merge the BDM word checkpoint into the table; the CSV's values take precedence.
ad_df = update_df_with_csv(ad_df, f'{CSV_PATH}/bdm_words.csv', 'commercial_number')

# Missing values: '' for object (text) columns, 0 for numeric columns.
for column in BDM_WORDS_COLUMNS:
    values = ad_df[column]
    if values.dtype == 'object':
        ad_df[column] = values.fillna('')
    elif pd.api.types.is_numeric_dtype(values):
        ad_df[column] = values.fillna(0)

### Feature: Nouns + Adjectives

In [21]:
ADJ_NOUN_PAIRS_COLUMNS = [
    'transcript_adj_noun_pairs',
    'transcript_num_adj_noun_pairs',
    'ocr_text_adj_noun_pairs',
    'ocr_text_num_adj_noun_pairs',
]
if not ADJ_NOUN_PAIRS_CHECKPOINT:
    # For each text source: extract the adjective-noun pairs, then count them.
    for text_column in ("transcript", "ocr_text"):
        pairs = ad_df[text_column].apply(ta.extract_adj_noun_pairs)
        ad_df[f"{text_column}_adj_noun_pairs"] = pairs
        ad_df[f"{text_column}_num_adj_noun_pairs"] = pairs.apply(len)

    adj_noun_table = ad_df[ADJ_NOUN_PAIRS_COLUMNS + ['commercial_number']]
    adj_noun_table.to_csv(f'{CSV_PATH}/adj_noun_pairs.csv', index=False)

In [22]:
# Merge the adjective-noun checkpoint into the table; the CSV's values take precedence.
ad_df = update_df_with_csv(ad_df, f'{CSV_PATH}/adj_noun_pairs.csv', 'commercial_number')

# Missing values: '' for object (text) columns, 0 for numeric columns.
for column in ADJ_NOUN_PAIRS_COLUMNS:
    values = ad_df[column]
    if values.dtype == 'object':
        ad_df[column] = values.fillna('')
    elif pd.api.types.is_numeric_dtype(values):
        ad_df[column] = values.fillna(0)

### Feature: Semantic Similarity

In [None]:
# Preview the first rows of the table before the similarity features are added.
display(ad_df.head(5))

In [24]:
import numpy as np

import logging
import text_analysis as ta

# Semantic similarity between each text source and the product-category /
# brand keyword lists. Each feature group has its own checkpoint file.
PRODUCT_SEMANTIC_SIMILARITY_COLUMNS = ['transcript_product_cat_keywords_similarity', 'transcript_product_cat_keywords_top_keywords', 'ocr_text_product_cat_keywords_similarity', 'ocr_text_product_cat_keywords_top_keywords']
BRAND_SEMANTIC_SIMILARITY_COLUMNS = ['transcript_product_brand_keywords_similarity', 'transcript_product_brand_keywords_top_keywords', 'ocr_text_product_brand_keywords_similarity', 'ocr_text_product_brand_keywords_top_keywords']

# Each group is computed AND saved inside its own checkpoint branch. Saving both
# files from the brand branch left the product CSV unwritten when only the brand
# checkpoint existed, and tried to save product columns that were never computed
# when only the product checkpoint existed.
if not PRODUCT_SEMANTIC_SIMILARITY_CHECKPOINT:
    ad_df = ta.calculate_semantic_similarities(ad_df, 'transcript', 'product_cat_keywords')
    ad_df = ta.calculate_semantic_similarities(ad_df, 'ocr_text', 'product_cat_keywords')
    ad_df[PRODUCT_SEMANTIC_SIMILARITY_COLUMNS + ['commercial_number']].to_csv(f'{CSV_PATH}/product_semantic_similarity.csv', index=False)

if not BRAND_SEMANTIC_SIMILARITY_CHECKPOINT:
    ad_df = ta.calculate_semantic_similarities(ad_df, 'transcript', 'product_brand_keywords')
    ad_df = ta.calculate_semantic_similarities(ad_df, 'ocr_text', 'product_brand_keywords')
    ad_df[BRAND_SEMANTIC_SIMILARITY_COLUMNS + ['commercial_number']].to_csv(f'{CSV_PATH}/brand_semantic_similarity.csv', index=False)

In [25]:
# Product-category similarity: merge the checkpoint, then fill gaps
# ('' for object columns, 0 for numeric columns).
ad_df = update_df_with_csv(ad_df, f'{CSV_PATH}/product_semantic_similarity.csv', 'commercial_number')
for column in PRODUCT_SEMANTIC_SIMILARITY_COLUMNS:
    values = ad_df[column]
    if values.dtype == 'object':
        ad_df[column] = values.fillna('')
    elif pd.api.types.is_numeric_dtype(values):
        ad_df[column] = values.fillna(0)

# Brand similarity: merge the checkpoint and fill only the object columns.
# Numeric brand similarities are left as NaN here.
ad_df = update_df_with_csv(ad_df, f'{CSV_PATH}/brand_semantic_similarity.csv', 'commercial_number')
for column in BRAND_SEMANTIC_SIMILARITY_COLUMNS:
    values = ad_df[column]
    if values.dtype == 'object':
        ad_df[column] = values.fillna('')

# Export a snapshot that contains only the rows without any missing value.
output_df = ad_df.dropna()
output_df.to_csv(f'{CSV_PATH}/ad_df.csv', index=False)


### Feature: Numeric Comparisons

In [26]:
# Example of the kind of text this feature targets:
#   "Our product is 10 times faster and 50% more efficient."
COMPARISONS_COLUMNS = [
    'transcript_comparisons',
    'ocr_text_comparisons',
    'transcript_num_comparisons',
    'ocr_text_num_comparisons',
]
if not COMPARISONS_CHECKPOINT:
    text_columns = ("transcript", "ocr_text")

    # First extract the comparisons for both sources, then count them.
    for text_column in text_columns:
        ad_df[f"{text_column}_comparisons"] = ad_df[text_column].apply(ta.apply_on_transcript)
    for text_column in text_columns:
        ad_df[f"{text_column}_num_comparisons"] = ad_df[f"{text_column}_comparisons"].apply(len)

    comparisons_table = ad_df[COMPARISONS_COLUMNS + ['commercial_number']]
    comparisons_table.to_csv(f'{CSV_PATH}/comparisons.csv', index=False)


In [27]:
# Merge the comparisons checkpoint into the table; the CSV's values take precedence.
ad_df = update_df_with_csv(ad_df, f'{CSV_PATH}/comparisons.csv', 'commercial_number')

# Missing values: '' for object (text) columns, 0 for numeric columns.
for column in COMPARISONS_COLUMNS:
    values = ad_df[column]
    if values.dtype == 'object':
        ad_df[column] = values.fillna('')
    elif pd.api.types.is_numeric_dtype(values):
        ad_df[column] = values.fillna(0)


### Feature: Personal Pronouns

In [28]:
import text_analysis as ta
# Columns of the personal-pronoun checkpoint. Note that the merge key
# 'commercial_number' is already part of this list.
PERSONAL_PRONOUNS_COLUMNS = [
    'commercial_number',
    'transcript_contains_i',
    'ocr_text_contains_i',
    'transcript_contains_we',
    'ocr_text_contains_we',
    'transcript_contains_you',
    'ocr_text_contains_you',
    'transcript_contains_he',
    'ocr_text_contains_he',
    'transcript_contains_she',
    'ocr_text_contains_she',
    'transcript_contains_it',
    'ocr_text_contains_it',
    'transcript_contains_they',
    'ocr_text_contains_they',
]

if not PERSONAL_PRONOUNS_CHECKPOINT:
    # One detector per pronoun, applied to the transcript and to the OCR text.
    pronoun_detectors = [
        ('i', ta.contains_i),
        ('we', ta.contains_we),
        ('you', ta.contains_you),
        ('he', ta.contains_he),
        ('she', ta.contains_she),
        ('it', ta.contains_it),
        ('they', ta.contains_they),
    ]
    for pronoun, detector in pronoun_detectors:
        ad_df[f'transcript_contains_{pronoun}'] = ad_df['transcript'].apply(detector)
        ad_df[f'ocr_text_contains_{pronoun}'] = ad_df['ocr_text'].apply(detector)

    # The merge key is already in PERSONAL_PRONOUNS_COLUMNS. Appending it a second
    # time wrote the column twice, and reading the file back produced an extra
    # 'commercial_number.1' column that was merged into ad_df. De-duplicate while
    # keeping the column order.
    export_columns = list(dict.fromkeys(PERSONAL_PRONOUNS_COLUMNS + ['commercial_number']))
    ad_df[export_columns].to_csv(f'{CSV_PATH}/personal_pronouns.csv', index=False)

In [29]:
# Merge the pronoun checkpoint into the table; the CSV's values take precedence.
ad_df = update_df_with_csv(ad_df, f'{CSV_PATH}/personal_pronouns.csv', 'commercial_number')

# Every listed column (this includes 'commercial_number') gets 0 for missing values.
for column in PERSONAL_PRONOUNS_COLUMNS:
    ad_df[column] = ad_df[column].fillna(0)


## Data Selection

In [30]:
def check_missing_values(df, check_for_empty_string=False):
    """Display a report of the missing (null) values in ``df``.

    Always displays the per-column null counts of the columns that have any.
    If nothing is missing a success message is printed; otherwise a failure
    marker is printed and the rows containing nulls are displayed.

    Parameters:
    - df: DataFrame to inspect.
    - check_for_empty_string: when True and nulls exist, additionally display
      only 'commercial_number' plus the affected columns of the incomplete rows.
      Despite its name, this flag does not look for empty strings.
    """
    null_mask = df.isnull()
    missing_summary = null_mask.sum()
    columns_with_gaps = missing_summary[missing_summary > 0]
    display(columns_with_gaps)

    if missing_summary.sum() == 0:
        print("✅ No missing values found")
        return

    print("❌")
    incomplete_rows = df[null_mask.any(axis=1)]
    display(incomplete_rows)
    if check_for_empty_string:
        display(incomplete_rows[['commercial_number'] + list(columns_with_gaps.index)])

In [31]:


# Similarity features that rely on brand keywords.
BRAND_SPECIFIC_COLUMNS = [
    'transcript_product_brand_keywords_similarity',
    'ocr_text_product_brand_keywords_similarity',
]

# Similarity features that rely on product-category keywords.
INDUSTRY_SPECIFIC_COLUMNS = [
    'transcript_product_cat_keywords_similarity',
    'ocr_text_product_cat_keywords_similarity',
]

In [32]:
# Replace any remaining missing OCR text with an empty string before modeling.
ad_df.loc[ad_df['ocr_text'].isna(), 'ocr_text'] = ''

In [None]:
import models as m
ad_df = m.prepare_df_for_modeling(ad_df)

# Row filters: commercials that have a transcript similarity for the brand /
# product-category keywords.
has_brand_similarity = ad_df['transcript_product_brand_keywords_similarity'].notna()
has_industry_similarity = ad_df['transcript_product_cat_keywords_similarity'].notna()

# vanilla: all rows, without any brand- or industry-specific column.
vanilla_df = ad_df.drop(columns=BRAND_SPECIFIC_COLUMNS + INDUSTRY_SPECIFIC_COLUMNS)

# brand_specific: rows with brand similarity, without the industry columns.
brand_specific_df = ad_df[has_brand_similarity].drop(columns=INDUSTRY_SPECIFIC_COLUMNS)

# industry_specific: rows with category similarity, without the brand columns.
industry_specific_df = ad_df[has_industry_similarity].drop(columns=BRAND_SPECIFIC_COLUMNS)

# industry_and_brand_specific: rows with both similarities, all columns kept.
industry_and_brand_specific_df = ad_df[has_industry_similarity & has_brand_similarity]

datasets = [
    {
        "name": "vanilla",
        "df": vanilla_df,
        "brand_specific": False,
        "industry_specific": False,
    },
    {
        "name": "industry_specific",
        "df": industry_specific_df,
        "brand_specific": False,
        "industry_specific": True,
    },
    {
        "name": "brand_specific",
        "df": brand_specific_df,
        "brand_specific": True,
        "industry_specific": False,
    },
    {
        "name": "industry_and_brand_specific",
        "df": industry_and_brand_specific_df,
        "brand_specific": True,
        "industry_specific": True,
    },
]

# Save every variant first, then report its missing values.
for dataset in datasets:
    dataset["df"].to_csv(f'{CSV_PATH}/{dataset["name"]}.csv', index=False)
for dataset in datasets:
    check_missing_values(dataset["df"])

# CRISP-DM 4: Modeling, 5: Evaluation

In [None]:

from IPython.display import Markdown
import models as m






# Tune, train and evaluate the models once per dataset variant.
for dataset in datasets:
    display(Markdown(f"# {dataset['name']} Model"))
    check_missing_values(dataset["df"])
    # NOTE: ad_df is rebound here, so after the loop it refers to the last
    # variant's feature frame rather than to the full table.
    ad_df = dataset["df"]
    commercial_numbers = ad_df['commercial_number']
    # drop() returns a new frame, so dataset["df"] keeps its commercial_number column.
    ad_df = ad_df.drop(columns=['commercial_number'])
    original_data = ad_df.copy()

    # Take the target before BDM is removed from the features.
    target = ad_df['BDM']
    # data is the same object as ad_df; the in-place drop below affects both names.
    data = ad_df
    data.drop(columns=['BDM'], inplace=True)
    display(data.head(10))


    # NOTE(review): X_train / X_test / y_train / y_test are not passed to any call
    # below; tuning, training and evaluation all receive the full data/target.
    # Unless the functions in models do their own hold-out or cross-validation,
    # the reported metrics are computed on training data - confirm.
    X_train, X_test, y_train, y_test = m.train_test_split(data, target, test_size=0.2, random_state=42)

    base_models = m.get_base_models()
    param_distributions = m.get_param_distributions()
    tuned_models = m.tune_models(data, target, base_models, param_distributions)



    # The dataset's industry/brand flags are forwarded to the training routine.
    trained_models = m.train_models(data, target, tuned_models, dataset["industry_specific"], dataset["brand_specific"])

    display(Markdown(f"## 5 Evaluation"))
    
    results_df, predictions = m.evaluate_models(data, target, trained_models)
    m.display_model_results(data, target, trained_models, results_df, predictions)
