In [1]:
!pip install transformers



In [2]:
!pip install sentence-transformers --quiet

In [3]:
!pip install textstat --quiet

In [4]:
!pip install catboost --quiet

In [105]:
import re
import os
import numpy as np
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import textstat
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

### Data Preprocessing

In [6]:
df = pd.read_csv("ads_creative_text_sample.csv")
df

Unnamed: 0,text,dimensions
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)"
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)"
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)"
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)"
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)"
...,...,...
995,Ochrome OS\nDevice management\nmade simple and...,"(300, 600)"
996,"Continue to Recipe\n3 Easy Steps:\n1) Click ""C...","(336, 280)"
997,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,"(970, 250)"
998,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,"(300, 250)"


In [7]:
def clean_ad_text(text):
    text = text.replace("\n", " ")
    text = " ".join(text.split()) 
    return text


In [8]:
df["clean_text"] = df["text"].apply(clean_ad_text)
df[["text", "clean_text"]].head(5)

Unnamed: 0,text,clean_text
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,Up to $100 off roundtrip flights to Ireland. T...
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,yp The Real ур Yellow Pages Find cheap gas nea...
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,Food Navigator USA Plant-based meat: Beyond th...
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos..."


In [9]:
def capital_features(text):
    if not isinstance(text, str):
        text = ""

    words = text.split()
    num_words = len(words)
    caps_words = sum(1 for w in words if w.isupper() and len(w) > 1)
    capital_ratio = caps_words / num_words if num_words else 0
    num_exclamations = text.count("!")

    return pd.Series({
        "num_words": num_words,
        "num_caps_words": caps_words,
        "capital_ratio": min(capital_ratio, 0.6),
        "num_exclamations": num_exclamations
    })


In [10]:
cap_df = df["clean_text"].apply(capital_features)
cap_df

Unnamed: 0,num_words,num_caps_words,capital_ratio,num_exclamations
0,27.0,1.0,0.037037,0.0
1,51.0,5.0,0.098039,1.0
2,23.0,14.0,0.600000,0.0
3,27.0,11.0,0.407407,0.0
4,18.0,5.0,0.277778,2.0
...,...,...,...,...
995,22.0,1.0,0.045455,0.0
996,19.0,0.0,0.000000,1.0
997,15.0,8.0,0.533333,0.0
998,10.0,3.0,0.300000,0.0


In [11]:
df = pd.concat([df, cap_df], axis=1)

In [12]:
df.columns

Index(['text', 'dimensions', 'clean_text', 'num_words', 'num_caps_words',
       'capital_ratio', 'num_exclamations'],
      dtype='object')

In [13]:
df.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0


In [14]:
df.isna().sum()

text                0
dimensions          0
clean_text          0
num_words           0
num_caps_words      0
capital_ratio       0
num_exclamations    0
dtype: int64

### Cleaning of Clean_Text

In [15]:
df["clean_text"] = df["clean_text"].fillna("")

In [16]:
df["clean_text"].head(5)

0    Up to $100 off roundtrip flights to Ireland. T...
1    yp The Real ур Yellow Pages Find cheap gas nea...
2    Food Navigator USA Plant-based meat: Beyond th...
3    MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4    YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...
Name: clean_text, dtype: object

### Twitter Roberta Sentiment Model

In [17]:
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment"
)



Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [18]:
sentiment_pipe("LIMITED TIME OFFER! BUY ONE GET ONE FREE")

[{'label': 'LABEL_2', 'score': 0.7480954527854919}]

In [19]:
sentiment_pipe("Buy one Get one free")

[{'label': 'LABEL_1', 'score': 0.6027004718780518}]

In [20]:
def sentiment_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.5  # neutral fallback

    result = sentiment_pipe(text)[0]["label"]

    mapping = {
        "LABEL_2": 1.0,  # positive
        "LABEL_1": 0.5,  # neutral
        "LABEL_0": 0.0   # negative
    }

    return mapping[result]


In [21]:
df["sentiment_score_test"] = df["clean_text"].head(10).apply(sentiment_score)

In [22]:
df[["clean_text", "sentiment_score_test"]].head(10)

Unnamed: 0,clean_text,sentiment_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.5
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.5
2,Food Navigator USA Plant-based meat: Beyond th...,0.5
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.5
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",1.0
5,ROSELINLIN FREE SHIPPING 10,0.5
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.5
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.5
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.5
9,UNITED MileagePlus. Get ready and bon voyage U...,0.5


In [23]:
df[["clean_text", "sentiment_score_test"]].head(10)

Unnamed: 0,clean_text,sentiment_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.5
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.5
2,Food Navigator USA Plant-based meat: Beyond th...,0.5
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.5
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",1.0
5,ROSELINLIN FREE SHIPPING 10,0.5
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.5
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.5
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.5
9,UNITED MileagePlus. Get ready and bon voyage U...,0.5


In [24]:
df["sentiment_score"] = df["clean_text"].apply(sentiment_score)

In [25]:
df["sentiment_score"].value_counts()

sentiment_score
0.5    729
1.0    252
0.0     19
Name: count, dtype: int64

In [26]:
df[["clean_text", "sentiment_score"]].sample(10)

Unnamed: 0,clean_text,sentiment_score
728,WHITE LABS PURE YEAST & FERMENTATION WHITE LAR...,0.5
467,UMCOR SUNDAY Support the sustainable work of U...,1.0
656,SUPPORT A LOCAL CHURCH GIVE THANKS FOR MOM Ove...,1.0
223,IDCITA BUSSE CON RAME BT SEMESTA BOGOR BERPEST...,0.5
522,CITH TH EE Times JANUARY 16 MARCH 31 Analog Ev...,0.5
541,HE HELM $50 OFF YOUR FIRST PAIR,0.5
704,OPEN A HIGH YIELD SAVINGS ACCOUNT Member FDIC ...,0.5
32,Olet the ASHLEY light shine shop now →→,0.5
65,O DOWNLOAD APLIKASINYA 16:57 Tribunnews.com a ...,0.5
237,Discover more biological insights into cancer....,1.0


In [27]:
df.head(20)

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0
5,ROSELINLIN\nFREE SHIPPING\n10,"(970, 250)",ROSELINLIN FREE SHIPPING 10,4.0,3.0,0.6,0.0,0.5,0.5
6,MENARDS\nGIFT CARD\nMENARDS\nGIFT CARDS\nMEWAR...,"(300, 250)",MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,19.0,19.0,0.6,1.0,0.5,0.5
7,"From Hello\nto Help,\n211 is Here\n2.1.1\nGet ...","(300, 250)","From Hello to Help, 211 is Here 2.1.1 Get Conn...",14.0,2.0,0.142857,0.0,0.5,0.5
8,ALL ABOUT\nCIRCUITS\nNeed a Battery\nRefresher...,"(300, 250)",ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,23.0,13.0,0.565217,1.0,0.5,0.5
9,UNITED\nMileagePlus.\nGet ready and bon voyage...,"(300, 250)",UNITED MileagePlus. Get ready and bon voyage U...,21.0,1.0,0.047619,0.0,0.5,0.5


In [28]:
df.to_csv("ads_with_sentiment.csv", index=False)

In [29]:
df.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0


### Semantic Persuation Score

In [30]:
# sentence-transformers/all-MiniLM-L6-v2

In [31]:
df["clean_text"].head(5)

0    Up to $100 off roundtrip flights to Ireland. T...
1    yp The Real ур Yellow Pages Find cheap gas nea...
2    Food Navigator USA Plant-based meat: Beyond th...
3    MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4    YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...
Name: clean_text, dtype: object

In [32]:
df["clean_text"] = df["clean_text"].fillna("")

In [33]:
sbert = SentenceTransformer("all-MiniLM-L6-v2")

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [34]:
persuasion_anchors = [
    "limited time offer",
    "exclusive deal just for you",
    "win big prizes",
    "best price guaranteed",
    "don’t miss this opportunity",
    "start your journey today",
    "high quality product at discount"
]

In [35]:
anchor_embeddings = sbert.encode(
    persuasion_anchors,
    convert_to_tensor=True
)

In [36]:
### Test data frame
test_text = "LIMITED TIME OFFER! BUY NOW AND SAVE BIG"

test_emb = sbert.encode(test_text, convert_to_tensor=True)
similarity = util.cos_sim(test_emb, anchor_embeddings)

similarity

tensor([[0.7249, 0.3643, 0.2568, 0.3477, 0.1331, 0.0988, 0.2707]])

In [37]:
similarity.max().item()

0.7248973846435547

In [38]:
def persuasion_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.0

    emb = sbert.encode(text, convert_to_tensor=True)
    sim = util.cos_sim(emb, anchor_embeddings).max().item()

    return float(np.clip(sim, 0, 1))


In [39]:
df["persuasion_score_test"] = df["clean_text"].head(10).apply(persuasion_score)
df[["clean_text", "persuasion_score_test"]].head(10)

Unnamed: 0,clean_text,persuasion_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.198016
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.222818
2,Food Navigator USA Plant-based meat: Beyond th...,0.151504
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.262872
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",0.17279
5,ROSELINLIN FREE SHIPPING 10,0.294787
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.187448
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.173179
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.105081
9,UNITED MileagePlus. Get ready and bon voyage U...,0.336201


In [40]:
df["persuasion_score"] = df["clean_text"].apply(persuasion_score)

In [41]:
df["persuasion_score"].describe()

count    1000.000000
mean        0.217312
std         0.081536
min         0.029564
25%         0.160739
50%         0.208714
75%         0.262613
max         0.543086
Name: persuasion_score, dtype: float64

In [42]:
df.drop(columns=["persuasion_score_test"], inplace=True)

### CTA Score

In [43]:
df["clean_text"].head(5)

0    Up to $100 off roundtrip flights to Ireland. T...
1    yp The Real ур Yellow Pages Find cheap gas nea...
2    Food Navigator USA Plant-based meat: Beyond th...
3    MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4    YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...
Name: clean_text, dtype: object

In [44]:
df["clean_text"] = df["clean_text"].fillna("")

In [45]:
CTA_WORDS = [
    "buy", "shop", "order", "sign up", "signup", "register",
    "download", "click", "start", "get", "try", "apply",
    "claim", "win", "join", "subscribe", "book",

    "buy now", "shop now", "order now", "start now",
    "get started", "join now", "apply now", "book now",
    "reserve now", "enroll now",

    "free trial", "try free", "start free", "get free",
    "free access", "download free", "free preview",
    "no cost", "risk free", "cancel anytime",

    "save now", "get discount", "unlock savings",
    "best deal", "exclusive offer", "special price",
    "flat off", "price drop", "cashback available",
    "grab the deal",

    "limited time", "ends soon", "last chance",
    "act now", "hurry", "don’t miss",
    "today only", "offer expires", "final call",

    "learn more", "explore", "discover",
    "see how it works", "view details",
    "check it out", "find out", "watch demo",

    "get instant access", "claim your offer",
    "buy now and save", "join free today",
    "limited offer act now", "download now it’s free"
]


In [46]:
## Basic CTA scoring function
def cta_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.0

    text = text.lower()

    hits = sum(1 for word in CTA_WORDS if word in text)

    # normalize: 0 CTAs → 0, 3+ → 1.0
    return min(hits / 3, 1.0)


In [47]:
test_ads = [
    "BUY NOW and save big",
    "Sign up today to get exclusive access",
    "Food Navigator USA",
    "Click here to download now",
    "Searching for Speakers"
]

for ad in test_ads:
    print(ad, "→", cta_score(ad))


BUY NOW and save big → 1.0
Sign up today to get exclusive access → 0.6666666666666666
Food Navigator USA → 0.0
Click here to download now → 0.6666666666666666
Searching for Speakers → 0.0


In [48]:
df["cta_score_test"] = df["clean_text"].head(10).apply(cta_score)
df[["clean_text", "cta_score_test"]].head(10)

Unnamed: 0,clean_text,cta_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,1.0
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.333333
2,Food Navigator USA Plant-based meat: Beyond th...,0.333333
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.0
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",0.666667
5,ROSELINLIN FREE SHIPPING 10,0.0
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.666667
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.333333
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.333333
9,UNITED MileagePlus. Get ready and bon voyage U...,0.666667


In [49]:
df["cta_score"] = df["clean_text"].apply(cta_score)

In [50]:
df["cta_score"].value_counts()

cta_score
0.000000    396
0.333333    356
0.666667    185
1.000000     63
Name: count, dtype: int64

In [51]:
df[
    ["clean_text", "cta_score"]
].sample(10)

Unnamed: 0,clean_text,cta_score
225,trint Speech to Text to Magic Collaborate on v...,1.0
214,JBL Discover the JBL 4309 Studio Monitor Books...,0.666667
418,STOP THE TALL BOOT FLOP ADD BOOT STUFFERS www....,0.333333
65,O DOWNLOAD APLIKASINYA 16:57 Tribunnews.com a ...,0.666667
366,Not liberal. Not conservative. Just local. The...,0.333333
556,ADVANTAGE HOME CURRENT SPECIALS CONTRACTING Th...,0.0
947,FACTORING eagle MADE SIMPLE,0.0
674,SUMMER COURSES Join us this summer for a uniqu...,0.333333
550,GET MORE FROM IPAD Combo Touch - now for iPad ...,1.0
663,Look for challenges. See opportunities. Experi...,0.0


In [52]:
df.drop(columns=["cta_score_test"], inplace=True)

In [53]:
df["clean_text"].head(5)

0    Up to $100 off roundtrip flights to Ireland. T...
1    yp The Real ур Yellow Pages Find cheap gas nea...
2    Food Navigator USA Plant-based meat: Beyond th...
3    MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4    YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...
Name: clean_text, dtype: object

In [54]:
df["clean_text"] = df["clean_text"].fillna("")

In [55]:
examples = [
    "Buy now and save big",
    "Searching for speakers",
    "The company has announced a comprehensive strategy to optimize operational efficiency"
]

for text in examples:
    print(text, "→", textstat.flesch_reading_ease(text))

Buy now and save big → 117.16000000000003
Searching for speakers → 62.79000000000002
The company has announced a comprehensive strategy to optimize operational efficiency → -19.6754545454545


### Readability Score

In [56]:
def readability_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.5  # neutral fallback

    score = textstat.flesch_reading_ease(text)

    return max(0, min(score / 100, 1))

In [57]:
df["readability_score_test"] = df["clean_text"].head(10).apply(readability_score)

In [58]:
df[["clean_text", "readability_score_test"]].head(10)

Unnamed: 0,clean_text,readability_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.699938
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.88905
2,Food Navigator USA Plant-based meat: Beyond th...,0.179683
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.840504
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",0.452624
5,ROSELINLIN FREE SHIPPING 10,0.75875
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.717816
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.970014
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.519229
9,UNITED MileagePlus. Get ready and bon voyage U...,0.7272


In [59]:
df["readability_score"] = df["clean_text"].apply(readability_score)

In [60]:
df["readability_score"].describe()

count    1000.000000
mean        0.593930
std         0.251079
min         0.000000
25%         0.418518
50%         0.616373
75%         0.782097
max         1.000000
Name: readability_score, dtype: float64

In [61]:
df[["clean_text", "readability_score"]].sample(10)

Unnamed: 0,clean_text,readability_score
342,America's Weather Team™ FOX WEATHER WATCH NOW,0.547014
528,THE AVALANCHES LIVE! SEPTEMBER 22 GREEK THEATRE,0.340825
670,inspire and fuel their curiosity with Nat Geo ...,0.584075
147,INTECE The Basics of Conducting an Incident In...,0.138974
997,WIN THIS HOUSE AND START EVERY DAY HERE All to...,0.920625
750,"Seismic Ready to bring speed, simplicity, and ...",0.6753
172,a million little things WEDNESDAY 10/9c abc,0.788729
237,Discover more biological insights into cancer....,0.147698
144,Spring forward with expert resources from the ...,0.105897
804,BIG IDEAS THAT BENEFIT OUR WORLD EXPLORE MORE TCU,0.567


In [62]:
df.drop(columns=["readability_score_test"], inplace=True)

## Final CTR Calculation

In [63]:
required_cols = [
    "sentiment_score",
    "capital_ratio",
    "persuasion_score",
    "cta_score",
    "readability_score"
]

missing = [c for c in required_cols if c not in df.columns]
missing

[]

In [64]:
df[required_cols] = df[required_cols].fillna(0)

In [65]:
## Proxy function
def compute_ctr_proxy(row):
    ctr = (
        0.25 * row["sentiment_score"] +
        0.20 * row["capital_ratio"] +
        0.25 * row["persuasion_score"] +
        0.15 * row["cta_score"] +
        0.15 * row["readability_score"]
    )
    return max(0, min(ctr, 1))


In [66]:
df["ctr_proxy_test"] = df.head(10).apply(compute_ctr_proxy, axis=1)

df[[
        "clean_text",
        "sentiment_score",
        "capital_ratio",
        "persuasion_score",
        "cta_score",
        "readability_score",
        "ctr_proxy_test"
]].head(10)

Unnamed: 0,clean_text,sentiment_score,capital_ratio,persuasion_score,cta_score,readability_score,ctr_proxy_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.5,0.037037,0.198016,1.0,0.699938,0.436902
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.5,0.098039,0.222818,0.333333,0.88905,0.38367
2,Food Navigator USA Plant-based meat: Beyond th...,0.5,0.6,0.151504,0.333333,0.179683,0.359828
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.5,0.407407,0.262872,0.0,0.840504,0.398275
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",1.0,0.277778,0.17279,0.666667,0.452624,0.516646
5,ROSELINLIN FREE SHIPPING 10,0.5,0.6,0.294787,0.0,0.75875,0.432509
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.5,0.6,0.187448,0.666667,0.717816,0.499534
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.5,0.142857,0.173179,0.333333,0.970014,0.392368
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.5,0.565217,0.105081,0.333333,0.519229,0.392198
9,UNITED MileagePlus. Get ready and bon voyage U...,0.5,0.047619,0.336201,0.666667,0.7272,0.427654


In [67]:
df["ctr_proxy"] = df.apply(compute_ctr_proxy, axis=1)

In [68]:
df["ctr_proxy"].describe()

count    1000.000000
mean        0.416045
std         0.102411
min         0.141852
25%         0.349670
50%         0.408393
75%         0.487074
max         0.740362
Name: ctr_proxy, dtype: float64

In [69]:
def ctr_bucket(ctr):
    if ctr >= 0.6:
        return "High"
    elif ctr >= 0.3:
        return "Medium"
    else:
        return "Low"

df["ctr_bucket"] = df["ctr_proxy"].apply(ctr_bucket)

In [70]:
df["ctr_bucket"].value_counts()

ctr_bucket
Medium    836
Low       123
High       41
Name: count, dtype: int64

In [71]:
df.drop(columns=["ctr_proxy_test"], inplace=True)

In [72]:
df

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,1.000000,0.699938,0.436902,Medium
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,0.333333,0.889050,0.383670,Medium
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.600000,0.0,0.5,0.5,0.151504,0.333333,0.179683,0.359828,Medium
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,0.000000,0.840504,0.398275,Medium
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.172790,0.666667,0.452624,0.516646,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Ochrome OS\nDevice management\nmade simple and...,"(300, 600)",Ochrome OS Device management made simple and s...,22.0,1.0,0.045455,0.0,,1.0,0.131270,1.000000,0.724917,0.550646,Medium
996,"Continue to Recipe\n3 Easy Steps:\n1) Click ""C...","(336, 280)","Continue to Recipe 3 Easy Steps: 1) Click ""Con...",19.0,0.0,0.000000,1.0,,1.0,0.089067,0.333333,0.458030,0.390971,Medium
997,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,"(970, 250)",WIN THIS HOUSE AND START EVERY DAY HERE All to...,15.0,8.0,0.533333,0.0,,1.0,0.380776,0.666667,0.920625,0.689954,High
998,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,"(300, 250)",EVIDENT OLYMPUS SZX-AR1 Augmented Reality Micr...,10.0,3.0,0.300000,0.0,,0.5,0.167346,0.000000,0.000000,0.226836,Low


In [73]:
df.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,1.0,0.699938,0.436902,Medium
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,0.333333,0.88905,0.38367,Medium
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5,0.151504,0.333333,0.179683,0.359828,Medium
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,0.0,0.840504,0.398275,Medium
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.17279,0.666667,0.452624,0.516646,Medium


In [74]:
df.tail()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket
995,Ochrome OS\nDevice management\nmade simple and...,"(300, 600)",Ochrome OS Device management made simple and s...,22.0,1.0,0.045455,0.0,,1.0,0.13127,1.0,0.724917,0.550646,Medium
996,"Continue to Recipe\n3 Easy Steps:\n1) Click ""C...","(336, 280)","Continue to Recipe 3 Easy Steps: 1) Click ""Con...",19.0,0.0,0.0,1.0,,1.0,0.089067,0.333333,0.45803,0.390971,Medium
997,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,"(970, 250)",WIN THIS HOUSE AND START EVERY DAY HERE All to...,15.0,8.0,0.533333,0.0,,1.0,0.380776,0.666667,0.920625,0.689954,High
998,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,"(300, 250)",EVIDENT OLYMPUS SZX-AR1 Augmented Reality Micr...,10.0,3.0,0.3,0.0,,0.5,0.167346,0.0,0.0,0.226836,Low
999,Searching for Speakers?\nCompare\nBefore You B...,"(300, 250)",Searching for Speakers? Compare Before You Buy...,13.0,2.0,0.153846,0.0,,0.5,0.253527,0.333333,0.570683,0.354753,Medium


In [75]:
df["ctr_proxy"].value_counts()

ctr_proxy
0.441610    3
0.394136    2
0.384116    2
0.390971    2
0.454150    2
           ..
0.428113    1
0.381535    1
0.368323    1
0.345655    1
0.354753    1
Name: count, Length: 994, dtype: int64

## Dataset Test Final

In [76]:
df.shape

(1000, 14)

In [77]:
df.columns.tolist()

['text',
 'dimensions',
 'clean_text',
 'num_words',
 'num_caps_words',
 'capital_ratio',
 'num_exclamations',
 'sentiment_score_test',
 'sentiment_score',
 'persuasion_score',
 'cta_score',
 'readability_score',
 'ctr_proxy',
 'ctr_bucket']

In [78]:
df.dtypes

text                     object
dimensions               object
clean_text               object
num_words               float64
num_caps_words          float64
capital_ratio           float64
num_exclamations        float64
sentiment_score_test    float64
sentiment_score         float64
persuasion_score        float64
cta_score               float64
readability_score       float64
ctr_proxy               float64
ctr_bucket               object
dtype: object

In [79]:
df.isna().sum()

text                      0
dimensions                0
clean_text                0
num_words                 0
num_caps_words            0
capital_ratio             0
num_exclamations          0
sentiment_score_test    990
sentiment_score           0
persuasion_score          0
cta_score                 0
readability_score         0
ctr_proxy                 0
ctr_bucket                0
dtype: int64

In [80]:
df.sort_values("ctr_proxy", ascending=False)[
    ["clean_text", "ctr_proxy"]
].head(10)


Unnamed: 0,clean_text,ctr_proxy
386,BLACK FRIDAY ZOLUCKY Soft.clouds UP TO 50% OFF...,0.740362
421,LAMPS PLUS Let yourself shine. SHOP NOW,0.695817
997,WIN THIS HOUSE AND START EVERY DAY HERE All to...,0.689954
778,"HEAVY ON PRIZES. WIN UP TO $500,000! $20 PRECI...",0.688459
922,J.ME Black FRIDAY SALE 70% OFF *Free Shipping ...,0.684204
350,"Together, we game. KIOXIA FIOMA EXCERIA SSD EX...",0.681992
107,THE FASTEST & SMARTEST WAY TO SHOP FOR YOUR NE...,0.66823
685,big yachts LET US GET YOU ON THE WATER!,0.661982
550,GET MORE FROM IPAD Combo Touch - now for iPad ...,0.660988
939,★★. MEMORIAL DAY SALE Cp SAVE OVER $1600 4 PIE...,0.66028


In [81]:
df.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,1.0,0.699938,0.436902,Medium
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,0.333333,0.88905,0.38367,Medium
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5,0.151504,0.333333,0.179683,0.359828,Medium
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,0.0,0.840504,0.398275,Medium
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.17279,0.666667,0.452624,0.516646,Medium


## The Catboost Regressor Model

In [82]:
feature_cols = [
    "sentiment_score",
    # "capital_ratio",
    "persuasion_score",
    "cta_score",
    "readability_score"
]

X = df[feature_cols]
Y = df["ctr_proxy"]

In [83]:
X.head()

Unnamed: 0,sentiment_score,persuasion_score,cta_score,readability_score
0,0.5,0.198016,1.0,0.699938
1,0.5,0.222818,0.333333,0.88905
2,0.5,0.151504,0.333333,0.179683
3,0.5,0.262872,0.0,0.840504
4,1.0,0.17279,0.666667,0.452624


In [84]:
Y.head()

0    0.436902
1    0.383670
2    0.359828
3    0.398275
4    0.516646
Name: ctr_proxy, dtype: float64

### train test split

In [85]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42
)

In [86]:
textModel = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42,
    verbose=100
)

In [87]:
textModel.fit(
    X_train,
    Y_train,
    eval_set=(X_test, Y_test),
    use_best_model=True
)

0:	learn: 0.0974938	test: 0.1055711	best: 0.1055711 (0)	total: 161ms	remaining: 1m 20s
100:	learn: 0.0395842	test: 0.0468176	best: 0.0468176 (100)	total: 297ms	remaining: 1.17s
200:	learn: 0.0364798	test: 0.0462062	best: 0.0460184 (160)	total: 415ms	remaining: 618ms
300:	learn: 0.0337684	test: 0.0463390	best: 0.0460184 (160)	total: 538ms	remaining: 356ms
400:	learn: 0.0315363	test: 0.0466534	best: 0.0460184 (160)	total: 668ms	remaining: 165ms
499:	learn: 0.0296697	test: 0.0467857	best: 0.0460184 (160)	total: 812ms	remaining: 0us

bestTest = 0.04601836714
bestIteration = 160

Shrink model to first 161 iterations.


<catboost.core.CatBoostRegressor at 0x214c3ad2ba0>

In [88]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

Y_pred = textModel.predict(X_test)

rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
mae = mean_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

rmse, mae, r2

(np.float64(0.04601836612669249), 0.04000149359687263, 0.8211035478452269)

In [89]:
comparison = df.loc[Y_test.index, ["clean_text"]].copy()
comparison["actual_ctr"] = Y_test
comparison["predicted_ctr"] = Y_pred

comparison.head(10)

Unnamed: 0,clean_text,actual_ctr,predicted_ctr
521,IN THE MIDST OF PLENTY THE NEW FOLK MUSICAL FR...,0.362397,0.336055
737,Young Alfred Home insurance from $35/mo Get Qu...,0.3647,0.44033
740,Anderson Family Heath-Anderson FUNERAL HOMES &...,0.209247,0.197936
660,GROCERY WORKER'S APPRECIATION FUND KJ KENDALL ...,0.312208,0.261328
411,DISCOVER BOTSWANA Africa's Most Exclusive Dest...,0.345605,0.299908
678,S NEWSLETTER THE Get The Star headline Stories...,0.352963,0.378622
626,Canva extend /* The _first_ Canva Developers e...,0.254734,0.32968
513,Take a break from 'the usual' VISIT UPTOWNCHAR...,0.273324,0.296883
859,Decanter Wine Experiences Discover great wines...,0.463879,0.515215
136,★2023★ CHOICE THE OFFICIAL AWARDS Tri State's ...,0.604714,0.544986


## Feature Importance --> Need to put this in PPT

In [90]:
### Feature importance
importance = textModel.get_feature_importance()
fi_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": importance
}).sort_values(by="importance", ascending=False)

fi_df

Unnamed: 0,feature,importance
0,sentiment_score,38.74446
2,cta_score,26.361182
3,readability_score,19.987629
1,persuasion_score,14.906729


### R2 Score

In [91]:
from sklearn.metrics import r2_score

r2 = r2_score(Y_test, Y_pred)
r2

0.8211035478452269

## Save Model

In [92]:
textModel.save_model("catboost_ctr_model.cbm")

# Image Analysis

In [149]:
pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [150]:
import cv2
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import pytesseract

In [101]:
def load_image(path):
    if path is None:
        return None

    img = cv2.imread(path)

    if img is None:
        return None
    else:
        return img

### face detector

In [95]:
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)

### Image features extractor

In [102]:
def extract_img_features(image_path):
    img = load_image(image_path)

    if img is None:
        return {
            "brightness": 0.5,
            "contrast": 0.5,
            "sharpness": 0.5,
            "edge_density": 0.5,
            "face_present": 0
        }
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    brightness = gray.mean() / 255
    contrast = gray.std() / 128
    sharpness = cv2.Laplacian(gray, cv2.CV_64F).var() / 1000
    edges = cv2.Canny(gray, 100, 200)
    edge_density = edges.mean() / 255

    faces = face_cascade.detectMultiScale(gray, 1.3, 5)

    if len(faces) > 0:
        face_present = 1
    else:
        face_present = 0

    return {
        "brightness": np.clip(brightness, 0, 1),
        "contrast": np.clip(contrast, 0, 1),
        "sharpness": np.clip(sharpness, 0, 1),
        "edge_density": np.clip(edge_density, 0, 1),
        "face_present": face_present
    }

### Testing on sample image

In [104]:
image_path = "13.png"
img_features = extract_img_features(image_path)
img_features

{'brightness': np.float64(0.8127021490196078),
 'contrast': np.float64(0.5958701935832448),
 'sharpness': np.float64(1.0),
 'edge_density': np.float64(0.070376),
 'face_present': 0}

## feature extraction of all images

In [106]:
IMAGE_ROOT = r"C:\Users\subha\OneDrive\Desktop\projects\dentsu\images-dataset"

In [107]:
image_paths = []

for root, dirs, files in os.walk(IMAGE_ROOT):
    for file in files:
        if file.lower().endswith(".png"):
            image_paths.append(os.path.join(root, file))

len(image_paths)


302

In [108]:
image_paths[:5]

['C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\1.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\10.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\11.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\12.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\13.png']

## Separating the text dataset and image dataset

In [115]:
df_text = df.copy()

In [117]:
image_paths

['C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\1.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\10.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\11.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\12.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\13.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\14.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\15.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\16.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\2.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\3.png',
 'C:\\Users\\subha\\OneDrive\\Desktop\\projects\\dentsu\\images-dataset\\p1\\1\\4.png',
 'C:\\Users\\subha\\OneDr

In [118]:
df_images = pd.DataFrame({
    "image_path": image_paths
})

df_images.head()

Unnamed: 0,image_path
0,C:\Users\subha\OneDrive\Desktop\projects\dents...
1,C:\Users\subha\OneDrive\Desktop\projects\dents...
2,C:\Users\subha\OneDrive\Desktop\projects\dents...
3,C:\Users\subha\OneDrive\Desktop\projects\dents...
4,C:\Users\subha\OneDrive\Desktop\projects\dents...


In [122]:
image_features = []

for path in tqdm(df_images["image_path"], total=len(df_images)):
    feats = extract_img_features(path)
    image_features.append(feats)

100%|██████████| 302/302 [00:06<00:00, 46.44it/s]


In [124]:
image_feat_df = pd.DataFrame(image_features)

df_images = pd.concat(
    [df_images.reset_index(drop=True), image_feat_df],
    axis=1
)

df_images.head()

Unnamed: 0,image_path,brightness,contrast,sharpness,edge_density,face_present,brightness.1,contrast.1,sharpness.1,edge_density.1,face_present.1
0,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.967216,0.298577,1.0,0.02954,0,0.967216,0.298577,1.0,0.02954,0
1,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.961849,0.335428,0.792032,0.014588,0,0.961849,0.335428,0.792032,0.014588,0
2,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.504698,0.635967,1.0,0.071668,1,0.504698,0.635967,1.0,0.071668,1
3,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.616133,0.645663,0.704462,0.054024,0,0.616133,0.645663,0.704462,0.054024,0
4,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.812702,0.59587,1.0,0.070376,0,0.812702,0.59587,1.0,0.070376,0


In [130]:
df_images = df_images.loc[:, ~df_images.columns.duplicated()]

In [131]:
df_images.head()

Unnamed: 0,image_path,brightness,contrast,sharpness,edge_density,face_present
0,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.967216,0.298577,1.0,0.02954,0
1,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.961849,0.335428,0.792032,0.014588,0
2,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.504698,0.635967,1.0,0.071668,1
3,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.616133,0.645663,0.704462,0.054024,0
4,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.812702,0.59587,1.0,0.070376,0


In [132]:
df_text.shape, df_images.shape

((1000, 14), (302, 6))

In [133]:
df_text.columns

Index(['text', 'dimensions', 'clean_text', 'num_words', 'num_caps_words',
       'capital_ratio', 'num_exclamations', 'sentiment_score_test',
       'sentiment_score', 'persuasion_score', 'cta_score', 'readability_score',
       'ctr_proxy', 'ctr_bucket'],
      dtype='object')

In [135]:
df_images.columns

Index(['image_path', 'brightness', 'contrast', 'sharpness', 'edge_density',
       'face_present'],
      dtype='object')

In [136]:
df_text = df_text.reset_index(drop=True)
df_images = df_images.reset_index(drop=True)

df_text["ad_id"] = df_text.index
df_images["ad_id"] = df_images.index

In [143]:
df_final = df_text.merge(df_images, on="ad_id", how="left")
df_final.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,...,readability_score,ctr_proxy,ctr_bucket,ad_id,image_path,brightness,contrast,sharpness,edge_density,face_present
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,...,0.699938,0.436902,Medium,0,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.967216,0.298577,1.0,0.02954,0.0
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,...,0.88905,0.38367,Medium,1,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.961849,0.335428,0.792032,0.014588,0.0
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5,0.151504,...,0.179683,0.359828,Medium,2,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.504698,0.635967,1.0,0.071668,1.0
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,...,0.840504,0.398275,Medium,3,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.616133,0.645663,0.704462,0.054024,0.0
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.17279,...,0.452624,0.516646,Medium,4,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.812702,0.59587,1.0,0.070376,0.0


In [148]:
df_text_final = df_text.copy()
df_text_final["brightness"]   = 0.5
df_text_final["contrast"]     = 0.5
df_text_final["sharpness"]    = 0.5
df_text_final["edge_density"] = 0.5
df_text_final["face_present"] = 0
df_text_final.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket,ad_id,brightness,contrast,sharpness,edge_density,face_present
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,1.0,0.699938,0.436902,Medium,0,0.5,0.5,0.5,0.5,0
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,0.333333,0.88905,0.38367,Medium,1,0.5,0.5,0.5,0.5,0
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5,0.151504,0.333333,0.179683,0.359828,Medium,2,0.5,0.5,0.5,0.5,0
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,0.0,0.840504,0.398275,Medium,3,0.5,0.5,0.5,0.5,0
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.17279,0.666667,0.452624,0.516646,Medium,4,0.5,0.5,0.5,0.5,0


## OCR text

In [151]:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [152]:
def extract_text_from_image(image_path):
    img = load_image(image_path)

    if img is None:
        return ""

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Optional: improve OCR quality
    gray = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    text = pytesseract.image_to_string(gray)

    return text.strip()


In [153]:
sample_path = image_paths[0]

ocr_text = extract_text_from_image(sample_path)
ocr_text

'Blue Shoes - 25% Off\n\nFree Shipping, Exchanges & Returns\nBlue Shoes at anesshoes.com! Code: BLUE\nwww.anesshoes.comicolors'

In [154]:
clean_ad = clean_ad_text(ocr_text)
clean_ad

'Blue Shoes - 25% Off Free Shipping, Exchanges & Returns Blue Shoes at anesshoes.com! Code: BLUE www.anesshoes.comicolors'

In [157]:
df_text.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket,ad_id
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,1.0,0.699938,0.436902,Medium,0
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,0.333333,0.88905,0.38367,Medium,1
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5,0.151504,0.333333,0.179683,0.359828,Medium,2
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,0.0,0.840504,0.398275,Medium,3
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.17279,0.666667,0.452624,0.516646,Medium,4


In [155]:
clean_ocr_texts = []

for path in tqdm(df_images["image_path"], total=len(df_images)):
    raw_text = extract_text_from_image(path)
    clean_text = clean_ad_text(raw_text)
    clean_ocr_texts.append(clean_text)

100%|██████████| 302/302 [00:54<00:00,  5.59it/s]


In [161]:
df_images["ocr_text"] = clean_ocr_texts
len(df_images)

302

In [164]:
df_images.isna().sum()

image_path      0
brightness      0
contrast        0
sharpness       0
edge_density    0
face_present    0
ad_id           0
ocr_text        0
dtype: int64

In [165]:
df_images[["image_path", "ocr_text"]].head(10)

Unnamed: 0,image_path,ocr_text
0,C:\Users\subha\OneDrive\Desktop\projects\dents...,"Blue Shoes - 25% Off Free Shipping, Exchanges ..."
1,C:\Users\subha\OneDrive\Desktop\projects\dents...,H&M - Leather belt - Black - Ladies £12.99 H&M
2,C:\Users\subha\OneDrive\Desktop\projects\dents...,STAY WARM IN STYLE shop ‘ns seasor’s oLterwear...
3,C:\Users\subha\OneDrive\Desktop\projects\dents...,ine shopping site
4,C:\Users\subha\OneDrive\Desktop\projects\dents...,
5,C:\Users\subha\OneDrive\Desktop\projects\dents...,American Apparel Shopping Site
6,C:\Users\subha\OneDrive\Desktop\projects\dents...,BEAOREDIT. - .
7,C:\Users\subha\OneDrive\Desktop\projects\dents...,WALLETS “apart au irey nt he you poche oo Lge ...
8,C:\Users\subha\OneDrive\Desktop\projects\dents...,Shoes On Sale at Zappos - Zappos.com ‘wwnw.zap...
9,C:\Users\subha\OneDrive\Desktop\projects\dents...,Bellroy Slim Wallets - Shop Our Slim Leather W...


In [166]:
(df_images["ocr_text"].str.len() > 0).sum()

np.int64(279)

In [170]:
df_images.head()

Unnamed: 0,image_path,brightness,contrast,sharpness,edge_density,face_present,ad_id,ocr_text,sentiment_score
0,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.967216,0.298577,1.0,0.02954,0,0,"Blue Shoes - 25% Off Free Shipping, Exchanges ...",1.0
1,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.961849,0.335428,0.792032,0.014588,0,1,H&M - Leather belt - Black - Ladies £12.99 H&M,0.5
2,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.504698,0.635967,1.0,0.071668,1,2,STAY WARM IN STYLE shop ‘ns seasor’s oLterwear...,0.5
3,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.616133,0.645663,0.704462,0.054024,0,3,ine shopping site,0.5
4,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.812702,0.59587,1.0,0.070376,0,4,,0.5


In [172]:
def capital_ratio_only(text):
    words = text.split()
    if not words:
        return 0.0
    caps = sum(1 for w in words if w.isupper() and len(w) > 1)
    return min(caps / len(words), 0.6)

In [173]:
df_images["sentiment_score"]   = df_images["ocr_text"].apply(sentiment_score)
df_images["capital_ratio"]     = df_images["ocr_text"].apply(capital_ratio_only)
df_images["persuasion_score"]  = df_images["ocr_text"].apply(persuasion_score)
df_images["cta_score"]         = df_images["ocr_text"].apply(cta_score)
df_images["readability_score"] = df_images["ocr_text"].apply(readability_score)

In [177]:
df_images.columns

Index(['image_path', 'brightness', 'contrast', 'sharpness', 'edge_density',
       'face_present', 'ad_id', 'ocr_text', 'sentiment_score', 'capital_ratio',
       'persuasion_score', 'cta_score', 'readability_score'],
      dtype='object')

In [178]:
df_text.columns

Index(['text', 'dimensions', 'clean_text', 'num_words', 'num_caps_words',
       'capital_ratio', 'num_exclamations', 'sentiment_score_test',
       'sentiment_score', 'persuasion_score', 'cta_score', 'readability_score',
       'ctr_proxy', 'ctr_bucket', 'ad_id'],
      dtype='object')

In [179]:
text_ctr = (
    0.25 * df_images["sentiment_score"] +
    0.20 * df_images["capital_ratio"] +
    0.25 * df_images["persuasion_score"] +
    0.15 * df_images["cta_score"] +
    0.15 * df_images["readability_score"]
)
image_ctr = (
    0.25 * df_images["brightness"] +
    0.20 * df_images["contrast"] +
    0.20 * df_images["sharpness"] +
    0.15 * df_images["edge_density"] +
    0.20 * df_images["face_present"]
)

In [180]:
df_images["ctr_proxy"] = (
    0.6 * text_ctr +
    0.4 * image_ctr
)

In [181]:
df_images["ctr_proxy"] = df_images["ctr_proxy"].clip(0.0, 1.0)

In [182]:
df_images["ctr_proxy"].describe()

count    302.000000
mean       0.422375
std        0.059188
min        0.264921
25%        0.385695
50%        0.421950
75%        0.461416
max        0.616602
Name: ctr_proxy, dtype: float64

In [183]:
has_text = (df_images["ocr_text"].str.len() > 0).astype(int)

df_images["ctr_proxy"] = (
    has_text * (0.6 * text_ctr + 0.4 * image_ctr) +
    (1 - has_text) * image_ctr
)

In [184]:
df_images.head()

Unnamed: 0,image_path,brightness,contrast,sharpness,edge_density,face_present,ad_id,ocr_text,sentiment_score,capital_ratio,persuasion_score,cta_score,readability_score,ctr_proxy
0,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.967216,0.298577,1.0,0.02954,0,0,"Blue Shoes - 25% Off Free Shipping, Exchanges ...",1.0,0.058824,0.339537,0.0,0.807825,0.483074
1,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.961849,0.335428,0.792032,0.014588,0,1,H&M - Leather belt - Black - Ladies £12.99 H&M,0.5,0.2,0.337148,0.0,0.909586,0.418692
2,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.504698,0.635967,1.0,0.071668,1,2,STAY WARM IN STYLE shop ‘ns seasor’s oLterwear...,0.5,0.6,0.19403,0.333333,0.655555,0.530752
3,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.616133,0.645663,0.704462,0.054024,0,3,ine shopping site,0.5,0.0,0.376853,0.333333,0.9099,0.416284
4,C:\Users\subha\OneDrive\Desktop\projects\dents...,0.812702,0.59587,1.0,0.070376,0,4,,0.5,0.0,0.0,0.0,0.5,0.532906


In [185]:
df_text.columns

Index(['text', 'dimensions', 'clean_text', 'num_words', 'num_caps_words',
       'capital_ratio', 'num_exclamations', 'sentiment_score_test',
       'sentiment_score', 'persuasion_score', 'cta_score', 'readability_score',
       'ctr_proxy', 'ctr_bucket', 'ad_id'],
      dtype='object')

In [186]:
df_images.columns

Index(['image_path', 'brightness', 'contrast', 'sharpness', 'edge_density',
       'face_present', 'ad_id', 'ocr_text', 'sentiment_score', 'capital_ratio',
       'persuasion_score', 'cta_score', 'readability_score', 'ctr_proxy'],
      dtype='object')

### Features listing

In [188]:
feature_cols = [
    # TEXT FEATURES
    "sentiment_score",
    "capital_ratio",
    "persuasion_score",
    "cta_score",
    "readability_score",

    # IMAGE FEATURES
    "brightness",
    "contrast",
    "sharpness",
    "edge_density",
    "face_present"
]

In [189]:
df_text_final = df_text.copy()

df_text_final["brightness"]   = 0.5
df_text_final["contrast"]     = 0.5
df_text_final["sharpness"]    = 0.5
df_text_final["edge_density"] = 0.5
df_text_final["face_present"] = 0

In [190]:
df_images_final = df_images.copy()

In [191]:
df_text_final  = df_text_final[feature_cols + ["ctr_proxy"]]
df_images_final = df_images_final[feature_cols + ["ctr_proxy"]]

In [192]:
df_text_final.head()

Unnamed: 0,sentiment_score,capital_ratio,persuasion_score,cta_score,readability_score,brightness,contrast,sharpness,edge_density,face_present,ctr_proxy
0,0.5,0.037037,0.198016,1.0,0.699938,0.5,0.5,0.5,0.5,0,0.436902
1,0.5,0.098039,0.222818,0.333333,0.88905,0.5,0.5,0.5,0.5,0,0.38367
2,0.5,0.6,0.151504,0.333333,0.179683,0.5,0.5,0.5,0.5,0,0.359828
3,0.5,0.407407,0.262872,0.0,0.840504,0.5,0.5,0.5,0.5,0,0.398275
4,1.0,0.277778,0.17279,0.666667,0.452624,0.5,0.5,0.5,0.5,0,0.516646


In [193]:
df_images_final.head()

Unnamed: 0,sentiment_score,capital_ratio,persuasion_score,cta_score,readability_score,brightness,contrast,sharpness,edge_density,face_present,ctr_proxy
0,1.0,0.058824,0.339537,0.0,0.807825,0.967216,0.298577,1.0,0.02954,0,0.483074
1,0.5,0.2,0.337148,0.0,0.909586,0.961849,0.335428,0.792032,0.014588,0,0.418692
2,0.5,0.6,0.19403,0.333333,0.655555,0.504698,0.635967,1.0,0.071668,1,0.530752
3,0.5,0.0,0.376853,0.333333,0.9099,0.616133,0.645663,0.704462,0.054024,0,0.416284
4,0.5,0.0,0.0,0.0,0.5,0.812702,0.59587,1.0,0.070376,0,0.532906


In [194]:
len(df_images_final)

302

In [196]:
df_final = pd.concat(
    [df_text_final, df_images_final],
    axis=0,
    ignore_index=True
)
len(df_final)

1302

In [197]:
df_final.head()

Unnamed: 0,sentiment_score,capital_ratio,persuasion_score,cta_score,readability_score,brightness,contrast,sharpness,edge_density,face_present,ctr_proxy
0,0.5,0.037037,0.198016,1.0,0.699938,0.5,0.5,0.5,0.5,0,0.436902
1,0.5,0.098039,0.222818,0.333333,0.88905,0.5,0.5,0.5,0.5,0,0.38367
2,0.5,0.6,0.151504,0.333333,0.179683,0.5,0.5,0.5,0.5,0,0.359828
3,0.5,0.407407,0.262872,0.0,0.840504,0.5,0.5,0.5,0.5,0,0.398275
4,1.0,0.277778,0.17279,0.666667,0.452624,0.5,0.5,0.5,0.5,0,0.516646


In [198]:
df_final.isna().sum()

sentiment_score      0
capital_ratio        0
persuasion_score     0
cta_score            0
readability_score    0
brightness           0
contrast             0
sharpness            0
edge_density         0
face_present         0
ctr_proxy            0
dtype: int64

In [199]:
df_final.describe()

Unnamed: 0,sentiment_score,capital_ratio,persuasion_score,cta_score,readability_score,brightness,contrast,sharpness,edge_density,face_present,ctr_proxy
count,1302.0,1302.0,1302.0,1302.0,1302.0,1302.0,1302.0,1302.0,1302.0,1302.0,1302.0
mean,0.62404,0.301146,0.22426,0.291347,0.596568,0.593154,0.467519,0.602635,0.392708,0.009985,0.420886
std,0.233144,0.239299,0.08772,0.302203,0.244707,0.179091,0.108725,0.196943,0.195513,0.099461,0.094521
min,0.0,0.0,0.0,0.0,0.0,0.362839,0.121375,0.430266,0.012032,0.0,0.141852
25%,0.5,0.0625,0.165743,0.0,0.44405,0.5,0.5,0.5,0.5,0.0,0.36384
50%,0.5,0.272727,0.218478,0.333333,0.618161,0.5,0.5,0.5,0.5,0.0,0.418904
75%,1.0,0.6,0.278134,0.333333,0.778107,0.5,0.5,0.5,0.5,0.0,0.482296
max,1.0,0.6,0.543086,1.0,1.0,0.993277,0.882242,1.0,0.5,1.0,0.740362


## Feeding Dataset

In [200]:
X = df_final[feature_cols]
Y = df_final["ctr_proxy"]

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [201]:
multiModel = CatBoostRegressor(
    iterations=600,
    depth=6,
    learning_rate=0.05,
    loss_function="RMSE",
    random_seed=42,
    verbose=100
)

multiModel.fit(X_train, Y_train, eval_set=(X_test, Y_test))

0:	learn: 0.0887992	test: 0.1014983	best: 0.1014983 (0)	total: 2.39ms	remaining: 1.43s
100:	learn: 0.0123020	test: 0.0185522	best: 0.0185522 (100)	total: 156ms	remaining: 769ms
200:	learn: 0.0062012	test: 0.0124502	best: 0.0124502 (200)	total: 317ms	remaining: 629ms
300:	learn: 0.0044122	test: 0.0109582	best: 0.0109582 (300)	total: 471ms	remaining: 468ms
400:	learn: 0.0035187	test: 0.0103892	best: 0.0103892 (400)	total: 631ms	remaining: 313ms
500:	learn: 0.0029135	test: 0.0100691	best: 0.0100683 (499)	total: 785ms	remaining: 155ms
599:	learn: 0.0024649	test: 0.0098531	best: 0.0098531 (599)	total: 942ms	remaining: 0us

bestTest = 0.009853088267
bestIteration = 599



<catboost.core.CatBoostRegressor at 0x214f1398e10>

In [203]:
multiModel.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,sentiment_score,29.870381
1,cta_score,18.742893
2,capital_ratio,18.668201
3,readability_score,15.176948
4,persuasion_score,7.540891
5,sharpness,4.337745
6,brightness,2.733982
7,edge_density,1.923563
8,face_present,0.528667
9,contrast,0.476727


In [206]:
Y_pred = multiModel.predict(X_test)
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
rmse

np.float64(0.009853087913627621)

In [207]:
r2 = r2_score(Y_test, Y_pred)
r2

0.9911299454947338

In [209]:
multiModel.save_model("catboost_ctr_model_v2.cbm")