In [6]:
!pip install transformers

Collecting transformers
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.3.4-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.2-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub<2.0,>=1.3.0->transformers)
  Downloading hf_xet-1.2.0-cp37-abi3-win_amd64.whl.metadata (5.0 kB)
Downloading transformers-5.0.0-py3-none-any.whl (10.1 MB)
   ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.1 MB ? eta -:--:--
   -- ------------------------------------- 0.5/10.1 MB 952.2 kB/s eta 0:00:11
   --- -------------------

In [53]:
!pip install sentence-transformers --quiet

In [85]:
!pip install textstat --quiet

In [86]:
import re
import numpy as np
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import textstat

### Data Preprocessing

In [22]:
df = pd.read_csv("ads_creative_text_sample.csv")
df

Unnamed: 0,text,dimensions
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)"
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)"
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)"
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)"
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)"
...,...,...
995,Ochrome OS\nDevice management\nmade simple and...,"(300, 600)"
996,"Continue to Recipe\n3 Easy Steps:\n1) Click ""C...","(336, 280)"
997,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,"(970, 250)"
998,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,"(300, 250)"


In [23]:
def clean_ad_text(text):
    text = text.replace("\n", " ")
    text = " ".join(text.split()) 
    return text


In [24]:
df["clean_text"] = df["text"].apply(clean_ad_text)
df[["text", "clean_text"]].head(5)

Unnamed: 0,text,clean_text
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,Up to $100 off roundtrip flights to Ireland. T...
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,yp The Real ур Yellow Pages Find cheap gas nea...
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,Food Navigator USA Plant-based meat: Beyond th...
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos..."


In [25]:
def capital_features(text):
    if not isinstance(text, str):
        text = ""

    words = text.split()
    num_words = len(words)
    caps_words = sum(1 for w in words if w.isupper() and len(w) > 1)
    capital_ratio = caps_words / num_words if num_words else 0
    num_exclamations = text.count("!")

    return pd.Series({
        "num_words": num_words,
        "num_caps_words": caps_words,
        "capital_ratio": min(capital_ratio, 0.6),
        "num_exclamations": num_exclamations
    })


In [27]:
cap_df = df["clean_text"].apply(capital_features)
cap_df

Unnamed: 0,num_words,num_caps_words,capital_ratio,num_exclamations
0,27.0,1.0,0.037037,0.0
1,51.0,5.0,0.098039,1.0
2,23.0,14.0,0.600000,0.0
3,27.0,11.0,0.407407,0.0
4,18.0,5.0,0.277778,2.0
...,...,...,...,...
995,22.0,1.0,0.045455,0.0
996,19.0,0.0,0.000000,1.0
997,15.0,8.0,0.533333,0.0
998,10.0,3.0,0.300000,0.0


In [28]:
df = pd.concat([df, cap_df], axis=1)

In [29]:
df.columns

Index(['text', 'dimensions', 'clean_text', 'num_words', 'num_caps_words',
       'capital_ratio', 'num_exclamations'],
      dtype='object')

In [30]:
df.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0


In [31]:
df.isna().sum()

text                0
dimensions          0
clean_text          0
num_words           0
num_caps_words      0
capital_ratio       0
num_exclamations    0
dtype: int64

### Cleaning of Clean_Text

In [33]:
df["clean_text"] = df["clean_text"].fillna("")

In [34]:
df["clean_text"].head(5)

0    Up to $100 off roundtrip flights to Ireland. T...
1    yp The Real ур Yellow Pages Find cheap gas nea...
2    Food Navigator USA Plant-based meat: Beyond th...
3    MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4    YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...
Name: clean_text, dtype: object

### Twitter Roberta Sentiment Model

In [35]:
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment"
)

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [38]:
sentiment_pipe("LIMITED TIME OFFER! BUY ONE GET ONE FREE")

[{'label': 'LABEL_2', 'score': 0.7480954527854919}]

In [39]:
sentiment_pipe("Buy one Get one free")

[{'label': 'LABEL_1', 'score': 0.6027004718780518}]

In [41]:
def sentiment_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.5  # neutral fallback

    result = sentiment_pipe(text)[0]["label"]

    mapping = {
        "LABEL_2": 1.0,  # positive
        "LABEL_1": 0.5,  # neutral
        "LABEL_0": 0.0   # negative
    }

    return mapping[result]


In [42]:
df["sentiment_score_test"] = df["clean_text"].head(10).apply(sentiment_score)

In [43]:
df[["clean_text", "sentiment_score_test"]].head(10)

Unnamed: 0,clean_text,sentiment_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.5
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.5
2,Food Navigator USA Plant-based meat: Beyond th...,0.5
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.5
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",1.0
5,ROSELINLIN FREE SHIPPING 10,0.5
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.5
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.5
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.5
9,UNITED MileagePlus. Get ready and bon voyage U...,0.5


In [44]:
df[["clean_text", "sentiment_score_test"]].head(10)

Unnamed: 0,clean_text,sentiment_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.5
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.5
2,Food Navigator USA Plant-based meat: Beyond th...,0.5
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.5
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",1.0
5,ROSELINLIN FREE SHIPPING 10,0.5
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.5
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.5
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.5
9,UNITED MileagePlus. Get ready and bon voyage U...,0.5


In [45]:
df["sentiment_score"] = df["clean_text"].apply(sentiment_score)

In [46]:
df["sentiment_score"].value_counts()

sentiment_score
0.5    729
1.0    252
0.0     19
Name: count, dtype: int64

In [48]:
df[["clean_text", "sentiment_score"]].sample(10)

Unnamed: 0,clean_text,sentiment_score
178,INTERACT WITH GQ BE A PART OF GOCOMMUNITY JOIN...,0.5
866,MARS 4K A 4K Wireless Video Transmission Syste...,0.5
221,goodness Celebrating all that's good. PG Spons...,1.0
907,Ponce Bank OF BESTTH BRONX PRESENTED BY PONCE ...,0.5
153,FANDANGO F FANDANGO we love movies F FANDANGO ...,1.0
718,FIND RARE PARTS THE ONLY EBAY STORE WHERE YOU ...,0.5
959,A mortgage.net Today's Refinance Rate 1.997 %*...,0.5
878,WHAT'S YOUR NEXT MOVE? Create your own headlin...,0.5
508,"HERE COMES Santa Claus Friday, December 3 News...",0.5
513,Take a break from 'the usual' VISIT UPTOWNCHAR...,0.5


In [49]:
df.to_csv("ads_with_sentiment.csv", index=False)

In [50]:
df.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0


### Semantic Persuation Score

In [None]:
# sentence-transformers/all-MiniLM-L6-v2

In [51]:
df["clean_text"].head(5)

0    Up to $100 off roundtrip flights to Ireland. T...
1    yp The Real ур Yellow Pages Find cheap gas nea...
2    Food Navigator USA Plant-based meat: Beyond th...
3    MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4    YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...
Name: clean_text, dtype: object

In [52]:
df["clean_text"] = df["clean_text"].fillna("")

In [55]:
sbert = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [56]:
persuasion_anchors = [
    "limited time offer",
    "exclusive deal just for you",
    "win big prizes",
    "best price guaranteed",
    "don’t miss this opportunity",
    "start your journey today",
    "high quality product at discount"
]

In [57]:
anchor_embeddings = sbert.encode(
    persuasion_anchors,
    convert_to_tensor=True
)

In [58]:
### Test data frame
test_text = "LIMITED TIME OFFER! BUY NOW AND SAVE BIG"

test_emb = sbert.encode(test_text, convert_to_tensor=True)
similarity = util.cos_sim(test_emb, anchor_embeddings)

similarity

tensor([[0.7249, 0.3643, 0.2568, 0.3477, 0.1331, 0.0988, 0.2707]])

In [59]:
similarity.max().item()

0.7248973846435547

In [60]:
def persuasion_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.0

    emb = sbert.encode(text, convert_to_tensor=True)
    sim = util.cos_sim(emb, anchor_embeddings).max().item()

    return float(np.clip(sim, 0, 1))


In [64]:
df["persuasion_score_test"] = df["clean_text"].head(10).apply(persuasion_score)
df[["clean_text", "persuasion_score_test"]].head(10)

Unnamed: 0,clean_text,persuasion_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.198016
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.222818
2,Food Navigator USA Plant-based meat: Beyond th...,0.151504
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.262872
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",0.17279
5,ROSELINLIN FREE SHIPPING 10,0.294787
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.187448
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.173179
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.105081
9,UNITED MileagePlus. Get ready and bon voyage U...,0.336201


In [65]:
df["persuasion_score"] = df["clean_text"].apply(persuasion_score)

In [66]:
df["persuasion_score"].describe()

count    1000.000000
mean        0.217312
std         0.081536
min         0.029564
25%         0.160739
50%         0.208714
75%         0.262613
max         0.543086
Name: persuasion_score, dtype: float64

In [67]:
df.drop(columns=["persuasion_score_test"], inplace=True)

### CTA Score

In [69]:
df["clean_text"].head(5)

0    Up to $100 off roundtrip flights to Ireland. T...
1    yp The Real ур Yellow Pages Find cheap gas nea...
2    Food Navigator USA Plant-based meat: Beyond th...
3    MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4    YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...
Name: clean_text, dtype: object

In [70]:
df["clean_text"] = df["clean_text"].fillna("")

In [71]:
CTA_WORDS = [
    "buy", "shop", "order", "sign up", "signup", "register",
    "download", "click", "start", "get", "try", "apply",
    "claim", "win", "join", "subscribe", "book",

    "buy now", "shop now", "order now", "start now",
    "get started", "join now", "apply now", "book now",
    "reserve now", "enroll now",

    "free trial", "try free", "start free", "get free",
    "free access", "download free", "free preview",
    "no cost", "risk free", "cancel anytime",

    "save now", "get discount", "unlock savings",
    "best deal", "exclusive offer", "special price",
    "flat off", "price drop", "cashback available",
    "grab the deal",

    "limited time", "ends soon", "last chance",
    "act now", "hurry", "don’t miss",
    "today only", "offer expires", "final call",

    "learn more", "explore", "discover",
    "see how it works", "view details",
    "check it out", "find out", "watch demo",

    "get instant access", "claim your offer",
    "buy now and save", "join free today",
    "limited offer act now", "download now it’s free"
]


In [73]:
## Basic CTA scoring function
def cta_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.0

    text = text.lower()

    hits = sum(1 for word in CTA_WORDS if word in text)

    # normalize: 0 CTAs → 0, 3+ → 1.0
    return min(hits / 3, 1.0)


In [74]:
test_ads = [
    "BUY NOW and save big",
    "Sign up today to get exclusive access",
    "Food Navigator USA",
    "Click here to download now",
    "Searching for Speakers"
]

for ad in test_ads:
    print(ad, "→", cta_score(ad))


BUY NOW and save big → 1.0
Sign up today to get exclusive access → 0.6666666666666666
Food Navigator USA → 0.0
Click here to download now → 0.6666666666666666
Searching for Speakers → 0.0


In [76]:
df["cta_score_test"] = df["clean_text"].head(10).apply(cta_score)
df[["clean_text", "cta_score_test"]].head(10)

Unnamed: 0,clean_text,cta_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,1.0
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.333333
2,Food Navigator USA Plant-based meat: Beyond th...,0.333333
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.0
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",0.666667
5,ROSELINLIN FREE SHIPPING 10,0.0
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.666667
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.333333
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.333333
9,UNITED MileagePlus. Get ready and bon voyage U...,0.666667


In [77]:
df["cta_score"] = df["clean_text"].apply(cta_score)

In [79]:
df["cta_score"].value_counts()

cta_score
0.000000    396
0.333333    356
0.666667    185
1.000000     63
Name: count, dtype: int64

In [81]:
df[
    ["clean_text", "cta_score"]
].sample(10)

Unnamed: 0,clean_text,cta_score
504,Take a break from the usual PLAN YOUR VISIT UP...,0.0
997,WIN THIS HOUSE AND START EVERY DAY HERE All to...,0.666667
536,FCW INSIDER CHAT Interviews with the 2023 Fede...,0.333333
858,"Stay productive anywhere, anytime. Boot up in ...",0.333333
475,6 KOTAKU AUSTRALIA PODCAST With David and Ruby...,0.0
408,OCHEN TVIS COCOUNTINL 1798 CHENANGO COUNTY NOW...,0.333333
64,WHAT'S YOUR NEXT MOVE? Create your own headlin...,0.0
580,FICC Fayetteville Technical Community College ...,0.333333
969,WIDE SELECTION OF PRE-OWNED BIKES LOW USED BIK...,0.0
572,ONE VISE TO RULE THEM ALL MULTI-AXIS BENCH-MOU...,0.0


In [82]:
df.drop(columns=["cta_score_test"], inplace=True)

In [83]:
df["clean_text"].head(5)

0    Up to $100 off roundtrip flights to Ireland. T...
1    yp The Real ур Yellow Pages Find cheap gas nea...
2    Food Navigator USA Plant-based meat: Beyond th...
3    MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...
4    YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...
Name: clean_text, dtype: object

In [84]:
df["clean_text"] = df["clean_text"].fillna("")

In [87]:
examples = [
    "Buy now and save big",
    "Searching for speakers",
    "The company has announced a comprehensive strategy to optimize operational efficiency"
]

for text in examples:
    print(text, "→", textstat.flesch_reading_ease(text))

Buy now and save big → 117.16000000000003
Searching for speakers → 62.79000000000002
The company has announced a comprehensive strategy to optimize operational efficiency → -19.6754545454545


### Readability Score

In [88]:
def readability_score(text):
    if not isinstance(text, str) or text.strip() == "":
        return 0.5  # neutral fallback

    score = textstat.flesch_reading_ease(text)

    return max(0, min(score / 100, 1))

In [89]:
df["readability_score_test"] = df["clean_text"].head(10).apply(readability_score)

In [90]:
df[["clean_text", "readability_score_test"]].head(10)

Unnamed: 0,clean_text,readability_score_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.699938
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.88905
2,Food Navigator USA Plant-based meat: Beyond th...,0.179683
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.840504
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",0.452624
5,ROSELINLIN FREE SHIPPING 10,0.75875
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.717816
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.970014
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.519229
9,UNITED MileagePlus. Get ready and bon voyage U...,0.7272


In [91]:
df["readability_score"] = df["clean_text"].apply(readability_score)

In [92]:
df["readability_score"].describe()

count    1000.000000
mean        0.593930
std         0.251079
min         0.000000
25%         0.418518
50%         0.616373
75%         0.782097
max         1.000000
Name: readability_score, dtype: float64

In [93]:
df[["clean_text", "readability_score"]].sample(10)

Unnamed: 0,clean_text,readability_score
213,ASHLEY let the light shine shop now →,1.0
941,Essential news from around the world Read More,0.8239
35,THE YAMAHA GET OUT AND RIDE SALES EVENT WOLVER...,0.71065
34,I I Verified Coupons Available SEE CODES Capit...,0.375452
443,THIS SEASON'S OUTSTANDING HAT-TRICK. 5.99% P.A...,0.774164
362,the spruce. cleaning Meet Our Print Issue! GET...,0.9178
740,Anderson Family Heath-Anderson FUNERAL HOMES &...,0.0
829,Are your cholesterol numbers creeping up? You ...,0.41728
170,NEW YORK RANGERS NHLSHOP.COM AF Fanatics Exper...,0.59745
543,CBTnuggets On-demand IT training Make IT happen,0.547014


In [94]:
df.drop(columns=["readability_score_test"], inplace=True)

## Final CTR Calculation

In [95]:
required_cols = [
    "sentiment_score",
    "capital_ratio",
    "persuasion_score",
    "cta_score",
    "readability_score"
]

missing = [c for c in required_cols if c not in df.columns]
missing

[]

In [96]:
df[required_cols] = df[required_cols].fillna(0)

In [97]:
## Proxy function
def compute_ctr_proxy(row):
    ctr = (
        0.25 * row["sentiment_score"] +
        0.20 * row["capital_ratio"] +
        0.25 * row["persuasion_score"] +
        0.15 * row["cta_score"] +
        0.15 * row["readability_score"]
    )
    return max(0, min(ctr, 1))


In [98]:
df["ctr_proxy_test"] = df.head(10).apply(compute_ctr_proxy, axis=1)

df[[
        "clean_text",
        "sentiment_score",
        "capital_ratio",
        "persuasion_score",
        "cta_score",
        "readability_score",
        "ctr_proxy_test"
]].head(10)

Unnamed: 0,clean_text,sentiment_score,capital_ratio,persuasion_score,cta_score,readability_score,ctr_proxy_test
0,Up to $100 off roundtrip flights to Ireland. T...,0.5,0.037037,0.198016,1.0,0.699938,0.436902
1,yp The Real ур Yellow Pages Find cheap gas nea...,0.5,0.098039,0.222818,0.333333,0.88905,0.38367
2,Food Navigator USA Plant-based meat: Beyond th...,0.5,0.6,0.151504,0.333333,0.179683,0.359828
3,MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,0.5,0.407407,0.262872,0.0,0.840504,0.398275
4,"YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",1.0,0.277778,0.17279,0.666667,0.452624,0.516646
5,ROSELINLIN FREE SHIPPING 10,0.5,0.6,0.294787,0.0,0.75875,0.432509
6,MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,0.5,0.6,0.187448,0.666667,0.717816,0.499534
7,"From Hello to Help, 211 is Here 2.1.1 Get Conn...",0.5,0.142857,0.173179,0.333333,0.970014,0.392368
8,ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,0.5,0.565217,0.105081,0.333333,0.519229,0.392198
9,UNITED MileagePlus. Get ready and bon voyage U...,0.5,0.047619,0.336201,0.666667,0.7272,0.427654


In [99]:
df["ctr_proxy"] = df.apply(compute_ctr_proxy, axis=1)

In [100]:
df["ctr_proxy"].describe()

count    1000.000000
mean        0.416045
std         0.102411
min         0.141852
25%         0.349670
50%         0.408393
75%         0.487074
max         0.740362
Name: ctr_proxy, dtype: float64

In [101]:
def ctr_bucket(ctr):
    if ctr >= 0.6:
        return "High"
    elif ctr >= 0.3:
        return "Medium"
    else:
        return "Low"

df["ctr_bucket"] = df["ctr_proxy"].apply(ctr_bucket)

In [102]:
df["ctr_bucket"].value_counts()

ctr_bucket
Medium    836
Low       123
High       41
Name: count, dtype: int64

In [103]:
df.drop(columns=["ctr_proxy_test"], inplace=True)

In [104]:
df

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,1.000000,0.699938,0.436902,Medium
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,0.333333,0.889050,0.383670,Medium
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.600000,0.0,0.5,0.5,0.151504,0.333333,0.179683,0.359828,Medium
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,0.000000,0.840504,0.398275,Medium
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.172790,0.666667,0.452624,0.516646,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Ochrome OS\nDevice management\nmade simple and...,"(300, 600)",Ochrome OS Device management made simple and s...,22.0,1.0,0.045455,0.0,,1.0,0.131270,1.000000,0.724917,0.550646,Medium
996,"Continue to Recipe\n3 Easy Steps:\n1) Click ""C...","(336, 280)","Continue to Recipe 3 Easy Steps: 1) Click ""Con...",19.0,0.0,0.000000,1.0,,1.0,0.089067,0.333333,0.458030,0.390971,Medium
997,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,"(970, 250)",WIN THIS HOUSE AND START EVERY DAY HERE All to...,15.0,8.0,0.533333,0.0,,1.0,0.380776,0.666667,0.920625,0.689954,High
998,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,"(300, 250)",EVIDENT OLYMPUS SZX-AR1 Augmented Reality Micr...,10.0,3.0,0.300000,0.0,,0.5,0.167346,0.000000,0.000000,0.226836,Low


In [105]:
df.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,1.0,0.699938,0.436902,Medium
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,0.333333,0.88905,0.38367,Medium
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5,0.151504,0.333333,0.179683,0.359828,Medium
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,0.0,0.840504,0.398275,Medium
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.17279,0.666667,0.452624,0.516646,Medium


In [106]:
df.tail()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket
995,Ochrome OS\nDevice management\nmade simple and...,"(300, 600)",Ochrome OS Device management made simple and s...,22.0,1.0,0.045455,0.0,,1.0,0.13127,1.0,0.724917,0.550646,Medium
996,"Continue to Recipe\n3 Easy Steps:\n1) Click ""C...","(336, 280)","Continue to Recipe 3 Easy Steps: 1) Click ""Con...",19.0,0.0,0.0,1.0,,1.0,0.089067,0.333333,0.45803,0.390971,Medium
997,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,"(970, 250)",WIN THIS HOUSE AND START EVERY DAY HERE All to...,15.0,8.0,0.533333,0.0,,1.0,0.380776,0.666667,0.920625,0.689954,High
998,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,"(300, 250)",EVIDENT OLYMPUS SZX-AR1 Augmented Reality Micr...,10.0,3.0,0.3,0.0,,0.5,0.167346,0.0,0.0,0.226836,Low
999,Searching for Speakers?\nCompare\nBefore You B...,"(300, 250)",Searching for Speakers? Compare Before You Buy...,13.0,2.0,0.153846,0.0,,0.5,0.253527,0.333333,0.570683,0.354753,Medium


In [107]:
df.head()

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,1.0,0.699938,0.436902,Medium
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,0.333333,0.88905,0.38367,Medium
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5,0.151504,0.333333,0.179683,0.359828,Medium
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,0.0,0.840504,0.398275,Medium
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.17279,0.666667,0.452624,0.516646,Medium


In [108]:
df.head(20)

Unnamed: 0,text,dimensions,clean_text,num_words,num_caps_words,capital_ratio,num_exclamations,sentiment_score_test,sentiment_score,persuasion_score,cta_score,readability_score,ctr_proxy,ctr_bucket
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",Up to $100 off roundtrip flights to Ireland. T...,27.0,1.0,0.037037,0.0,0.5,0.5,0.198016,1.0,0.699938,0.436902,Medium
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp The Real ур Yellow Pages Find cheap gas nea...,51.0,5.0,0.098039,1.0,0.5,0.5,0.222818,0.333333,0.88905,0.38367,Medium
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",Food Navigator USA Plant-based meat: Beyond th...,23.0,14.0,0.6,0.0,0.5,0.5,0.151504,0.333333,0.179683,0.359828,Medium
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",MONSTROUS AIRFLOW $20-$23 DURAMAX banks +27% B...,27.0,11.0,0.407407,0.0,0.5,0.5,0.262872,0.0,0.840504,0.398275,Medium
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)","YUMMY COMBS * a Nutriti Wellne NOW Safest, Mos...",18.0,5.0,0.277778,2.0,1.0,1.0,0.17279,0.666667,0.452624,0.516646,Medium
5,ROSELINLIN\nFREE SHIPPING\n10,"(970, 250)",ROSELINLIN FREE SHIPPING 10,4.0,3.0,0.6,0.0,0.5,0.5,0.294787,0.0,0.75875,0.432509,Medium
6,MENARDS\nGIFT CARD\nMENARDS\nGIFT CARDS\nMEWAR...,"(300, 250)",MENARDS GIFT CARD MENARDS GIFT CARDS MEWARDS G...,19.0,19.0,0.6,1.0,0.5,0.5,0.187448,0.666667,0.717816,0.499534,Medium
7,"From Hello\nto Help,\n211 is Here\n2.1.1\nGet ...","(300, 250)","From Hello to Help, 211 is Here 2.1.1 Get Conn...",14.0,2.0,0.142857,0.0,0.5,0.5,0.173179,0.333333,0.970014,0.392368,Medium
8,ALL ABOUT\nCIRCUITS\nNeed a Battery\nRefresher...,"(300, 250)",ALL ABOUT CIRCUITS Need a Battery Refresher? 9...,23.0,13.0,0.565217,1.0,0.5,0.5,0.105081,0.333333,0.519229,0.392198,Medium
9,UNITED\nMileagePlus.\nGet ready and bon voyage...,"(300, 250)",UNITED MileagePlus. Get ready and bon voyage U...,21.0,1.0,0.047619,0.0,0.5,0.5,0.336201,0.666667,0.7272,0.427654,Medium
