In [1]:
!pip install nltk spacy scikit-learn textblob
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 1.9 MB/s eta 0:00:07
     -- ------------------------------------- 0.8/12.8 MB 1.5 MB/s eta 0:00:08
     --- ------------------------------------ 1.0/12.8 MB 1.4 MB/s eta 0:00:09
     ---- ----------------------------------- 1.3/12.8 MB 1.3 MB/s eta 0:00:09
     ---- ----------------------------------- 1.6/12.8 MB 1.3 MB/s eta 0:00:09
     ----- ---------------------------------- 1.8/12.8 MB 1.3 MB/s eta 0:00:09
     ------ --------------------------------- 2.1/12.8 MB 1.3 MB/s eta 0:00:09
     ------- -------------------------------- 2.4/12.8 MB 1.3 MB/s eta 0:00:09
     -------- ------------------------------- 2.

In [2]:
!pip install textstat



### Importing Libraries

In [3]:
import re
import nltk
import spacy
import textstat
import numpy as np

from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

### Raw Text

In [4]:
text = "Buy the best smartphone today with 20% discount and free delivery!"

#### Cleaning the Text

In [5]:
import re

In [6]:
text = text.lower()
text = re.sub(r'[^a-z0-9\s]', '', text)
text

'buy the best smartphone today with 20 discount and free delivery'

#### Tokenization

In [7]:
words = text.split()
words

['buy',
 'the',
 'best',
 'smartphone',
 'today',
 'with',
 '20',
 'discount',
 'and',
 'free',
 'delivery']

#### Removing useless words

In [8]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_words = [w for w in words if w not in stop_words]

print(filtered_words)

['buy', 'best', 'smartphone', 'today', '20', 'discount', 'free', 'delivery']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\subha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### By frequency division

In [9]:
from collections import Counter

In [10]:
word_freq = Counter(filtered_words)
word_freq

Counter({'buy': 1,
         'best': 1,
         'smartphone': 1,
         'today': 1,
         '20': 1,
         'discount': 1,
         'free': 1,
         'delivery': 1})

#### TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
ads = [
    "Buy the best smartphone today with 20% discount",
    "Limited offer on laptops",
    "Free delivery on electronics"
]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(ads)

feature_names = vectorizer.get_feature_names_out()
scores = X[0].toarray()[0]

important_words = sorted(
    zip(feature_names, scores),
    key=lambda x: x[1],
    reverse=True
)

important_words[:5]

[('20', np.float64(0.4082482904638631)),
 ('best', np.float64(0.4082482904638631)),
 ('buy', np.float64(0.4082482904638631)),
 ('discount', np.float64(0.4082482904638631)),
 ('smartphone', np.float64(0.4082482904638631))]

#### Extracting meaning

In [13]:
import spacy

In [14]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("Buy the best smartphone today with 20% discount")

for ent in doc.ents:
    print(ent.text, ent.label_)

today DATE
20% PERCENT


In [15]:
cta_words = ["buy", "shop", "install", "download", "try", "sign"]

cta_present = any(w in filtered_words for w in cta_words)
print(cta_present)

True


## Generating Synthetic CTR Dataset

In [16]:
import pandas as pd

In [17]:
df = pd.read_csv("ads_creative_text_sample.csv")
df

Unnamed: 0,text,dimensions
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)"
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)"
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)"
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)"
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)"
...,...,...
995,Ochrome OS\nDevice management\nmade simple and...,"(300, 600)"
996,"Continue to Recipe\n3 Easy Steps:\n1) Click ""C...","(336, 280)"
997,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,"(970, 250)"
998,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,"(300, 250)"


In [18]:
df.head()

Unnamed: 0,text,dimensions
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)"
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)"
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)"
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)"
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)"


In [19]:
df["text"]

0      Up to\n$100 off\nroundtrip\nflights to\nIrelan...
1      yp The Real\nур\nYellow Pages\nFind cheap\ngas...
2      Food Navigator\nUSA\nPlant-based meat:\nBeyond...
3      MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...
4      YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...
                             ...                        
995    Ochrome OS\nDevice management\nmade simple and...
996    Continue to Recipe\n3 Easy Steps:\n1) Click "C...
997    WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...
998    EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...
999    Searching for Speakers?\nCompare\nBefore You B...
Name: text, Length: 1000, dtype: object

### Cleaning df["text"]

In [20]:
import re

def clean_ad_text(text):
    if not isinstance(text, str):
        return ""

    # 1. Replace line breaks with space
    text = text.replace("\n", " ")

    # 2. Remove extra dots
    text = re.sub(r"\.{2,}", "", text)

    # 3. Remove special characters (keep % and numbers)
    text = re.sub(r"[^a-zA-Z0-9%\s]", "", text)

    # 4. Convert to lowercase
    text = text.lower()

    # 5. Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [21]:
df["clean_ad_text"] = df["text"].apply(clean_ad_text)

df

Unnamed: 0,text,dimensions,clean_ad_text
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",up to 100 off roundtrip flights to ireland tra...
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp the real yellow pages find cheap gas near y...
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",food navigator usa plantbased meat beyond the ...
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",monstrous airflow 2023 duramax banks 27% bigge...
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)",yummy combs a nutriti wellne now safest most n...
...,...,...,...
995,Ochrome OS\nDevice management\nmade simple and...,"(300, 600)",ochrome os device management made simple and s...
996,"Continue to Recipe\n3 Easy Steps:\n1) Click ""C...","(336, 280)",continue to recipe 3 easy steps 1 click contin...
997,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,"(970, 250)",win this house and start every day here all to...
998,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,"(300, 250)",evident olympus szxar1 augmented reality micro...


In [22]:
df[["text", "clean_ad_text"]].head(5)

Unnamed: 0,text,clean_ad_text
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,up to 100 off roundtrip flights to ireland tra...
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,yp the real yellow pages find cheap gas near y...
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,food navigator usa plantbased meat beyond the ...
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,monstrous airflow 2023 duramax banks 27% bigge...
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,yummy combs a nutriti wellne now safest most n...


### Duplicate removal

In [23]:
df = df.drop_duplicates(subset="clean_ad_text")
len(df)

993

### TOO short ads removal

In [24]:
df = df.reset_index(drop=True)

### Drop Dimensions Column

In [25]:
df = df.drop(columns=["dimensions"])
df.head()

Unnamed: 0,text,clean_ad_text
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,up to 100 off roundtrip flights to ireland tra...
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,yp the real yellow pages find cheap gas near y...
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,food navigator usa plantbased meat beyond the ...
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,monstrous airflow 2023 duramax banks 27% bigge...
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,yummy combs a nutriti wellne now safest most n...


### Keywords

In [26]:
keywords = pd.read_csv("Keywords.csv")
keywords

Unnamed: 0,buy
0,order
1,shop
2,purchase
3,sale
4,deal
...,...
326,system
327,setup
328,installation
329,supportservice


In [27]:
keyword_weight = {
    # A. Purchase / Action
    "buy": 0.12,
    "order": 0.12,
    "purchase": 0.12,
    "shop": 0.12,
    "now": 0.12,
    "checkout": 0.12,

    # B. Deals / Price
    "sale": 0.08,
    "deal": 0.08,
    "discount": 0.08,
    "coupon": 0.08,
    "promo": 0.08,
    "cashback": 0.08,
    "clearance": 0.08,

    # C. Urgency / Scarcity
    "limited": 0.06,
    "hurry": 0.06,
    "lastchance": 0.06,
    "endingsoon": 0.06,
    "flashsale": 0.06,
    "today": 0.06,

    # D. Trust / Assurance
    "original": 0.05,
    "genuine": 0.05,
    "official": 0.05,
    "trusted": 0.05,
    "verified": 0.05,
    "warranty": 0.05,

    # E. Convenience / Delivery
    "freedelivery": 0.04,
    "freeshipping": 0.04,
    "fastdelivery": 0.04,
    "instant": 0.04,
    "express": 0.04,

    # F. Social Proof / Quality
    "bestseller": 0.04,
    "toprated": 0.04,
    "reviews": 0.04,
    "ratings": 0.04,
    "recommended": 0.04
}


### CTR estimation Function

In [28]:
import math
import re

def estimate_ctr(ad_text,
                 base_ctr=0.015,
                 min_ctr=0.005,
                 max_ctr=0.25,
                 use_log_dampening=True):
    """
    Estimate CTR based on keyword presence in ad text.
    Returns CTR in range [0, 1].
    """

    # Normalize text
    tokens = re.findall(r"[a-zA-Z]+", ad_text.lower())

    # Weighted keyword score
    keyword_score = sum(keyword_weight.get(word, 0) for word in tokens)

    #formula
    if use_log_dampening:
        ctr = base_ctr + math.log(1 + keyword_score)
    else:
        ctr = base_ctr + keyword_score

    # Clamp to valid CTR range
    ctr = min(max(ctr, min_ctr), max_ctr)

    return round(ctr, 4)


In [29]:
ads = [
    "Buy now limited time discount with free delivery",
    "Official store with warranty and fast delivery",
    "Top rated bestseller with verified reviews",
    "Affordable product with cashback offer today",
    "Shop online with instant checkout and express shipping"
]

for ad in ads:
    print(f"Ad: {ad}")
    print(f"Estimated CTR: {estimate_ctr(ad) * 100:.2f}%\n")

Ad: Buy now limited time discount with free delivery
Estimated CTR: 25.00%

Ad: Official store with warranty and fast delivery
Estimated CTR: 11.03%

Ad: Top rated bestseller with verified reviews
Estimated CTR: 13.72%

Ad: Affordable product with cashback offer today
Estimated CTR: 14.60%

Ad: Shop online with instant checkout and express shipping
Estimated CTR: 25.00%



### Estimated CTR

In [30]:
def extract_ctr_features(ad_text):
    tokens = re.findall(r"[a-zA-Z]+", ad_text.lower())

    features = {k: 0 for k in keyword_weight}
    for word in tokens:
        if word in features:
            features[word] += 1

    features["total_keyword_score"] = sum(
        features[k] * keyword_weight[k] for k in keyword_weight
    )

    return features


In [31]:
len(df)

993

In [32]:
df

Unnamed: 0,text,clean_ad_text
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,up to 100 off roundtrip flights to ireland tra...
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,yp the real yellow pages find cheap gas near y...
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,food navigator usa plantbased meat beyond the ...
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,monstrous airflow 2023 duramax banks 27% bigge...
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,yummy combs a nutriti wellne now safest most n...
...,...,...
988,Dying\nLiver\nHelp!\n4 Warning Signs\nof Dying...,dying liver help 4 warning signs of dying live...
989,Ochrome OS\nDevice management\nmade simple and...,ochrome os device management made simple and s...
990,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,win this house and start every day here all to...
991,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,evident olympus szxar1 augmented reality micro...


In [33]:
df["estimated_ctr"] = df["clean_ad_text"].apply(estimate_ctr)

In [34]:
df

Unnamed: 0,text,clean_ad_text,estimated_ctr
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,up to 100 off roundtrip flights to ireland tra...,0.1283
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,yp the real yellow pages find cheap gas near y...,0.1283
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,food navigator usa plantbased meat beyond the ...,0.0150
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,monstrous airflow 2023 duramax banks 27% bigge...,0.0150
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,yummy combs a nutriti wellne now safest most n...,0.2500
...,...,...,...
988,Dying\nLiver\nHelp!\n4 Warning Signs\nof Dying...,dying liver help 4 warning signs of dying live...,0.0150
989,Ochrome OS\nDevice management\nmade simple and...,ochrome os device management made simple and s...,0.0150
990,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,win this house and start every day here all to...,0.0150
991,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,evident olympus szxar1 augmented reality micro...,0.0150


In [35]:
df.tail()

Unnamed: 0,text,clean_ad_text,estimated_ctr
988,Dying\nLiver\nHelp!\n4 Warning Signs\nof Dying...,dying liver help 4 warning signs of dying live...,0.015
989,Ochrome OS\nDevice management\nmade simple and...,ochrome os device management made simple and s...,0.015
990,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,win this house and start every day here all to...,0.015
991,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,evident olympus szxar1 augmented reality micro...,0.015
992,Searching for Speakers?\nCompare\nBefore You B...,searching for speakers compare before you buy ...,0.2301


### CatBoost Regressor

In [36]:
def extract_features(ad_text):
    tokens = re.findall(r"[a-zA-Z]+", ad_text.lower())

    features = {
        "action_cnt": 0,
        "deal_cnt": 0,
        "urgency_cnt": 0,
        "trust_cnt": 0,
        "convenience_cnt": 0,
        "socialproof_cnt": 0,
        "total_keyword_score": 0.0,
        "ad_length": len(tokens)
    }

    for word in tokens:
        w = keyword_weight.get(word, 0)
        if w > 0:
            features["total_keyword_score"] += w

        if word in ["buy", "order", "purchase", "shop", "now", "checkout"]:
            features["action_cnt"] += 1
        elif word in ["sale", "deal", "discount", "coupon", "promo", "cashback", "clearance"]:
            features["deal_cnt"] += 1
        elif word in ["limited", "hurry", "lastchance", "endingsoon", "flashsale", "today"]:
            features["urgency_cnt"] += 1
        elif word in ["original", "genuine", "official", "trusted", "verified", "warranty"]:
            features["trust_cnt"] += 1
        elif word in ["freedelivery", "freeshipping", "fastdelivery", "instant", "express"]:
            features["convenience_cnt"] += 1
        elif word in ["bestseller", "toprated", "reviews", "ratings", "recommended"]:
            features["socialproof_cnt"] += 1

    return features


In [37]:
df

Unnamed: 0,text,clean_ad_text,estimated_ctr
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,up to 100 off roundtrip flights to ireland tra...,0.1283
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,yp the real yellow pages find cheap gas near y...,0.1283
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,food navigator usa plantbased meat beyond the ...,0.0150
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,monstrous airflow 2023 duramax banks 27% bigge...,0.0150
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,yummy combs a nutriti wellne now safest most n...,0.2500
...,...,...,...
988,Dying\nLiver\nHelp!\n4 Warning Signs\nof Dying...,dying liver help 4 warning signs of dying live...,0.0150
989,Ochrome OS\nDevice management\nmade simple and...,ochrome os device management made simple and s...,0.0150
990,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,win this house and start every day here all to...,0.0150
991,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,evident olympus szxar1 augmented reality micro...,0.0150


In [38]:
X = df["clean_ad_text"].apply(extract_features).apply(pd.Series)
y = df["estimated_ctr"]  

## Catboost Regressor

In [41]:
!pip install catboost



In [42]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

In [43]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = CatBoostRegressor(
    iterations=600,
    depth=6,
    learning_rate=0.05,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42,
    verbose=100
)

model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    use_best_model=True
)

0:	learn: 0.0719930	test: 0.0712311	best: 0.0712311 (0)	total: 141ms	remaining: 1m 24s
100:	learn: 0.0022284	test: 0.0026344	best: 0.0026344 (100)	total: 202ms	remaining: 1s
200:	learn: 0.0008694	test: 0.0022070	best: 0.0022070 (200)	total: 260ms	remaining: 516ms
300:	learn: 0.0004828	test: 0.0020674	best: 0.0020673 (299)	total: 323ms	remaining: 321ms
400:	learn: 0.0003158	test: 0.0019980	best: 0.0019980 (400)	total: 386ms	remaining: 191ms
500:	learn: 0.0002383	test: 0.0019488	best: 0.0019488 (500)	total: 444ms	remaining: 87.8ms
599:	learn: 0.0001798	test: 0.0019218	best: 0.0019218 (599)	total: 503ms	remaining: 0us

bestTest = 0.001921797074
bestIteration = 599



<catboost.core.CatBoostRegressor at 0x28c80869400>

### Prediction

In [44]:
new_ads = [
    "Buy now limited discount with free delivery",
    "Official store with warranty"
]

X_new = pd.DataFrame([extract_features(ad) for ad in new_ads])
predicted_ctr = model.predict(X_new)

predicted_ctr

array([0.23107263, 0.10056379])

In [45]:
model.save_model("ctr_model.cbm")