In [3]:
!pip install nltk spacy scikit-learn textblob
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.12-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.12-py3-none-any.whl (176 kB)
Downloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   --------------- ------------------------ 0.8/2.1 MB 4.9 MB/s eta 0:00:01
   ----------------------------------- ---- 1.8/2.1 MB 5.2 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 4.8 MB/s  0:00:00
Installing collected packages: pyphen, textstat

   -------------------- ------------------- 1/2 [textstat]
   -------------------- ------------------- 1/2 [textstat]
   -------------------- ------------------- 1/2 [textstat]
   ---------------------------------------- 2/2 [textstat]

Successfully installed pyphen-0.17.2 textstat-0.7.12


### Importing Libraries

In [1]:
import re
import nltk
import spacy
import textstat
import numpy as np

from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

### Raw Text

In [2]:
text = "Buy the best smartphone today with 20% discount and free delivery!"

#### Cleaning the Text

In [3]:
import re

In [4]:
text = text.lower()
text = re.sub(r'[^a-z0-9\s]', '', text)
text

'buy the best smartphone today with 20 discount and free delivery'

#### Tokenization

In [5]:
words = text.split()
words

['buy',
 'the',
 'best',
 'smartphone',
 'today',
 'with',
 '20',
 'discount',
 'and',
 'free',
 'delivery']

#### Removing useless words

In [7]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_words = [w for w in words if w not in stop_words]

print(filtered_words)

['buy', 'best', 'smartphone', 'today', '20', 'discount', 'free', 'delivery']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\subha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### By frequency division

In [8]:
from collections import Counter

In [9]:
word_freq = Counter(filtered_words)
word_freq

Counter({'buy': 1,
         'best': 1,
         'smartphone': 1,
         'today': 1,
         '20': 1,
         'discount': 1,
         'free': 1,
         'delivery': 1})

#### TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
ads = [
    "Buy the best smartphone today with 20% discount",
    "Limited offer on laptops",
    "Free delivery on electronics"
]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(ads)

feature_names = vectorizer.get_feature_names_out()
scores = X[0].toarray()[0]

important_words = sorted(
    zip(feature_names, scores),
    key=lambda x: x[1],
    reverse=True
)

important_words[:5]

[('20', np.float64(0.4082482904638631)),
 ('best', np.float64(0.4082482904638631)),
 ('buy', np.float64(0.4082482904638631)),
 ('discount', np.float64(0.4082482904638631)),
 ('smartphone', np.float64(0.4082482904638631))]

#### Extracting meaning

In [13]:
import spacy

In [14]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("Buy the best smartphone today with 20% discount")

for ent in doc.ents:
    print(ent.text, ent.label_)

today DATE
20% PERCENT


In [15]:
cta_words = ["buy", "shop", "install", "download", "try", "sign"]

cta_present = any(w in filtered_words for w in cta_words)
print(cta_present)

True


## Generating Synthetic CTR Dataset

In [16]:
import pandas as pd

In [18]:
df = pd.read_csv("ads_creative_text_sample.csv")
df

Unnamed: 0,text,dimensions
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)"
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)"
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)"
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)"
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)"
...,...,...
995,Ochrome OS\nDevice management\nmade simple and...,"(300, 600)"
996,"Continue to Recipe\n3 Easy Steps:\n1) Click ""C...","(336, 280)"
997,WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...,"(970, 250)"
998,EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...,"(300, 250)"


In [19]:
df.head()

Unnamed: 0,text,dimensions
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)"
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)"
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)"
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)"
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)"


In [20]:
df["text"]

0      Up to\n$100 off\nroundtrip\nflights to\nIrelan...
1      yp The Real\nур\nYellow Pages\nFind cheap\ngas...
2      Food Navigator\nUSA\nPlant-based meat:\nBeyond...
3      MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...
4      YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...
                             ...                        
995    Ochrome OS\nDevice management\nmade simple and...
996    Continue to Recipe\n3 Easy Steps:\n1) Click "C...
997    WIN THIS HOUSE\nAND START\nEVERY DAY HERE\nAll...
998    EVIDENT OLYMPUS\nSZX-AR1 Augmented\nReality Mi...
999    Searching for Speakers?\nCompare\nBefore You B...
Name: text, Length: 1000, dtype: object

### Cleaning df["text"]

In [21]:
import re

def clean_ad_text(text):
    if not isinstance(text, str):
        return ""

    # 1. Replace line breaks with space
    text = text.replace("\n", " ")

    # 2. Remove extra dots
    text = re.sub(r"\.{2,}", "", text)

    # 3. Remove special characters (keep % and numbers)
    text = re.sub(r"[^a-zA-Z0-9%\s]", "", text)

    # 4. Convert to lowercase
    text = text.lower()

    # 5. Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [22]:
df["clean_ad_text"] = df["text"].apply(clean_ad_text)

In [23]:
df[["text", "clean_ad_text"]].head(5)

Unnamed: 0,text,clean_ad_text
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,up to 100 off roundtrip flights to ireland tra...
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,yp the real yellow pages find cheap gas near y...
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,food navigator usa plantbased meat beyond the ...
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,monstrous airflow 2023 duramax banks 27% bigge...
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,yummy combs a nutriti wellne now safest most n...


### Duplicate removal

In [25]:
df = df.drop_duplicates(subset="clean_ad_text")
len(df)

993

### TOO short ads removal

In [26]:
df = df[df["clean_ad_text"].str.len() > 20]
df.head()

Unnamed: 0,text,dimensions,clean_ad_text
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,"(160, 600)",up to 100 off roundtrip flights to ireland tra...
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,"(300, 250)",yp the real yellow pages find cheap gas near y...
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,"(300, 600)",food navigator usa plantbased meat beyond the ...
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,"(300, 250)",monstrous airflow 2023 duramax banks 27% bigge...
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,"(728, 90)",yummy combs a nutriti wellne now safest most n...


In [27]:
df = df.reset_index(drop=True)

### Drop Dimensions Column

In [29]:
df = df.drop(columns=["dimensions"])
df.head()

Unnamed: 0,text,clean_ad_text
0,Up to\n$100 off\nroundtrip\nflights to\nIrelan...,up to 100 off roundtrip flights to ireland tra...
1,yp The Real\nур\nYellow Pages\nFind cheap\ngas...,yp the real yellow pages find cheap gas near y...
2,Food Navigator\nUSA\nPlant-based meat:\nBeyond...,food navigator usa plantbased meat beyond the ...
3,MONSTROUS\nAIRFLOW\n$20-$23\nDURAMAX\nbanks\n+...,monstrous airflow 2023 duramax banks 27% bigge...
4,YUMMY\nCOMBS\n*\na\nNutriti\nWellne\nNOW\nSafe...,yummy combs a nutriti wellne now safest most n...


### Keywords

In [31]:
keywords = pd.read_csv("Keywords.csv")
keywords

Unnamed: 0,buy
0,order
1,shop
2,purchase
3,sale
4,deal
...,...
326,system
327,setup
328,installation
329,supportservice


In [32]:
keyword_weight = {
    # A. Purchase / Action
    "buy": 0.12,
    "order": 0.12,
    "purchase": 0.12,
    "shop": 0.12,
    "now": 0.12,
    "checkout": 0.12,

    # B. Deals / Price
    "sale": 0.08,
    "deal": 0.08,
    "discount": 0.08,
    "coupon": 0.08,
    "promo": 0.08,
    "cashback": 0.08,
    "clearance": 0.08,

    # C. Urgency / Scarcity
    "limited": 0.06,
    "hurry": 0.06,
    "lastchance": 0.06,
    "endingsoon": 0.06,
    "flashsale": 0.06,
    "today": 0.06,

    # D. Trust / Assurance
    "original": 0.05,
    "genuine": 0.05,
    "official": 0.05,
    "trusted": 0.05,
    "verified": 0.05,
    "warranty": 0.05,

    # E. Convenience / Delivery
    "freedelivery": 0.04,
    "freeshipping": 0.04,
    "fastdelivery": 0.04,
    "instant": 0.04,
    "express": 0.04,

    # F. Social Proof / Quality
    "bestseller": 0.04,
    "toprated": 0.04,
    "reviews": 0.04,
    "ratings": 0.04,
    "recommended": 0.04
}


### CTR estimation Function

In [33]:
import math
import re

def estimate_ctr(ad_text,
                 base_ctr=0.015,
                 min_ctr=0.005,
                 max_ctr=0.25,
                 use_log_dampening=True):
    """
    Estimate CTR based on keyword presence in ad text.
    Returns CTR in range [0, 1].
    """

    # Normalize text
    tokens = re.findall(r"[a-zA-Z]+", ad_text.lower())

    # Weighted keyword score
    keyword_score = sum(keyword_weight.get(word, 0) for word in tokens)

    #formula
    if use_log_dampening:
        ctr = base_ctr + math.log(1 + keyword_score)
    else:
        ctr = base_ctr + keyword_score

    # Clamp to valid CTR range
    ctr = min(max(ctr, min_ctr), max_ctr)

    return round(ctr, 4)
