# Lead Scoring with GPT-4.1
## ABB #7 - Session 2

Code authored by: Shaw Talebi

### import

In [1]:
from datasets import load_dataset

import pandas as pd
import numpy as np
from pydantic import BaseModel
from sklearn.linear_model import LogisticRegression

from openai import OpenAI
from dotenv import load_dotenv
import os

In [2]:
# pull settings from a local .env file into the process environment
load_dotenv()

# OpenAI client authenticated with the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

### helper functions

In [3]:
def load_prompt(filename, **kwargs) -> str:
    """Load a prompt template from the prompts directory and fill in its placeholders.

    Args:
        filename: Name of the markdown file inside ``prompts/``, without the
            ``.md`` extension.
        **kwargs: Values substituted into the template's ``{name}`` placeholders
            via ``str.format``.

    Returns:
        The formatted prompt text.

    Raises:
        FileNotFoundError: If ``prompts/{filename}.md`` does not exist.
        KeyError: If the template has a named placeholder with no matching kwarg.
    """
    # Decode explicitly as UTF-8: the prompt files contain non-ASCII punctuation
    # (curly quotes, en dashes), and the platform default encoding is not UTF-8
    # everywhere, which would garble the text or raise UnicodeDecodeError.
    with open(f"prompts/{filename}.md", "r", encoding="utf-8") as f:
        return f.read().format(**kwargs)

In [4]:
def format_input_data(record) -> str:
    """Render a lead record as the five-line ``Label: value`` text given to the model.

    Args:
        record: Mapping that provides the lead's origin, source, last activity,
            tags and current occupation under the dataset's column names.

    Returns:
        One line per field, joined with newlines and without a trailing newline.
    """
    # (label shown to the model, column name in the record)
    labelled_columns = (
        ("Lead Origin", "Lead Origin"),
        ("Lead Source", "Lead Source"),
        ("Last Activity", "Last Activity"),
        ("Tags", "Tags"),
        ("Current Occupation", "What is your current occupation"),
    )
    return "\n".join(f"{label}: {record[column]}" for label, column in labelled_columns)

In [5]:
def generate_prediction(model_name, prompt, lead_data, data_model) -> int:
    """Ask the model for a structured prediction and return its label.

    Args:
        model_name: Model identifier forwarded as ``model``.
        prompt: Instruction text forwarded as ``instructions``.
        lead_data: Formatted lead text forwarded as ``input``.
        data_model: Schema class forwarded as ``text_format``; the parsed
            output is expected to expose a ``label`` attribute.

    Returns:
        The ``label`` attribute of the parsed response.
    """
    # temperature is pinned to 0 for every call made through this helper
    request = {
        "model": model_name,
        "instructions": prompt,
        "input": lead_data,
        "text_format": data_model,
        "temperature": 0,
    }
    parsed = client.responses.parse(**request).output_parsed
    return parsed.label

In [6]:
def get_true_label(record) -> int:
    """Return whatever is stored under the record's ``'Converted'`` key.

    For a single lead this is its ground-truth label; if ``record`` is a
    batched mapping of columns, the whole ``'Converted'`` column is returned.
    """
    converted = record['Converted']
    return converted

In [7]:
# Structured-output schema: passed as `text_format` to client.responses.parse,
# and the parsed result's `label` attribute is what the notebook records.
class LeadScore(BaseModel):
    # predicted class for the lead; the schema only enforces an integer
    # (presumably 1 = will convert, 0 = will not, per the prompt text — confirm)
    label: int

In [8]:
def create_one_hot_features_aligned(df_train, df_test):
    """One-hot encode the categorical lead columns with a shared column layout.

    Both frames are stacked and encoded in a single ``pd.get_dummies`` call, so
    a category that appears in only one frame still gets a column in both. The
    encoded rows are then separated again by position.

    Args:
        df_train: Training DataFrame
        df_test: Test DataFrame

    Returns:
        Tuple of (df_train_encoded, df_test_encoded) with identical columns
        and a fresh 0-based index each.
    """
    # source column -> prefix given to the dummy columns derived from it
    prefix_by_column = {
        'Lead Origin': 'LeadOrigin',
        'Lead Source': 'LeadSource',
        'What is your current occupation': 'Occupation',
        'Last Activity': 'Last Activity',
        'Tags': 'Tags',
    }

    # remember where the training rows end so the stack can be split again
    n_train = df_train.shape[0]
    stacked = pd.concat([df_train, df_test], axis=0, ignore_index=True)

    encoded = pd.get_dummies(
        stacked,
        columns=list(prefix_by_column.keys()),
        prefix=list(prefix_by_column.values()),
        drop_first=False,
    )

    train_part = encoded.iloc[:n_train].reset_index(drop=True)
    test_part = encoded.iloc[n_train:].reset_index(drop=True)
    return train_part, test_part

In [9]:
def generate_examples(num_examples) -> str:
    """Build the few-shot examples text from randomly chosen training leads.

    Reads the module-level ``train_data`` and, for each sampled record, emits
    the formatted lead inside ``user_input`` tags followed by its true label
    inside ``ground_truth_label`` tags. Both carry the record's index as id.

    Args:
        num_examples: Number of distinct training records to include.

    Returns:
        The concatenated example blocks; an empty string when
        ``num_examples`` is 0.

    Raises:
        ValueError: If ``num_examples`` exceeds ``len(train_data)`` (sampling
            is without replacement).
    """
    # A private fixed-seed generator yields the same draw as
    # np.random.seed(42) + np.random.choice, but does not reset the global
    # NumPy RNG as a side effect of building a prompt.
    rng = np.random.RandomState(42)
    random_indices = rng.choice(len(train_data), size=num_examples, replace=False)

    blocks = []
    for i in random_indices:
        # fetch the record once and reuse it for both the input and the label
        record = train_data[i]
        blocks.append(
            f'<user_input id="example-{i}">\n'
            f'{format_input_data(record)}\n'
            f'</user_input id="example-{i}">\n\n'
            f'<ground_truth_label id="example-{i}">\n'
            f'{get_true_label(record)}\n'
            f'</ground_truth_label id="example-{i}">\n\n'
        )

    return ''.join(blocks)

### load data

In [10]:
# fetch the lead scoring dataset
dataset = load_dataset("shawhin/lead-scoring-x")

# split roles, in order:
#   'train' -> few-shot examples
#   'valid' -> crafting prompt
#   'test'  -> training ML model
train_data, valid_data, train_ml_data = (
    dataset[split_name] for split_name in ('train', 'valid', 'test')
)

### Baseline: ML Classifier

In [11]:
# one-hot encode the ML training split and the validation split with a shared column layout
df_train, df_valid = create_one_hot_features_aligned(
    df_train=train_ml_data.to_pandas(),
    df_test=valid_data.to_pandas(),
)

In [12]:
# predictors are every column from position 4 onward; the target is 'Converted'
# NOTE(review): the offset 4 assumes exactly four non-feature columns precede
# the one-hot columns — confirm against the dataset schema
X_train, y_train = df_train.iloc[:, 4:], df_train['Converted']
X_test, y_test = df_valid.iloc[:, 4:], df_valid['Converted']

In [13]:
# fit the logistic regression baseline on the one-hot features
clf = LogisticRegression(random_state=0)
clf = clf.fit(X_train, y_train)

In [14]:
# accuracy on the rows the model was fit on, then on the validation leads
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)

print(train_acc, test_acc, sep='\n')

0.9648574775478329
0.9519343493552169


### Prompt 1: Zero-shot

In [15]:
# preview the text the model receives for the first training lead
print(format_input_data(train_data[0]))

Lead Origin: API
Lead Source: Olark Chat
Last Activity: Page Visited on Website
Tags: Interested in other courses
Current Occupation: Unemployed


In [16]:
# ground-truth 'Converted' value for that same first training lead
print(get_true_label(train_data[0]))

0


In [17]:
# number of validation leads (taken from the start of valid_data) scored per prompt
num_leads=30
# model identifier sent with every prediction request
model_name = "gpt-4.1-mini"

In [18]:
# zero-shot instructions, read from prompts/prompt-1.md (no placeholders filled)
prompt_1 = load_prompt('prompt-1')

In [19]:
%%time
prompt_1_pred = []

for i in range(num_leads):
    # generate prediction
    response = client.responses.parse(
        model=model_name,
        instructions=prompt_1,
        input=format_input_data(valid_data[i]),
        text_format=LeadScore,
    )

    # add prediction to list
    prompt_1_pred.append(response.output_parsed.label)

CPU times: user 284 ms, sys: 37 ms, total: 321 ms
Wall time: 51.8 s


In [20]:
# results table: true labels next to the zero-shot predictions
df_results = pd.DataFrame({
    'true_label': get_true_label(valid_data[:num_leads]),
    'prompt_1_pred': prompt_1_pred,
})

# flag the leads where the prediction matches the true label
df_results['prompt_1_correct'] = df_results['true_label'].eq(df_results['prompt_1_pred'])

In [21]:
# accuracy = share of scored leads where prompt 1 matched the true label
print(df_results['prompt_1_correct'].mean())

0.43333333333333335


### Prompt 2: Meta-prompting

In [22]:
# second prompt variant, read from prompts/prompt-2.md (no placeholders filled)
prompt_2 = load_prompt('prompt-2')

In [23]:
%%time
prompt_2_pred = []

for i in range(num_leads):
    # generate prediction
    label = generate_prediction(
        model_name, 
        prompt_2, 
        format_input_data(valid_data[i]), 
        LeadScore,
    )

    # add prediction to list
    prompt_2_pred.append(label)

CPU times: user 165 ms, sys: 15.9 ms, total: 181 ms
Wall time: 31.9 s


In [24]:
# record prompt 2 predictions and whether each one matches the true label
df_results['prompt_2_pred'] = prompt_2_pred
df_results['prompt_2_correct'] = df_results['true_label'].eq(df_results['prompt_2_pred'])

In [25]:
# accuracy = share of scored leads where prompt 2 matched the true label
print(df_results['prompt_2_correct'].mean())

0.7666666666666667


### Prompt 3: Few-shot

In [46]:
# render 15 sampled training leads as few-shot examples
examples = generate_examples(num_examples=15)

# fill the {examples} placeholder of prompts/prompt-3.md
prompt_3 = load_prompt(filename='prompt-3', examples=examples)

In [47]:
# show the fully rendered few-shot prompt, including the injected examples
print(prompt_3)

You are an intelligent assistant that classifies leads based on their likelihood to convert.  
Given a lead’s attributes, determine if the lead **will convert (1)** or **will not convert (0)**.  

### **Input Variables**
- **Lead Origin** – How the lead was generated (e.g., API, Landing Page Submission)  
- **Lead Source** – Specific source of the lead (e.g., Google, Direct Traffic, Organic Search)  
- **Last Activity** – The most recent interaction (e.g., Email Opened, Page Visited, Form Submitted)  
- **Tags** – Notes or labels summarizing lead status (e.g., Ringing, Will revert after reading the email)  
- **Current Occupation** – The lead’s current job status (e.g., Student, Unemployed, Working Professional)  

### **Examples**

<user_input id="example-218">
Lead Origin: API
Lead Source: Olark Chat
Last Activity: Olark Chat Conversation
Tags: Diploma holder (Not Eligible)
Current Occupation: Unemployed
</user_input id="example-218">

<ground_truth_label id="example-218">
0
</ground

In [48]:
%%time
prompt_3_pred = []

for i in range(num_leads):
    # generate prediction
    label = generate_prediction(
        model_name, 
        prompt_3, 
        format_input_data(valid_data[i]), 
        LeadScore,
    )

    # add prediction to list
    prompt_3_pred.append(label)

CPU times: user 189 ms, sys: 17.4 ms, total: 206 ms
Wall time: 29.6 s


In [49]:
# record prompt 3 predictions and whether each one matches the true label
df_results['prompt_3_pred'] = prompt_3_pred
df_results['prompt_3_correct'] = df_results['true_label'].eq(df_results['prompt_3_pred'])

In [50]:
# accuracy = share of scored leads where prompt 3 matched the true label
print(df_results['prompt_3_correct'].mean())

0.9
