In [10]:
import urllib.request
import os

# Download the pickled pipeline file (a model artifact, not a dataset).
url = "https://github.com/DataTalksClub/machine-learning-zoomcamp/raw/refs/heads/master/cohorts/2025/05-deployment/pipeline_v1.bin"
filename = "pipeline_v1.bin"

print(f"Downloading {filename}...")
try:
    urllib.request.urlretrieve(url, filename)
except OSError:
    # urlretrieve raises on failure rather than returning (URLError, HTTPError
    # and ContentTooShortError are all OSError subclasses). Report the failure
    # here, then re-raise so later cells never run against a missing or
    # partially written file.
    print("❌ Download failed!")
    raise

# Verify the download: the file must exist and must not be empty.
file_size = os.path.getsize(filename) if os.path.exists(filename) else 0
if file_size > 0:
    print("✅ Download successful!")
    print(f"File: {filename}")
    print(f"Size: {file_size} bytes")
else:
    print("❌ Download failed!")
# The target for the classification task is the `converted` variable: whether the client has signed up to the platform or not.

Downloading pipeline_v1.bin...
✅ Download successful!
File: pipeline_v1.bin
Size: 1300 bytes
✅ Download successful!
File: pipeline_v1.bin
Size: 1300 bytes


# Model Scoring

Load the trained pipeline and use it to score a new record.

In [12]:
import pickle

# Unpickle the saved pipeline and unpack it into its vectorizer and model.
# NOTE(review): pickle can execute arbitrary code while loading — only open
# pipeline files that come from a trusted source.
with open('pipeline_v1.bin', 'rb') as pipeline_file:
    dv, model = pickle.load(pipeline_file)

print(
    "Pipeline loaded successfully!",
    f"DictVectorizer: {dv}",
    f"Model: {model}",
    sep="\n",
)

# Single lead to be scored.
record = {
    "lead_source": "paid_ads",
    "number_of_courses_viewed": 2,
    "annual_income": 79276.0
}

print()
print(f"Record to score: {record}")

# Vectorize the record; the vectorizer expects a list of dicts.
X = dv.transform([record])
print(f"Transformed features shape: {X.shape}")

# predict_proba returns one row per record; take the first (only) row and
# its second column as the conversion probability.
probability = model.predict_proba(X)[0][1]

print()
print(f"Probability that this lead will convert: {probability:.4f}")
print(f"Probability percentage: {probability * 100:.2f}%")

Pipeline loaded successfully!
DictVectorizer: DictVectorizer()
Model: LogisticRegression(solver='liblinear')

Record to score: {'lead_source': 'paid_ads', 'number_of_courses_viewed': 2, 'annual_income': 79276.0}
Transformed features shape: (1, 8)

Probability that this lead will convert: 0.5336
Probability percentage: 53.36%
