In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load synthetic dataset
df = pd.read_csv("../data/synthetic_lead_scoring.csv")

# Derive the e-mail domain (the segment following the first "@") as a feature.
# The vectorized .str accessor yields NaN for missing or "@"-less addresses
# instead of raising; the categorical imputer downstream fills those gaps.
df["email_domain"] = df["email"].str.split("@").str[1]

# Features: every column except the target and the raw address. The derived
# email_domain column is already part of df, so it carries over into X.
X = df.drop(columns=["converted", "email"])
y = df["converted"]

# Preprocessing: column groups routed to each branch of the transformer
numerical_features = ["company_size", "website_visits"]
categorical_features = ["sector", "country", "email_domain"]

# Numeric branch: fill missing values with the column mean
numerical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean"))],
)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode (handle_unknown="ignore" tolerates categories unseen at fit)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ],
)

# Apply each branch to its own column group
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
)

# End-to-end estimator: preprocessing followed by a random forest with a
# fixed seed
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            RandomForestClassifier(n_estimators=100, random_state=42),
        ),
    ],
)

# Train on 80% of the rows; the remaining 20% are held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
clf.fit(X_train, y_train)

# Report accuracy on the held-out split so the quality of the model that is
# about to be saved is visible (previously the test split was never used)
print(f"Held-out accuracy: {clf.score(X_test, y_test):.3f}")

# Save model
model_path = "../model/lead_scoring_model.joblib"
# Create the target directory first: writing into a missing directory would
# fail with FileNotFoundError only after the model has already been trained.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, model_path)



['../model/lead_scoring_model.joblib']