Lookalike Model

In [24]:

# Feature engineering
#
# Build one row per customer: profile attributes (tenure, one-hot region)
# joined with per-customer transaction aggregates.
customer_features = customers.copy()

# Coerce SignupDate to datetime so the subtraction below also works when the
# column was loaded as strings; this is a no-op for an existing datetime column.
customer_features['SignupDate'] = pd.to_datetime(customer_features['SignupDate'])

# Whole days elapsed between signup and the moment this cell runs.
customer_features['Tenure'] = (pd.Timestamp.today() - customer_features['SignupDate']).dt.days

# Replace the categorical Region column with one indicator column per region.
customer_features = pd.get_dummies(customer_features, columns=['Region'])

# Transaction features
# Named aggregation yields flat, explicitly named columns, so there is no
# MultiIndex header to rename by position afterwards.
txn_agg = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),
        AvgOrderValue=('TotalValue', 'mean'),
        PurchaseFrequency=('TransactionID', 'count'),
    )
    .reset_index()
)

# Left join keeps every customer row; customers without any transaction get
# NaN in the three aggregate columns.
customer_features = customer_features.merge(txn_agg, on='CustomerID', how='left')


from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Model inputs: everything except the identifier, name and raw date columns.
features = customer_features.drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])

# Missing values (e.g. aggregates of customers without transactions) become 0
# before each column is standardized to zero mean and unit variance.
scaler = StandardScaler()
scaled_features = scaler.fit(features.fillna(0)).transform(features.fillna(0))

# Pairwise cosine similarity between all customer rows (square matrix).
similarity_matrix = cosine_similarity(scaled_features)

# Generate recommendations
#
# For each target customer (C0001..C0020) keep the three most similar *other*
# customers by cosine similarity, as (CustomerID, score rounded to 2 dp).
customer_ids = customer_features['CustomerID'].to_numpy()

# Map each CustomerID to its row position in similarity_matrix; setdefault
# keeps the first occurrence should an ID ever appear more than once.
position_of = {}
for pos, cid in enumerate(customer_ids):
    position_of.setdefault(cid, pos)

lookalikes = {}
target_customers = [f"C{str(i).zfill(4)}" for i in range(1, 21)]

for cust_id in target_customers:
    if cust_id not in position_of:
        # Fail with the offending ID instead of an opaque IndexError.
        raise KeyError(f"Target customer {cust_id!r} not found in customer_features")
    idx = position_of[cust_id]

    # Exclude the customer by position rather than assuming it sorts first:
    # a tie at similarity 1.0 (identical features) or an all-zero feature row
    # can place another customer ahead of "self" in the ranking.
    scores = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[idx])
        if pos != idx
    ]
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)[:3]

    # float(...) turns the NumPy scalar into a plain Python float so the CSV
    # contains "0.97" rather than a NumPy scalar repr such as "np.float64(0.97)".
    lookalikes[cust_id] = [
        (customer_ids[pos], round(float(score), 2))
        for pos, score in sorted_scores
    ]


# One CSV row per target customer: the ID followed by its three (id, score) pairs.
pd.DataFrame.from_dict(lookalikes, orient='index').to_csv('Lookalike.csv', header=False)

In [15]:
 # Customer Segmentation


from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.decomposition import PCA


# Cluster on the standardized feature matrix built for the lookalike model.
X = scaled_features

# Pin the number of k-means restarts explicitly. scikit-learn changed the
# default from 10 to 'auto' (a single run for k-means++ init) in version 1.4,
# so relying on the default makes the clusters and metrics version-dependent.
KMEANS_N_INIT = 10

#  Determine optimal clusters
# Sweep k = 2..10 and record inertia (WCSS), Davies-Bouldin index and
# silhouette score for each k.
k_values = range(2, 11)
wcss = []
db_scores = []
sil_scores = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=KMEANS_N_INIT)
    labels = kmeans.fit_predict(X)
    wcss.append(kmeans.inertia_)
    db_scores.append(davies_bouldin_score(X, labels))
    sil_scores.append(silhouette_score(X, labels))


# Elbow plot on an explicitly created figure, so nothing left over from an
# earlier cell can end up in the saved image.
fig, ax = plt.subplots()
ax.plot(list(k_values), wcss, marker='o')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
fig.savefig('elbow_plot.png')
plt.close(fig)

# using k=5
kmeans = KMeans(n_clusters=5, random_state=42, n_init=KMEANS_N_INIT)
clusters = kmeans.fit_predict(X)


# Quality metrics for the final 5-cluster solution.
db_index = davies_bouldin_score(X, clusters)
sil_score = silhouette_score(X, clusters)

# Visualize clusters
# Project the feature matrix onto its first two principal components and
# colour each customer by its assigned cluster label.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)

# Explicit figure/axes: guarantees a fresh canvas and lets the axes be labelled
# so the saved image can be read on its own.
fig, ax = plt.subplots()
ax.scatter(principal_components[:, 0], principal_components[:, 1], c=clusters)
ax.set_title('Customer Segmentation')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
fig.savefig('clusters.png')
plt.close(fig)

# Report the clustering quality metrics, formatted to two decimals.
print(f"DB Index: {db_index:.2f}")
print(f"Silhouette Score: {sil_score:.2f}")

DB Index: 1.15
Silhouette Score: 0.34
