# Task 2: Lookalike Model

In [297]:
#Import Utilities
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping

In [299]:
# Load the data
data = pd.read_csv("customers1.csv")

In [301]:
# Select the first 20 customers
data = data[data['CustomerID'].isin(data['CustomerID'].unique()[:20])]

In [303]:
# Preprocess the data
le = LabelEncoder()
data['Region'] = le.fit_transform(data['Region'])
data['Category'] = le.fit_transform(data['Category'])
data['ProductID'] = le.fit_transform(data['ProductID'])
data['ProductName'] = le.fit_transform(data['ProductName'])

In [305]:
# Create features and labels
features = data[['Region', 'Category', 'ProductID', 'ProductName', 'Price']]
labels = data['CustomerID']

In [307]:
# Encode labels
label_encoder = LabelEncoder()
label_encoder.fit(labels)  # Encode all customer IDs
labels_encoded = label_encoder.transform(labels)

In [309]:
# Scale features
scaler = StandardScaler()
features = scaler.fit_transform(features)

In [311]:
# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(features, labels_encoded)

In [313]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.4, random_state=42)

In [315]:
# Create the neural network model
inputs = Input(shape=(X_train.shape[1],))
x = Dense(16, activation='relu')(inputs)
outputs = Dense(len(np.unique(y_resampled)), activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)

early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=3,
    restore_best_weights=True
)

In [317]:
# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [319]:
# Train the model
history = model.fit(X_train, y_train, epochs=1000, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0833 - loss: 3.0278 - val_accuracy: 0.1250 - val_loss: 3.0160
Epoch 2/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step - accuracy: 0.0833 - loss: 3.0159 - val_accuracy: 0.1250 - val_loss: 3.0199
Epoch 3/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.0833 - loss: 3.0042 - val_accuracy: 0.1250 - val_loss: 3.0238
Epoch 4/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - accuracy: 0.0833 - loss: 2.9925 - val_accuracy: 0.1250 - val_loss: 3.0277


In [321]:
# Evaluate the model
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step


In [323]:
# Generate similarity scores for all customers
lookalike_data = pd.DataFrame(columns=['CustomerID', 'Similarity_Score'])
for customer_id in data['CustomerID'].unique():
    customer_index = data[data['CustomerID'] == customer_id].index[0]
    customer_features = features[customer_index].reshape(1, -1)
    probabilities = model.predict(customer_features)
    similarity_score = np.max(probabilities)
    new_row = pd.DataFrame({'CustomerID': [customer_id], 'Similarity_Score': [similarity_score]})
    lookalike_data = pd.concat([lookalike_data, new_row], ignore_index=True)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step


  lookalike_data = pd.concat([lookalike_data, new_row], ignore_index=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37

In [325]:
# Save the Lookalike DataFrame to CSV
lookalike_data.to_csv("Lookalike.csv", index=False)

In [327]:
w = pd.read_csv("Lookalike.csv")
w

Unnamed: 0,CustomerID,Similarity_Score
0,C0001,0.103214
1,C0002,0.09693
2,C0003,0.116926
3,C0004,0.063329
4,C0005,0.102102
5,C0006,0.054948
6,C0007,0.102155
7,C0008,0.057102
8,C0009,0.084261
9,C0010,0.087631
