In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [2]:
# Read the three input tables (customers, transactions, products) from the
# working directory, in that order.
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

In [3]:
# Join each transaction to its customer (on CustomerID), then to its product
# (on ProductID). Both are default inner joins.
# The merged frame ends up with 'Price_x' and 'Price_y' instead of a single
# 'Price' column, because a 'Price' column is present on both sides of the
# second join and pandas applies its default suffixes.
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
data = pd.merge(transactions_with_customers, products, on='ProductID')

In [5]:
# Show the merged column names so suffixed duplicates (e.g. Price_x / Price_y) are visible.
print("Columns in merged dataset:", data.columns)

Columns in merged dataset: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [6]:
# Ensure the columns used as similarity features exist in the merged dataset.
# The merged frame has no bare 'Price' column: the merge suffixes the two
# price columns as 'Price_x' and 'Price_y', so checking for 'Price' always
# failed. Validate the columns that are actually used downstream instead.
required_columns = ['Quantity', 'TotalValue', 'Price_x']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"Columns {missing_columns} not found in merged dataset. "
        "Please check the input CSV files."
    )

In [7]:
# Numerical columns used to measure similarity between rows of `data`.
features = ['Quantity', 'TotalValue', 'Price_x']

# Fit the scaler on those columns, then standardise the same columns with it.
scaler = StandardScaler().fit(data[features])
data_scaled = scaler.transform(data[features])

In [8]:
# Fit a Euclidean nearest-neighbour index over the scaled feature rows.
# n_neighbors=4 because a query made with a fitted row usually returns that
# row itself, leaving 3 other neighbours.
nn = NearestNeighbors(metric='euclidean', n_neighbors=4)
nn.fit(X=data_scaled)

In [10]:
# Get lookalike customers for the first 20 customers.
#
# Result: lookalike_results maps each CustomerID to a list of up to 3
# (other CustomerID, distance) tuples, nearest first.
#
# NOTE(review): the neighbour model is fitted on individual rows of `data`
# (one per transaction), so each customer is represented here only by their
# first row. Aggregating features per customer may be preferable -- confirm.
lookalike_results = {}
row_customer_ids = data['CustomerID'].to_numpy()
n_rows = len(row_customer_ids)
for customer_id in customers['CustomerID'][:20]:
    # Use positional offsets (not index labels): `data_scaled` and the indices
    # returned by `kneighbors` are positional.
    row_positions = (row_customer_ids == customer_id).nonzero()[0]
    if len(row_positions) == 0:
        # Customer has no rows in the merged data; previously this raised
        # IndexError on `.index[0]`.
        lookalike_results[customer_id] = []
        continue
    query_point = [data_scaled[row_positions[0]]]

    # Start with the 4 nearest rows and widen the search until 3 distinct
    # *other* customers are found (or every row has been examined). Excluding
    # only the query row is not enough: the customer's other transactions, or
    # several rows of one neighbour, could otherwise fill the top 3.
    k = min(4, n_rows)
    while True:
        distances, indices = nn.kneighbors(query_point, n_neighbors=k)
        similar_customers = []
        seen = {customer_id}
        for dist, pos in zip(distances[0], indices[0]):
            other_id = row_customer_ids[pos]
            if other_id in seen:
                continue
            seen.add(other_id)
            similar_customers.append((other_id, dist))
            if len(similar_customers) == 3:
                break
        if len(similar_customers) == 3 or k >= n_rows:
            break
        k = min(k * 2, n_rows)
    lookalike_results[customer_id] = similar_customers  # Top 3 lookalikes

In [13]:
# Build one row per customer: the dict keys become the index and each list
# entry becomes a positional column.
lookalike_df = pd.DataFrame.from_dict(data=lookalike_results, orient='index')

# Inspect the auto-generated column labels before they are renamed.
columns_before_rename = lookalike_df.columns
print("Lookalike DataFrame Columns Before Renaming:", columns_before_rename)

Lookalike DataFrame Columns Before Renaming: RangeIndex(start=0, stop=3, step=1)


In [14]:
# Move the customer IDs out of the index into a regular column.
# Guarded so that re-running this cell is a no-op: an unconditional
# reset_index(inplace=True) would add another 'index' column on every re-run
# and the rename below would then mislabel the columns.
if 'CustomerID' not in lookalike_df.columns:
    lookalike_df = lookalike_df.reset_index()

# Rename columns dynamically based on actual structure: the first column holds
# the customer ID, the rest hold that customer's lookalike entries.
column_names = ['CustomerID'] + [f'SimilarCustomer_{i+1}' for i in range(lookalike_df.shape[1] - 1)]
lookalike_df.columns = column_names

# Save results to CSV without writing the positional index.
lookalike_df.to_csv('Somesh_Gowda_Lookalike.csv', index=False)

print("Lookalike Model completed. Results saved in Somesh_Gowda_Lookalike.csv")


Lookalike Model completed. Results saved in Somesh_Gowda_Lookalike.csv
