In [18]:
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip





In [20]:
import pandas as pd
from itertools import combinations
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])


In [21]:
# Extract customer data
customer_features = transactions.groupby('CustomerID').agg({
    'Quantity': 'sum',
    'TotalValue': ['mean', 'sum', 'count'],
    'TransactionDate': [lambda x: (x.max() - x.min()).days]
}).reset_index()
customer_features.columns = [
    'CustomerID', 'TotalQuantity', 'AvgTransactionValue', 'TotalSpent',
    'TransactionCount', 'PurchaseSpanDays'
]


customer_features = pd.merge(customers, customer_features, on='CustomerID', how='left')
customer_features['PurchaseSpanDays'] = customer_features['PurchaseSpanDays'].fillna(0)
customer_features = customer_features.fillna(0)
print(customer_features.head())

  CustomerID        CustomerName         Region SignupDate  TotalQuantity  \
0      C0001    Lawrence Carroll  South America 2022-07-10           12.0   
1      C0002      Elizabeth Lutz           Asia 2022-02-13           10.0   
2      C0003      Michael Rivera  South America 2024-03-07           14.0   
3      C0004  Kathleen Rodriguez  South America 2022-10-09           23.0   
4      C0005         Laura Weber           Asia 2022-08-15            7.0   

   AvgTransactionValue  TotalSpent  TransactionCount  PurchaseSpanDays  
0              670.904     3354.52               5.0             288.0  
1              465.685     1862.74               4.0             278.0  
2              681.345     2725.38               4.0             188.0  
3              669.360     5354.88               8.0             299.0  
4              678.080     2034.24               3.0             233.0  


In [22]:
# Extract product data

product_features = transactions.groupby('ProductID').agg({
    'Quantity': 'sum',
    'Price': 'mean',
    'TotalValue': 'sum'
}).reset_index()
product_features.columns = ['ProductID', 'TotalQuantitySold', 'AvgPrice', 'TotalRevenue']

product_features = pd.merge(products, product_features, on='ProductID', how='left')
product_features = product_features.fillna(0)
print(product_features.head())

  ProductID              ProductName     Category   Price  TotalQuantitySold  \
0      P001     ActiveWear Biography        Books  169.30                 18   
1      P002    ActiveWear Smartwatch  Electronics  346.30                 25   
2      P003  ComfortLiving Biography        Books   44.12                 31   
3      P004            BookWorld Rug   Home Decor   95.69                 18   
4      P005          TechPro T-Shirt     Clothing  429.31                 24   

   AvgPrice  TotalRevenue  
0    169.30       3047.40  
1    346.30       8657.50  
2     44.12       1367.72  
3     95.69       1722.42  
4    429.31      10303.44  


In [23]:
# Generate pair-wise data for the model

pairs = list(combinations(customer_features['CustomerID'], 2))
pairwise_data = []

for c1, c2 in pairs:
    cust1 = customer_features[customer_features['CustomerID'] == c1].iloc[0]
    cust2 = customer_features[customer_features['CustomerID'] == c2].iloc[0]
    diff = abs(cust1.drop(['CustomerID', 'CustomerName', 'Region', 'SignupDate']) - 
               cust2.drop(['CustomerID', 'CustomerName', 'Region', 'SignupDate']))
    pairwise_data.append([c1, c2] + list(diff))

pairwise_df = pd.DataFrame(pairwise_data, columns=['Customer1', 'Customer2'] + list(customer_features.columns[4:]))


In [24]:
# Train Random forest model to create the look-alike model

pairwise_df['SimilarityScore'] = 1 - pairwise_df.iloc[:, 2:].mean(axis=1)

X = pairwise_df.drop(['Customer1', 'Customer2', 'SimilarityScore'], axis=1)
y = pairwise_df['SimilarityScore']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [26]:
# Generate the look-alike dataset

top_20_customers = customers[customers['CustomerID'].isin([f'C{i:04}' for i in range(1, 21)])]

recommendations = {}

for customer in top_20_customers['CustomerID']:
    similarities = []
    for other_customer in customers['CustomerID']:
        if customer != other_customer:
            cust1 = customer_features[customer_features['CustomerID'] == customer].iloc[0]
            cust2 = customer_features[customer_features['CustomerID'] == other_customer].iloc[0]
            diff = abs(cust1.drop(['CustomerID', 'CustomerName', 'Region', 'SignupDate']) - 
                       cust2.drop(['CustomerID', 'CustomerName', 'Region', 'SignupDate']))
            diff_df = pd.DataFrame([diff], columns=X.columns)  # Ensure proper column names
            score = model.predict(diff_df)[0]
            similarities.append((other_customer, score))
    top_3 = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]
    recommendations[customer] = top_3

lookalike_df = pd.DataFrame({
    'cust_id': recommendations.keys(),
    'lookalikes': [str(l) for l in recommendations.values()]
})
print(lookalike_df)
lookalike_df.iloc[1:].to_csv('Lookalike.csv', index=False)



   cust_id                                         lookalikes
0    C0001  [('C0152', np.float64(-9.039523333333204)), ('...
1    C0002  [('C0029', np.float64(-22.352087999999966)), (...
2    C0003  [('C0178', np.float64(-13.917207142857187)), (...
3    C0004  [('C0021', np.float64(-19.551405333333364)), (...
4    C0005  [('C0159', np.float64(-15.45802133333332)), ('...
5    C0006  [('C0079', np.float64(-25.540220952380906)), (...
6    C0007  [('C0085', np.float64(-17.453520500000035)), (...
7    C0008  [('C0084', np.float64(-48.009003404762005)), (...
8    C0009  [('C0077', np.float64(-21.30920266666668)), ('...
9    C0010  [('C0029', np.float64(-22.34436133333334)), ('...
10   C0011  [('C0183', np.float64(-31.16980047619053)), ('...
11   C0012  [('C0155', np.float64(-46.712625238095285)), (...
12   C0013  [('C0045', np.float64(-19.636439214285648)), (...
13   C0014  [('C0058', np.float64(-30.9169850555555)), ('C...
14   C0015  [('C0095', np.float64(-46.014970634920644)), (...
15   C00