### Importing Libraries and Data Preprocessing

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
customers = pd.read_csv("/content/Customers.csv")
products = pd.read_csv("/content/Products.csv")
transactions = pd.read_csv("/content/Transactions.csv")

print(customers)
print(products)
print(transactions)

    CustomerID        CustomerName         Region  SignupDate
0        C0001    Lawrence Carroll  South America  2022-07-10
1        C0002      Elizabeth Lutz           Asia  2022-02-13
2        C0003      Michael Rivera  South America  2024-03-07
3        C0004  Kathleen Rodriguez  South America  2022-10-09
4        C0005         Laura Weber           Asia  2022-08-15
..         ...                 ...            ...         ...
195      C0196         Laura Watts         Europe  2022-06-07
196      C0197    Christina Harvey         Europe  2023-03-21
197      C0198         Rebecca Ray         Europe  2022-02-27
198      C0199      Andrea Jenkins         Europe  2022-12-03
199      C0200         Kelly Cross           Asia  2023-06-11

[200 rows x 4 columns]
   ProductID              ProductName     Category   Price
0       P001     ActiveWear Biography        Books  169.30
1       P002    ActiveWear Smartwatch  Electronics  346.30
2       P003  ComfortLiving Biography        Books   44

### Merging the Customer and Product Information

In [2]:
data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")
data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics,459.86


### Feature Engineering for Customer Profiles

In [3]:
# Feature engineering
customer_profiles = data.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    avg_spent=("Price_y", "mean"),
    total_transactions=("TransactionID", "count"),
    preferred_category=("Category", lambda x: x.mode()[0] if not x.mode().empty else None),
    region=("Region", "first")
).reset_index()
customer_profiles

Unnamed: 0,CustomerID,total_spent,avg_spent,total_transactions,preferred_category,region
0,C0001,3354.52,278.334000,5,Electronics,South America
1,C0002,1862.74,208.920000,4,Clothing,Asia
2,C0003,2725.38,195.707500,4,Home Decor,South America
3,C0004,5354.88,240.636250,8,Books,South America
4,C0005,2034.24,291.603333,3,Electronics,Asia
...,...,...,...,...,...,...
194,C0196,4982.88,416.992500,4,Home Decor,Europe
195,C0197,1928.65,227.056667,3,Electronics,Europe
196,C0198,931.83,239.705000,2,Clothing,Europe
197,C0199,1979.28,250.610000,4,Electronics,Europe


### Encoding Categorical Features in Customer Profiles

In [4]:
customer_profiles = pd.get_dummies(customer_profiles, columns=["preferred_category", "region"], drop_first=True)
customer_profiles

Unnamed: 0,CustomerID,total_spent,avg_spent,total_transactions,preferred_category_Clothing,preferred_category_Electronics,preferred_category_Home Decor,region_Europe,region_North America,region_South America
0,C0001,3354.52,278.334000,5,False,True,False,False,False,True
1,C0002,1862.74,208.920000,4,True,False,False,False,False,False
2,C0003,2725.38,195.707500,4,False,False,True,False,False,True
3,C0004,5354.88,240.636250,8,False,False,False,False,False,True
4,C0005,2034.24,291.603333,3,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,416.992500,4,False,False,True,True,False,False
195,C0197,1928.65,227.056667,3,False,True,False,True,False,False
196,C0198,931.83,239.705000,2,True,False,False,True,False,False
197,C0199,1979.28,250.610000,4,False,True,False,True,False,False


### Normalizing data

In [5]:
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_profiles.drop("CustomerID", axis=1))
# normalized_features
similarity_matrix = cosine_similarity(normalized_features)
similarity_matrix

array([[ 1.        , -0.30649831,  0.15544862, ..., -0.37235445,
         0.29656692, -0.33432379],
       [-0.30649831,  1.        , -0.06499741, ...,  0.66787063,
        -0.10422184,  0.65882546],
       [ 0.15544862, -0.06499741,  1.        , ..., -0.16580598,
        -0.28750152, -0.38596243],
       ...,
       [-0.37235445,  0.66787063, -0.16580598, ...,  1.        ,
         0.38550092,  0.31606985],
       [ 0.29656692, -0.10422184, -0.28750152, ...,  0.38550092,
         1.        , -0.38385404],
       [-0.33432379,  0.65882546, -0.38596243, ...,  0.31606985,
        -0.38385404,  1.        ]])

### Creating Lookalikes

In [6]:
lookalikes = {}
for idx, cust_id in enumerate(customer_profiles["CustomerID"][:20]):
    scores = list(enumerate(similarity_matrix[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:4]  # Top 3 excluding self
    lookalikes[cust_id] = [(customer_profiles["CustomerID"].iloc[i], round(score, 4)) for i, score in scores]
lookalikes

{'C0001': [('C0181', 0.983), ('C0192', 0.9535), ('C0190', 0.9504)],
 'C0002': [('C0088', 0.9764), ('C0134', 0.9464), ('C0106', 0.9464)],
 'C0003': [('C0025', 0.9804), ('C0031', 0.9734), ('C0052', 0.9688)],
 'C0004': [('C0165', 0.9709), ('C0153', 0.9214), ('C0087', 0.9111)],
 'C0005': [('C0186', 0.9885), ('C0140', 0.988), ('C0146', 0.9451)],
 'C0006': [('C0171', 0.9532), ('C0011', 0.939), ('C0168', 0.9379)],
 'C0007': [('C0146', 0.9849), ('C0115', 0.979), ('C0186', 0.9724)],
 'C0008': [('C0065', 0.8921), ('C0059', 0.786), ('C0160', 0.7698)],
 'C0009': [('C0061', 0.9791), ('C0198', 0.9594), ('C0062', 0.924)],
 'C0010': [('C0111', 0.9842), ('C0062', 0.9558), ('C0198', 0.8907)],
 'C0011': [('C0137', 0.9936), ('C0191', 0.9774), ('C0118', 0.9628)],
 'C0012': [('C0163', 0.9778), ('C0113', 0.9615), ('C0195', 0.9478)],
 'C0013': [('C0099', 0.9882), ('C0108', 0.941), ('C0107', 0.8653)],
 'C0014': [('C0060', 0.9527), ('C0166', 0.8594), ('C0097', 0.8351)],
 'C0015': [('C0131', 0.9703), ('C0185', 0

### Saving Lookalike.csv file

In [9]:
lookalike_list = []
for cust_id, similar_custs in lookalikes.items():
    lookalike_list.append({
        "cust_id": cust_id,
        "lookalikes": str(similar_custs)
    })

lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv("Vashist_Tiwari_Lookalike.csv", index=False)
print("Lookalike.csv has been created successfully!")
# lookalike_list

Lookalike.csv has been created successfully!


### Displaying Some Results

In [8]:
# Display lookalikes for the first few customers
for cust_id, matches in list(lookalikes.items())[:5]:
    print(f"Customer {cust_id}:")
    for similar_cust_id, score in matches:
        print(f"  - Similar Customer: {similar_cust_id}, Score: {score}")

Customer C0001:
  - Similar Customer: C0181, Score: 0.983
  - Similar Customer: C0192, Score: 0.9535
  - Similar Customer: C0190, Score: 0.9504
Customer C0002:
  - Similar Customer: C0088, Score: 0.9764
  - Similar Customer: C0134, Score: 0.9464
  - Similar Customer: C0106, Score: 0.9464
Customer C0003:
  - Similar Customer: C0025, Score: 0.9804
  - Similar Customer: C0031, Score: 0.9734
  - Similar Customer: C0052, Score: 0.9688
Customer C0004:
  - Similar Customer: C0165, Score: 0.9709
  - Similar Customer: C0153, Score: 0.9214
  - Similar Customer: C0087, Score: 0.9111
Customer C0005:
  - Similar Customer: C0186, Score: 0.9885
  - Similar Customer: C0140, Score: 0.988
  - Similar Customer: C0146, Score: 0.9451
