In [None]:
import pandas as pd

In [None]:
# Load the datasets
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions=pd.read_csv('/content/Transactions.csv')


In [None]:
# Display the first five rows of each dataset
print("Customers Data:")
print(customers.head())

print("\nProducts Data:")
print(products.head())

print("\nProducts Data:")
print(transactions.head())

Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Products Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127     

In [None]:
# Display the last five rows of each dataset
print("Customers Data:")
print(customers.tail())

print("\nProducts Data:")
print(products.tail())

print("\nProducts Data:")
print(transactions.tail())



Customers Data:
    CustomerID      CustomerName  Region  SignupDate
195      C0196       Laura Watts  Europe  2022-06-07
196      C0197  Christina Harvey  Europe  2023-03-21
197      C0198       Rebecca Ray  Europe  2022-02-27
198      C0199    Andrea Jenkins  Europe  2022-12-03
199      C0200       Kelly Cross    Asia  2023-06-11

Products Data:
   ProductID             ProductName     Category   Price
95      P096    SoundWave Headphones  Electronics  307.47
96      P097      BookWorld Cookbook        Books  319.34
97      P098        SoundWave Laptop  Electronics  299.93
98      P099  SoundWave Mystery Book        Books  354.29
99      P100       HomeSense Sweater     Clothing  126.34

Products Data:
    TransactionID CustomerID ProductID      TransactionDate  Quantity  \
995        T00496      C0118      P037  2024-10-24 08:30:27         1   
996        T00759      C0059      P037  2024-06-04 02:15:24         3   
997        T00922      C0018      P037  2024-04-05 13:05:32        

In [None]:
# Display the information of each dataset
print("Customers Data:")
print(customers.info())

print("\nProducts Data:")
print(products.info())

print("\nProducts Data:")
print(transactions.info())



Customers Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None

Products Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
None

Products Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Cou

In [None]:
# Check for missing values
print("\nMissing Values in Customers:")
print(customers.isnull().sum())

print("\nMissing Values in Products:")
print(products.isnull().sum())

print("\nMissing Values in Transactions:")
print(transactions.isnull().sum())




Missing Values in Customers:
CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64

Missing Values in Products:
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64

Missing Values in Transactions:
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from datetime import datetime

In [None]:
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customers['DaysSinceSignup'] = (datetime.now() - customers['SignupDate']).dt.days


In [None]:
# Encode 'Region' using OneHotEncoder
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(customers[['Region']])


In [None]:
# Feature Engineering: Combine customer profile and product purchases (dummy example)
customer_features = pd.DataFrame(region_encoded.toarray(), columns=encoder.categories_[0])
customer_features['DaysSinceSignup'] = customers['DaysSinceSignup']

In [None]:
# Normalize prices
scaler = StandardScaler()
products['Price'] = scaler.fit_transform(products[['Price']])

In [None]:
# Calculate similarity between customers
customer_feature_matrix = customer_features.values  # Just an example
similarity_matrix = cosine_similarity(customer_feature_matrix)

In [None]:
import numpy as np

In [None]:
# Top 3 recommendations based on similarity
top_3_customers = {}
for i in range(len(customers)):
    sim_scores = similarity_matrix[i]
    top_3_indices = np.argsort(sim_scores)[-4:-1]  # Get the indices of top 3 similar customers
    top_3_scores = sim_scores[top_3_indices]

    top_3_customers[customers['CustomerID'][i]] = list(zip(customers['CustomerID'][top_3_indices], top_3_scores))


In [None]:
# Create the Lookalike.csv file
lookalike_df = pd.DataFrame(top_3_customers.items(), columns=['cust_id', 'lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)

# Print the top 3 lookalikes for all customers
for cust_id, lookalikes in top_3_customers.items():
    print(f"Customer {cust_id} -> Top 3 Lookalikes: {lookalikes}")

Customer C0001 -> Top 3 Lookalikes: [('C0071', 0.9999999999468924), ('C0025', 0.9999999999762451), ('C0112', 0.999999999994023)]
Customer C0002 -> Top 3 Lookalikes: [('C0040', 0.9999999998125529), ('C0045', 0.9999999999452731), ('C0134', 0.999999999986471)]
Customer C0003 -> Top 3 Lookalikes: [('C0076', 0.999999975559986), ('C0126', 0.9999999894523861), ('C0052', 0.9999999894523861)]
Customer C0004 -> Top 3 Lookalikes: [('C0192', 0.9999999994506541), ('C0102', 0.999999999494308), ('C0108', 0.9999999998810423)]
Customer C0005 -> Top 3 Lookalikes: [('C0007', 0.9999999976779262), ('C0106', 0.9999999987671407), ('C0159', 0.9999999999378554)]
Customer C0006 -> Top 3 Lookalikes: [('C0126', 0.9999999432130708), ('C0181', 0.9999999517679141), ('C0076', 0.9999999658974945)]
Customer C0007 -> Top 3 Lookalikes: [('C0005', 0.9999999976779262), ('C0175', 0.9999999978745827), ('C0159', 0.9999999983755298)]
Customer C0008 -> Top 3 Lookalikes: [('C0183', 0.9999999974160476), ('C0016', 0.99999999767556