In [78]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [80]:
products.head(3)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12


In [81]:
# merging products onto transactions
transactions = transactions.merge(products, on="ProductID", how="left")

In [82]:
# getting all we need
data = customers.merge(transactions,on = 'CustomerID',how='left')

In [83]:
data["TransactionDate"] = pd.to_datetime(data["TransactionDate"])

In [84]:
data.head(3)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2.0,114.6,57.3,SoundWave Cookbook,Books,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3.0,412.62,137.54,HomeSense Wall Art,Home Decor,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2.0,614.94,307.47,SoundWave Headphones,Electronics,307.47


# Creating Feature Matrix

In [96]:
customer_features = data.groupby("CustomerID").agg(
    # Money Wise 
    total_transactions=("TransactionID", "count"),
    total_spent=("TotalValue", "sum"),
    avg_spent=("TotalValue", "mean"),
    # Region Wise
    region=("Region", "first"),
    # Product and Category Wise (most imp)
    unique_products=("ProductID", lambda x: list(set(x))), 
    categories_purchased=("Category", lambda x: list(set(x))),
    total_quantity=("Quantity", "sum"), 
    # Date wise
    most_common_day=("TransactionDate", lambda x: x.dt.weekday.mode()[0] if not x.dt.weekday.mode().empty else None),
    most_common_month=("TransactionDate", lambda x: x.dt.month.mode()[0] if not x.dt.month.mode().empty else None), 
    most_common_hour=("TransactionDate", lambda x: x.dt.hour.mode()[0] if hasattr(x.dt, "hour") and not x.dt.hour.mode().empty else None)  # Most common hour
).reset_index()


In [97]:
customer_features.head(3)

Unnamed: 0,CustomerID,total_transactions,total_spent,avg_spent,region,unique_products,categories_purchased,total_quantity,most_common_day,most_common_month,most_common_hour
0,C0001,5,3354.52,670.904,South America,"[P022, P083, P096, P029, P054]","[Electronics, Books, Home Decor]",12.0,1.0,1.0,3.0
1,C0002,4,1862.74,465.685,Asia,"[P019, P071, P095, P004]","[Home Decor, Clothing]",10.0,1.0,2.0,1.0
2,C0003,4,2725.38,681.345,South America,"[P025, P002, P006, P035]","[Electronics, Home Decor, Clothing]",14.0,6.0,6.0,2.0


In [98]:
numerical_features = [
    "total_transactions", "total_spent", "avg_spent", "total_quantity", 
    "most_common_day", "most_common_month", "most_common_hour"
]

# Normalizing numerical features
scaler = StandardScaler()
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])


In [99]:
# using tfidf for encoding category and product names

# converting lists to strings
customer_features['products_str'] = customer_features['unique_products'].apply(lambda x: ' '.join(map(str, x)) if isinstance(x, list) else '')
customer_features['categories_str'] = customer_features['categories_purchased'].apply(lambda x: ' '.join(map(str, x)) if isinstance(x, list) else '')

# seperate tfidf instance for category and product
vectorizer_products = TfidfVectorizer()
tfidf_matrix_products = vectorizer_products.fit_transform(customer_features['products_str'])
vectorizer_categories = TfidfVectorizer()
tfidf_matrix_categories = vectorizer_categories.fit_transform(customer_features['categories_str'])

# matrices to dataframes
tfidf_df_products = pd.DataFrame(tfidf_matrix_products.toarray(), columns=vectorizer_products.get_feature_names_out())
tfidf_df_categories = pd.DataFrame(tfidf_matrix_categories.toarray(), columns=vectorizer_categories.get_feature_names_out())

# final feature vector
customer_features = pd.concat([customer_features, tfidf_df_products, tfidf_df_categories], axis=1)


In [100]:
# encoding region
label_encoder = LabelEncoder()
customer_features['region_encoded'] = label_encoder.fit_transform(customer_features['region'])

In [None]:
# dropping columns which are now not necessary
final_features = customer_features.drop(['region','unique_products','categories_purchased','products_str','categories_str'],axis=1)

In [104]:
final_features.head(5)

Unnamed: 0,CustomerID,total_transactions,total_spent,avg_spent,total_quantity,most_common_day,most_common_month,most_common_hour,nan,p001,...,p098,p099,p100,books,clothing,decor,electronics,home,nan.1,region_encoded
0,C0001,0.0,-0.051884,-0.070263,-0.110735,-0.649939,-1.052254,-0.653061,0.0,0.0,...,0.0,0.0,0.0,0.470528,0.0,0.511218,0.505882,0.511218,0.0,3
1,C0002,-0.451294,-0.862714,-0.934933,-0.434049,-0.649939,-0.724557,-0.968496,0.0,0.0,...,0.0,0.0,0.0,0.0,0.583437,0.574283,0.0,0.574283,0.0,0
2,C0003,-0.451294,-0.393842,-0.026271,0.212579,1.978879,0.586232,-0.810779,0.0,0.0,...,0.0,0.0,0.0,0.0,0.50725,0.499291,0.49408,0.499291,0.0,3
3,C0004,1.353881,1.035375,-0.076769,1.667493,-1.175703,2.552417,0.450961,0.0,0.0,...,0.0,0.0,0.0,0.470528,0.0,0.511218,0.505882,0.511218,0.0,3
4,C0005,-0.902587,-0.769499,-0.040028,-0.919021,-1.175703,-0.39686,-1.126214,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.579359,0.573312,0.579359,0.0,0


# Explanation:

1. First all required fields taken into customers dataframe from other products and transactions which contain information we can use to find similar users, this is done using simple group by function
2. Next we want to create fetaures that we can use
3. First feaatures I created were dependent on Actual Transaction Money and total and average, this gives us money wise features
4. They are numerical featires so no need to encode
5. Next I used Regions to get region wise features. They are encoded using label encoder in previous cell.
6. Next I used Category of products and actal products. Firat they were converted to list then string and then in vectors using TfIDF vectorizer
7. Next I used transaction date to extract month,day,weekday and hour.
8. At last I normalized all numerical columns except tfidf features as they are important as it is
9. Last we get a feature matrix

# Using Cosine Similarity for LookAlike

1. Cosing Similarity is powerful way of getting similar recommendatins
2. It works by 

In [108]:
features = final_features.drop(['CustomerID'],axis = 1)

In [114]:
features.fillna(0,inplace=True)

In [116]:
similarity_matrix = cosine_similarity(features)
# Done!

In [118]:
# back to df
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

In [123]:
# Function takes id and returns top 3 recommendations
def get_similar_customers(customer_id, top_n=3):
    similar_customers = similarity_df[customer_id].drop(customer_id).sort_values(ascending=False).head(top_n)
    return similar_customers

In [124]:
customer_id = 'C0001'
top_similar = get_similar_customers(customer_id, top_n=5)
print(top_similar)

CustomerID
C0052    0.867984
C0126    0.858198
C0191    0.856846
C0055    0.855796
C0076    0.847924
Name: C0001, dtype: float64


## DOneeee!

# Now Creating LookAlike.csv

In [132]:
ids_to_generate = customers.iloc[:20].CustomerID.values # taking 1st 20 ids

In [133]:
lookalike_dict = {}

In [134]:
for cust_id in ids_to_generate:
    similar_customers = similarity_df[cust_id].sort_values(ascending=False)[1:4]  # Exclude itself
    lookalike_dict[cust_id] = list(zip(similar_customers.index, similar_customers.values))

In [138]:
lookalike_dict

{'C0001': [('C0052', 0.8679842282479358),
  ('C0126', 0.8581977507838431),
  ('C0191', 0.8568463555004938)],
 'C0002': [('C0043', 0.7676650136553883),
  ('C0159', 0.7132664624125221),
  ('C0128', 0.6913167924440262)],
 'C0003': [('C0190', 0.9054365022260268),
  ('C0031', 0.8935844180323499),
  ('C0181', 0.8760160505988458)],
 'C0004': [('C0102', 0.8732227081784946),
  ('C0104', 0.8689311371066696),
  ('C0155', 0.8353139514805824)],
 'C0005': [('C0159', 0.7723744801682234),
  ('C0177', 0.691833112910386),
  ('C0043', 0.6783410676220802)],
 'C0006': [('C0148', 0.8525331055354749),
  ('C0079', 0.830233642262665),
  ('C0163', 0.8176917742644488)],
 'C0007': [('C0080', 0.7396840079118119),
  ('C0110', 0.7220232473165386),
  ('C0089', 0.6994023910034383)],
 'C0008': [('C0147', 0.8473193946335654),
  ('C0098', 0.8315747413371052),
  ('C0049', 0.820582671415536)],
 'C0009': [('C0132', 0.797631569387823),
  ('C0198', 0.7934797057139923),
  ('C0199', 0.7489048150310311)],
 'C0010': [('C0025', 0.

In [139]:
lookalike_df = pd.DataFrame(lookalike_dict.items(), columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv("Lookalike.csv", index=False)

In [140]:
# Checking answer
df = pd.read_csv('Lookalike.csv')
df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0052', 0.8679842282479358), ('C0126', 0.85..."
1,C0002,"[('C0043', 0.7676650136553883), ('C0159', 0.71..."
2,C0003,"[('C0190', 0.9054365022260268), ('C0031', 0.89..."
3,C0004,"[('C0102', 0.8732227081784946), ('C0104', 0.86..."
4,C0005,"[('C0159', 0.7723744801682234), ('C0177', 0.69..."
