<a href="https://colab.research.google.com/github/SakethGunda9603/DataScience-Assignment/blob/main/Saketh_LookaLike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Corrected links for direct download
customers_url = 'https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE&export=download'
products_url = 'https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0&export=download'
transactions_url = 'https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF&export=download'

# Load datasets
customers = pd.read_csv(customers_url)
products = pd.read_csv(products_url)
transactions = pd.read_csv(transactions_url)

# Preview data
print(customers.head())
print(products.head())
print(transactions.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [2]:
# Merge transactions with customer and product information
transactions_full = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

# Fill missing values (if any)
transactions_full.fillna(0, inplace=True)

# Example of a derived feature: Total spend per customer
customer_spend = transactions_full.groupby("CustomerID")["TotalValue"].sum().reset_index()
customer_spend.columns = ["CustomerID", "TotalSpend"]

# Merge total spend into customer data
customers = customers.merge(customer_spend, on="CustomerID", how="left").fillna(0)

# Display the updated customer dataset
print(customers.head())


  CustomerID        CustomerName         Region  SignupDate  TotalSpend
0      C0001    Lawrence Carroll  South America  2022-07-10     3354.52
1      C0002      Elizabeth Lutz           Asia  2022-02-13     1862.74
2      C0003      Michael Rivera  South America  2024-03-07     2725.38
3      C0004  Kathleen Rodriguez  South America  2022-10-09     5354.88
4      C0005         Laura Weber           Asia  2022-08-15     2034.24


In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Normalize numerical features
scaler = StandardScaler()
customers["TotalSpend"] = scaler.fit_transform(customers[["TotalSpend"]])

# Encode categorical features
encoder = LabelEncoder()
customers["Region"] = encoder.fit_transform(customers["Region"])

# Combine features for similarity calculation
features = ["Region", "TotalSpend"]
customer_features = customers[features]


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity scores
similarity_matrix = cosine_similarity(customer_features)

# Create a DataFrame to store similarity scores
similarity_df = pd.DataFrame(similarity_matrix, index=customers["CustomerID"], columns=customers["CustomerID"])


In [7]:
# Function to get top 3 similar customers
def get_top_3_similar(customers_df, similarity_df, customer_id):
    similar_scores = similarity_df.loc[customer_id].sort_values(ascending=False)
    top_3 = similar_scores.iloc[1:4].reset_index()  # Exclude self (index 0)
    top_3.columns = ["CustomerID", "SimilarityScore"]
    return top_3

# Generate recommendations for customers C0001 to C0020
lookalike_data = {}
for customer_id in customers["CustomerID"].iloc[:20]:
    lookalike_data[customer_id] = get_top_3_similar(customers, similarity_df, customer_id).values.tolist()

# Convert recommendations into a CSV
lookalike_df = pd.DataFrame([
    {"CustomerID": cust, "Lookalikes": lookalikes}
    for cust, lookalikes in lookalike_data.items()
])

lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv created.")


Lookalike.csv created.


In [8]:
import pandas as pd

# Load the Lookalike.csv file
lookalike_csv = pd.read_csv("Lookalike.csv")

# Display the first few rows of the file
print(lookalike_csv.head())

# Display the full DataFrame in case of many rows
lookalike_csv


  CustomerID                                         Lookalikes
0      C0001  [['C0181', 0.9999992145643578], ['C0137', 0.99...
1      C0002   [['C0123', 1.0], ['C0159', 1.0], ['C0142', 1.0]]
2      C0003  [['C0125', 0.9999982266715225], ['C0072', 0.99...
3      C0004  [['C0155', 0.9999931153987299], ['C0018', 0.99...
4      C0005   [['C0123', 1.0], ['C0159', 1.0], ['C0142', 1.0]]


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[['C0181', 0.9999992145643578], ['C0137', 0.99..."
1,C0002,"[['C0123', 1.0], ['C0159', 1.0], ['C0142', 1.0]]"
2,C0003,"[['C0125', 0.9999982266715225], ['C0072', 0.99..."
3,C0004,"[['C0155', 0.9999931153987299], ['C0018', 0.99..."
4,C0005,"[['C0123', 1.0], ['C0159', 1.0], ['C0142', 1.0]]"
5,C0006,"[['C0039', 0.9999977176874255], ['C0117', 0.99..."
6,C0007,"[['C0123', 1.0], ['C0159', 1.0], ['C0142', 1.0]]"
7,C0008,"[['C0079', 0.9999865002688586], ['C0153', 0.99..."
8,C0009,"[['C0198', 0.9999780711777825], ['C0121', 0.99..."
9,C0010,"[['C0033', 0.9997650035597762], ['C0062', 0.99..."
