# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import davies_bouldin_score

# Load the three source tables (customers, products, transactions) from
# their hosted CSV files, in that order.
customers, products, transactions = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE',
        'https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0',
        'https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF',
    )
)
# Basic exploration
# DataFrame.info() writes its summary (columns, dtypes, non-null counts)
# straight to stdout and returns None, so it is called directly; wrapping
# it in print() would emit a spurious "None" line after each report.
customers.info()
products.info()
transactions.info()

# Build the analysis table: attach customer columns to each transaction via
# CustomerID, then product columns via ProductID (pandas' default inner join).
data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

# Example EDA visualizations
# 1. Customers by Region: one bar per Region value, bar height = number of
#    rows in the customers table.
sns.countplot(data=customers, x='Region').set_title('Customers by Region')
# Tilt the tick labels so they stay readable.
plt.xticks(rotation=45)
plt.show()

# 2. Revenue by Product Category: sum TotalValue within each Category and
#    order the categories from highest to lowest total.
category_revenue = (
    data
    .groupby('Category')['TotalValue']
    .sum()
    .sort_values(ascending=False)
)
category_revenue.plot.bar(color='teal')
plt.title('Revenue by Product Category')
plt.show()

# 3. Signup trends over time
# Parse SignupDate into datetimes (overwriting the column in place) and
# derive a monthly period column used as the grouping key.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customers['SignupYearMonth'] = customers['SignupDate'].dt.to_period(freq='M')

# Signups per month, ordered chronologically by period.
signup_trend = (
    customers['SignupYearMonth']
    .value_counts()
    .sort_index()
)
signup_trend.plot.line(marker='o', color='orange')
plt.title('Signup Trends Over Time')
plt.show()

# 4. High-value customers (top 10% of spenders)
# Total TotalValue per CustomerID across the merged transaction table.
customer_spend = data.groupby('CustomerID')['TotalValue'].sum()

# Size the top decile from the customers that actually appear in `data`,
# not from len(customers): customers without any transaction are not
# spenders and would otherwise inflate the cut-off. Integer division avoids
# float rounding, and max(1, ...) keeps the selection non-empty when there
# are fewer than ten spenders.
top_n = max(1, len(customer_spend) // 10)
top_10_percent_spenders = customer_spend.nlargest(top_n)
print(f"Top 10% spenders contribute {top_10_percent_spenders.sum()} to revenue.")

# 5. Most purchased products
# Rank product names by how many rows of the merged table they appear in
# and keep the first ten.
# NOTE(review): this counts rows, not units sold — confirm whether a
# quantity column should be summed instead.
popular_products = data['ProductName'].value_counts().iloc[:10]
print("Top 10 Most Purchased Products:", popular_products, sep="\n")
