In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Read the three source tables (customers, products, transactions) from the
# CSV files in the current working directory, in that order.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Convert the date columns to pandas datetime values.
customers_df["SignupDate"] = customers_df["SignupDate"].pipe(pd.to_datetime)
transactions_df["TransactionDate"] = transactions_df["TransactionDate"].pipe(
    pd.to_datetime
)

# Report the number of missing values per column for each table.
# Each heading and its table are printed on separate lines via sep="\n".
print("Missing Values in Customers.csv:", customers_df.isna().sum(), sep="\n")
print("\nMissing Values in Products.csv:", products_df.isna().sum(), sep="\n")
print("\nMissing Values in Transactions.csv:", transactions_df.isna().sum(), sep="\n")

# Descriptive statistics for the transactions table.
print("\nTransaction Data Summary:", transactions_df.describe(), sep="\n")

# Bar chart: how many customers fall into each value of the "Region" column.
region_counts = customers_df["Region"].value_counts()

plt.figure(figsize=(8, 5))
region_counts.plot.bar(color="blue", alpha=0.7)
plt.gca().set(
    title="Number of Customers by Region",
    xlabel="Region",
    ylabel="Number of Customers",
)
plt.xticks(rotation=45)
# Dashed horizontal grid lines only.
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Bar chart: the ten CustomerID values that occur most often in the
# transactions table (value_counts() is already sorted in descending order).
top_customers = transactions_df["CustomerID"].value_counts().head(10)

plt.figure(figsize=(8, 5))
top_customers.plot.bar(color="skyblue")
plt.gca().set(
    title="Top 10 Customers by Transaction Count",
    xlabel="Customer ID",
    ylabel="Number of Transactions",
)
plt.xticks(rotation=45)
# Dashed horizontal grid lines only.
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Distribution of Transaction Values
# Histogram of the "TotalValue" column of the transactions table, in 30 bins.
plt.figure(figsize=(8,5))
# Series.hist() turns on a full grid by default (grid=True). Disable it here so
# that only the dashed horizontal grid configured below is drawn, consistent
# with the bar charts in this script.
transactions_df["TotalValue"].hist(bins=30, color='orange', alpha=0.7, grid=False)
plt.title("Distribution of Total Transaction Values")
plt.xlabel("Transaction Value (USD)")
plt.ylabel("Frequency")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Merge transactions with products data for category analysis.
# The left join keeps every transaction row, even when its ProductID has no
# match in the products table. validate="many_to_one" makes pandas raise a
# MergeError if ProductID is duplicated in products_df, instead of silently
# duplicating transaction rows and inflating any counts or sums computed from
# merged_df.
merged_df = transactions_df.merge(
    products_df, on="ProductID", how="left", validate="many_to_one"
)

# Bar chart: number of merged transaction rows per value of "Category".
category_sales = merged_df["Category"].value_counts()

plt.figure(figsize=(8, 5))
category_sales.plot.bar(color="purple", alpha=0.7)
plt.gca().set(
    title="Most Sold Product Categories",
    xlabel="Product Category",
    ylabel="Number of Transactions",
)
plt.xticks(rotation=45)
# Dashed horizontal grid lines only.
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Bar chart: the ten product names with the largest summed "Quantity".
top_products = (
    merged_df
    .groupby("ProductName")["Quantity"]
    .sum()
    .nlargest(10)
)

plt.figure(figsize=(8, 5))
top_products.plot.bar(color="green", alpha=0.7)
plt.gca().set(
    title="Top 10 Best-Selling Products",
    xlabel="Product Name",
    ylabel="Total Quantity Sold",
)
# Right-align the rotated tick labels.
plt.xticks(rotation=45, ha="right")
# Dashed horizontal grid lines only.
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
