In [None]:
# Install the third-party libraries imported by this notebook via the %pip magic.
# NOTE(review): no versions are pinned, so the installed releases depend on when this runs — consider pinning.
%pip install pandas numpy matplotlib seaborn

In [None]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read the three source tables from CSV files located in the working directory
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Preview the first rows of every table, each under its own heading
print("Customers Dataset:", customers.head(), sep="\n")

print("\nProducts Dataset:", products.head(), sep="\n")

print("\nTransactions Dataset:", transactions.head(), sep="\n")

In [None]:
# Check dataset information.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after every report.
print("Customers Info:")
customers.info()

print("\nProducts Info:")
products.info()

print("\nTransactions Info:")
transactions.info()

# Check for missing values (null count per column of each table).
# sep="" stops print() from inserting a space after the "\n", which would push
# the first row of each Series out of alignment with the remaining rows.
print("\nMissing Values:")
print("Customers:\n", customers.isnull().sum(), sep="")
print("\nProducts:\n", products.isnull().sum(), sep="")
print("\nTransactions:\n", transactions.isnull().sum(), sep="")

In [None]:
# Parse the date columns with pd.to_datetime, overwriting the original columns
customers['SignupDate'] = customers['SignupDate'].pipe(pd.to_datetime)
transactions['TransactionDate'] = transactions['TransactionDate'].pipe(pd.to_datetime)

# Show the dtypes of both frames so the conversion can be confirmed
print("\nData types after conversion:", customers.dtypes, transactions.dtypes, sep="\n")

# Report the number of fully duplicated rows in each table
print("\nDuplicates:")
print(f"Customers: {customers.duplicated().sum()}")
print(f"Products: {products.duplicated().sum()}")
print(f"Transactions: {transactions.duplicated().sum()}")

In [None]:
# Merge datasets for comprehensive analysis.
# Every transaction row is kept (how='left') and enriched with the matching
# customer and product columns. validate='many_to_one' makes pandas raise a
# MergeError if a lookup table contains a duplicated CustomerID / ProductID;
# without the check such duplicates would silently repeat transaction rows and
# inflate every total computed from merged_data.
# NOTE(review): if the tables share non-key column names, pandas will add
# _x / _y suffixes to them — confirm against the actual CSV schemas.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left', validate='many_to_one')
    .merge(products, on='ProductID', how='left', validate='many_to_one')
)

# Display the merged dataset
print("\nMerged Dataset:")
print(merged_data.head())

In [None]:
# Summary statistics of the merged table as produced by DataFrame.describe()
print("\nSummary Statistics:", merged_data.describe(), sep="\n")

# Number of distinct customer, product and transaction IDs in the merged table
print("\nUnique Counts:")
print(f"Unique Customers: {merged_data['CustomerID'].nunique()}")
print(f"Unique Products: {merged_data['ProductID'].nunique()}")
print(f"Total Transactions: {merged_data['TransactionID'].nunique()}")

In [None]:
# Visualization 1: Customer distribution by region
# Bars are ordered by value_counts(), i.e. the region with the most customers first
region_order = customers['Region'].value_counts().index
plt.figure(figsize=(8, 5))
region_ax = sns.countplot(x='Region', data=customers, order=region_order)
region_ax.set_title('Customer Distribution by Region')
region_ax.set_xlabel('Region')
region_ax.set_ylabel('Count')
plt.show()

# Visualization 2: Most popular product categories
# Popularity is counted on merged_data, whose rows come from the transactions
# table, so each bar is the number of transactions in that category. Counting
# rows of `products` instead would only show how many catalogue items each
# category contains, which says nothing about demand.
plt.figure(figsize=(8, 5))
sns.countplot(data=merged_data, y='Category', order=merged_data['Category'].value_counts().index)
plt.title('Most Popular Product Categories')
plt.xlabel('Number of Transactions')
plt.ylabel('Category')
plt.show()

# Visualization 3: Sales trend over time
# Label every row with the monthly period of its transaction date (adds a MonthYear column)
merged_data['MonthYear'] = merged_data['TransactionDate'].dt.to_period('M')
# Sum TotalValue within each month and draw it as a line with point markers
sales_trend = merged_data.groupby('MonthYear')['TotalValue'].sum()
trend_ax = sales_trend.plot.line(figsize=(10, 5), marker='o', title='Sales Trend Over Time')
trend_ax.set_xlabel('Month-Year')
trend_ax.set_ylabel('Total Sales (USD)')
trend_ax.grid()
plt.show()

# Visualization 4: Top 10 customers by spending
# Aggregate on CustomerID (the merge key) together with CustomerName: names are
# not guaranteed to be unique, and grouping on the name alone would add up the
# spending of different customers who happen to share one. The ID level is
# dropped afterwards so top_customers is still a Series indexed by CustomerName.
top_customers = (
    merged_data
    .groupby(['CustomerID', 'CustomerName'])['TotalValue']
    .sum()
    .nlargest(10)
    .droplevel('CustomerID')
)
top_customers.plot(kind='barh', figsize=(8, 5), title='Top 10 Customers by Spending')
plt.xlabel('Total Spending (USD)')
plt.ylabel('Customer Name')
plt.show()

# Visualization 5: Top 10 products by sales
# Sum TotalValue per product name and keep the ten largest totals
top_products = (
    merged_data
    .groupby('ProductName')['TotalValue']
    .sum()
    .nlargest(10)
)
products_ax = top_products.plot.barh(figsize=(8, 5), title='Top 10 Products by Sales')
products_ax.set_xlabel('Total Sales (USD)')
products_ax.set_ylabel('Product Name')
plt.show()