In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Set a professional and clean aesthetic style for all plots
sns.set_style("whitegrid")
plt.style.use("seaborn-v0_8-whitegrid")


# Load the dataset from the CSV file. The file is assumed to be in the same directory.
try:
    df = pd.read_csv('retail_data.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'retail_data.csv' not found. Please ensure the file is in the correct directory.")
    exit()


# Get a summary of the DataFrame's structure and data types
print("### Initial Dataset Information:")
df.info()


# Display descriptive statistics for numerical columns
print("\n### Initial Descriptive Statistics:")
print(df.describe())


# Check for missing values across all columns
print("\n### Missing Values Summary (Initial):")
print(df.isnull().sum())


# Drop rows with missing critical data (e.g., Customer_ID, Name, financial data, ratings) to ensure integrity
critical_columns = ['Customer_ID', 'Name', 'Amount', 'Total_Amount', 'Ratings']
df.dropna(subset=critical_columns, inplace=True)
print(f"DataFrame size after dropping rows with missing critical data: {df.shape}")


# Convert 'Date' column to a proper datetime object, handling mixed formats
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', format='%m/%d/%Y').fillna(
    pd.to_datetime(df['Date'], errors='coerce', format='%d-%m-%y'))


# Drop any rows where date conversion still failed
df.dropna(subset=['Date'], inplace=True)


# Convert financial and rating columns to numeric types
df['Amount'] = pd.to_numeric(df['Amount'])
df['Total_Amount'] = pd.to_numeric(df['Total_Amount'])
df['Ratings'] = pd.to_numeric(df['Ratings']).astype('Int64')


# Create a new feature for month name for clearer temporal analysis
df['month_name'] = df['Date'].dt.month_name()


# Final check to confirm cleaning was successful
print("\nFinal Cleaned Dataset Information:")
df.info()


# Chart 1: Distribution of Total Transaction Amount
plt.figure(figsize=(12, 6))
sns.histplot(df['Total_Amount'], bins=50, kde=True, color='skyblue')
plt.title('Distribution of Total Transaction Amount')
plt.xlabel('Total Amount (USD)')
plt.ylabel('Frequency')
plt.show()


# Chart 2: Distribution of Product Categories
plt.figure(figsize=(12, 8))
sns.countplot(data=df, y='Product_Category', order=df['Product_Category'].value_counts().index, palette='viridis')
plt.title('Number of Transactions by Product Category')
plt.xlabel('Number of Transactions')
plt.ylabel('Product Category')
plt.show()


# Chart 3: Distribution of Customer Ratings
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='Ratings', order=df['Ratings'].value_counts().index, palette='YlGnBu')
plt.title('Distribution of Customer Ratings')
plt.xlabel('Rating (1-5)')
plt.ylabel('Count')
plt.show()


from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer


# Chart 4 (Revised): Customer Cluster Analysis using PCA
# This visualization identifies and plots customer clusters based on key numerical features.


# 1. Select and prepare the numerical data for clustering, handling missing values.
numerical_features = ['Age', 'Total_Purchases', 'Amount', 'Total_Amount', 'Ratings']
X = df[numerical_features].copy() # Create a copy to avoid SettingWithCopyWarning


# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)


# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)




# 2. Apply KMeans clustering to find natural groups. We'll choose 4 clusters for a clear visualization.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_scaled)


# 3. Use PCA to reduce the data to 2 dimensions for plotting.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Cluster'] = df['Cluster'].values


# 4. Plot the clusters using the principal components.
plt.figure(figsize=(14, 10))
sns.scatterplot(
    data=pca_df,
    x='Principal Component 1',
    y='Principal Component 2',
    hue='Cluster',
    palette='viridis',
    s=50,
    alpha=0.7
)
plt.title('Customer Clusters (PCA)', fontsize=18)
plt.xlabel('Principal Component 1', fontsize=12)
plt.ylabel('Principal Component 2', fontsize=12)
plt.legend(title='Cluster', loc='upper right', bbox_to_anchor=(1.15, 1))
plt.tight_layout()
plt.show()


# Print the descriptive statistics for each cluster to understand their characteristics
print("\n### Cluster Analysis - Descriptive Statistics by Cluster:")
print(df.groupby('Cluster')[numerical_features].mean())




# Chart 5: Total Amount Distribution by Customer Segment
plt.figure(figsize=(12, 8))
sns.boxplot(data=df, x='Customer_Segment', y='Total_Amount', palette='coolwarm')
# Overlay a stripplot to show the distribution of individual data points
sns.stripplot(data=df, x='Customer_Segment', y='Total_Amount', color='0.3', jitter=0.2, size=3, alpha=0.5)
plt.title('Total Amount Distribution by Customer Segment')
plt.xlabel('Customer Segment')
plt.ylabel('Total Amount (USD)')
plt.show()


# Chart 6: Number of Transactions by Product Category and Customer Segment
category_segment_pivot = df.pivot_table(index='Product_Category',
                                        columns='Customer_Segment',
                                        values='Total_Amount',
                                        aggfunc='count',
                                        fill_value=0)
plt.figure(figsize=(14, 10))
sns.heatmap(category_segment_pivot, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Number of Transactions by Product Category and Customer Segment')
plt.xlabel('Customer Segment')
plt.ylabel('Product Category')
plt.show()


# Get descriptive statistics for all numerical columns on the cleaned data
print("### Descriptive Statistics of Cleaned Data:")
print(df.describe())


# Correlation matrix for numerical variables
numerical_cols = ['Transaction_ID', 'Customer_ID', 'Phone', 'Zipcode', 'Age', 'Year', 'Total_Purchases', 'Amount', 'Total_Amount', 'Ratings']
correlation_matrix = df[numerical_cols].corr()
print("\n### Correlation Matrix:")
print(correlation_matrix)


# Chart 7: Temporal Analysis of Monthly Sales Trends
monthly_sales = df.groupby(df['Date'].dt.to_period('M'))['Total_Amount'].sum().to_frame()
monthly_sales.index = monthly_sales.index.strftime('%Y-%m')
plt.figure(figsize=(14, 7))
sns.lineplot(data=monthly_sales, x=monthly_sales.index, y='Total_Amount', marker='o', color='purple')
plt.title('Total Sales by Month')
plt.xlabel('Month')
plt.ylabel('Total Sales (USD)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


Error: 'retail_data.csv' not found. Please ensure the file is in the correct directory.
### Initial Dataset Information:


NameError: name 'df' is not defined