<a href="https://colab.research.google.com/github/Via-01/tvastr_vaishnavi_submissions/blob/main/Dashboard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# please ensure orders_train.csv is present in file browser with the same name before running
df = pd.read_csv("orders_train.csv")
# The file marks missing entries with a literal '?'; turn those into NaN.
df = df.replace('?', np.nan)
df.shape

(193971, 14)

In [None]:
# Number of distinct customer IDs (missing IDs are not counted)
df['customerID'].nunique(dropna=True)

30914

In [None]:
# Print the (rows, columns) pair, then the per-column dtypes and non-null counts
print((len(df.index), len(df.columns)))
df.info()

(193971, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193971 entries, 0 to 193970
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   orderItemID     193971 non-null  int64  
 1   orderDate       193970 non-null  object 
 2   deliveryDate    175022 non-null  object 
 3   itemID          193970 non-null  float64
 4   size            193970 non-null  object 
 5   color           193911 non-null  object 
 6   manufacturerID  193970 non-null  float64
 7   price           193970 non-null  float64
 8   customerID      193970 non-null  float64
 9   salutation      193970 non-null  object 
 10  dateOfBirth     176347 non-null  object 
 11  state           193970 non-null  object 
 12  creationDate    193970 non-null  object 
 13  returnShipment  193970 non-null  float64
dtypes: float64(5), int64(1), object(8)
memory usage: 20.7+ MB


In [None]:
# Convert the date columns to datetime after replacing '?'
DATE_FORMAT = '%d-%m-%Y'
for col in ['orderDate', 'deliveryDate', 'dateOfBirth', 'creationDate']:
    raw = df[col]
    parsed = pd.to_datetime(raw, format=DATE_FORMAT, errors='coerce')

    # errors='coerce' silently turns every value that does not match DATE_FORMAT
    # into NaT, so a column stored in another layout would be wiped out without
    # any error. Retry only the values that failed, letting pandas infer the
    # layout (day-first, to stay consistent with DATE_FORMAT), and report the loss.
    failed = raw.notna() & parsed.isna()
    if failed.any():
        retried = pd.to_datetime(raw[failed], dayfirst=True, errors='coerce')
        parsed = parsed.fillna(retried)  # aligned on the index; only NaT slots are filled
        still_missing = int((raw.notna() & parsed.isna()).sum())
        print(f"{col}: {int(failed.sum())} value(s) did not match {DATE_FORMAT}; "
              f"{still_missing} left as NaT after flexible parsing")

    df[col] = parsed

In [None]:
# Count the missing entries in every column
df.isna().sum()

Unnamed: 0,0
orderItemID,0
orderDate,1
deliveryDate,193971
itemID,1
size,1
color,60
manufacturerID,1
price,1
customerID,1
salutation,1


In [None]:
# Identifier columns and returnShipment (whose mean is plotted as the return rate)
# cannot be repaired by averaging: a "mean" customerID or a fractional return
# value is not a real observation. Rows missing any of them are dropped instead.
key_cols = ['itemID', 'manufacturerID', 'customerID', 'returnShipment']
df = df.dropna(subset=key_cols).reset_index(drop=True)

# Impute missing values for the remaining numerical columns with the mean.
# Every numeric dtype is selected: an int64-only filter skips the float64
# columns, and int64 columns cannot contain NaN in the first place.
numerical_cols = df.select_dtypes(include='number').columns.difference(key_cols)
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Impute missing values for categorical columns with the mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = df[col].mode()
    if not modes.empty:  # an all-missing column has no mode; leave it untouched
        df[col] = df[col].fillna(modes[0])

# Impute missing values for datetime columns
datetime_cols = df.select_dtypes(include=['datetime64']).columns
for col in datetime_cols:
    if df[col].isnull().any():
        mode_value = df[col].mode()
        if not mode_value.empty:
            df[col] = df[col].fillna(mode_value[0])
        else:
            # The whole column is NaT, so there is nothing to learn a value from.
            # The current timestamp is kept as a placeholder, but say so loudly:
            # anything derived from this column is not real data.
            print(f"WARNING: {col} has no valid dates; filling with the current timestamp")
            df[col] = df[col].fillna(pd.to_datetime('today'))

# Check if there are still any missing values
print(df.isnull().sum())

orderItemID       0
orderDate         0
deliveryDate      0
itemID            1
size              0
color             0
manufacturerID    1
price             1
customerID        1
salutation        0
dateOfBirth       0
state             0
creationDate      0
returnShipment    1
dtype: int64


In [None]:
# Calculate order processing time (in days)
df['processingTime'] = (df['deliveryDate'] - df['orderDate']).dt.days

# Customer age in whole years at the time of the order. Measuring against
# orderDate instead of the wall clock keeps the value identical on every run.
df['customerAge'] = (df['orderDate'] - df['dateOfBirth']).dt.days // 365

# Account age in DAYS at the time of the order. plot_return_rate_by_account_age
# bins this column with day-based edges (30 / 90 / 180 / 365), so it must not be
# converted to years.
df['accountAge'] = (df['orderDate'] - df['creationDate']).dt.days


In [None]:
import plotly.express as px

def plot_interactive_return_rates(df, size_col='size', return_col='returnShipment'):
    """Show an interactive Plotly bar chart of the mean of ``return_col`` per ``size_col`` value.

    Args:
        df: DataFrame containing the ``size_col`` and ``return_col`` columns.
        size_col: Name of the column to group by.
        return_col: Name of the column that is averaged within each group.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    return_rate_by_size = df.groupby(size_col)[return_col].mean().reset_index()

    # Key the axis labels on the columns actually plotted; labels keyed on the
    # literal names 'size' / 'returnShipment' do not apply to other columns.
    fig = px.bar(return_rate_by_size, x=size_col, y=return_col,
                 title='Return Rate by Product Size',
                 labels={return_col: 'Return Rate', size_col: 'Product Size'})
    fig.show()

# Render the per-size chart for the order data, spelling out the default column names
plot_interactive_return_rates(df, size_col='size', return_col='returnShipment')

In [None]:
# import pandas as pd
# import plotly.express as px
# import plotly.graph_objects as go
# import dash
# from dash import dcc, html
# from dash.dependencies import Input, Output

# def plot_return_rate_by_size(df):
#     """Plots return rate by product size."""
#     return_rate_by_size = df.groupby('size')['returnShipment'].mean().reset_index()
#     fig = px.bar(return_rate_by_size, x='size', y='returnShipment',
#                  title='Return Rate by Product Size',
#                  labels={'returnShipment': 'Return Rate', 'size': 'Product Size'})
#     return fig, return_rate_by_size

# def plot_return_rate_by_color(df):
#     """Plots return rate by product color."""
#     return_rate_by_color = df.groupby('color')['returnShipment'].mean().reset_index()
#     fig = px.bar(return_rate_by_color, x='color', y='returnShipment',
#                  title='Return Rate by Product Color',
#                  labels={'returnShipment': 'Return Rate', 'color': 'Color'})
#     return fig, return_rate_by_color

# def plot_return_rate_by_manufacturer(df):
#     """Plots return rate by manufacturer."""
#     return_rate_by_manufacturer = df.groupby('manufacturerID')['returnShipment'].mean().reset_index()
#     fig = px.bar(return_rate_by_manufacturer, x='manufacturerID', y='returnShipment',
#                  title='Return Rate by Manufacturer',
#                  labels={'returnShipment': 'Return Rate', 'manufacturerID': 'Manufacturer ID'})
#     return fig, return_rate_by_manufacturer

# def plot_return_rate_by_state(df):
#     """Plots return rate by customer state."""
#     return_rate_by_state = df.groupby('state')['returnShipment'].mean().reset_index()
#     fig = px.bar(return_rate_by_state, x='state', y='returnShipment',
#                  title='Return Rate by State',
#                  labels={'returnShipment': 'Return Rate', 'state': 'State'})
#     return fig, return_rate_by_state

# def plot_return_rate_by_age_group(df):
#     """Plots return rate by customer age group."""
#     bins = [18, 30, 40, 50, 60, 100]
#     labels = ['18-30', '31-40', '41-50', '51-60', '60+']
#     df['ageGroup'] = pd.cut(df['customerAge'], bins=bins, labels=labels)
#     return_rate_by_age = df.groupby('ageGroup')['returnShipment'].mean().reset_index()
#     fig = px.bar(return_rate_by_age, x='ageGroup', y='returnShipment',
#                  title='Return Rate by Customer Age Group',
#                  labels={'returnShipment': 'Return Rate', 'ageGroup': 'Age Group'})
#     return fig, return_rate_by_age

# def plot_return_rate_by_account_age(df):
#     """Plots return rate by customer account age."""
#     bins = [0, 30, 90, 180, 365, float('inf')]
#     labels = ['<1 Month', '1-3 Months', '3-6 Months', '6-12 Months', '12+ Months']
#     df['accountAgeGroup'] = pd.cut(df['accountAge'], bins=bins, labels=labels)
#     return_rate_by_account_age = df.groupby('accountAgeGroup')['returnShipment'].mean().reset_index()
#     fig = px.bar(return_rate_by_account_age, x='accountAgeGroup', y='returnShipment',
#                  title='Return Rate by Customer Account Age',
#                  labels={'returnShipment': 'Return Rate', 'accountAgeGroup': 'Account Age Group'})
#     return fig, return_rate_by_account_age

# def plot_return_rate_by_timeliness(df):
#     """Plots return rate by delivery timeliness."""
#     return_rate_by_timeliness = df.groupby('onTimeDelivery')['returnShipment'].mean().reset_index()
#     return_rate_by_timeliness['onTimeDelivery'] = return_rate_by_timeliness['onTimeDelivery'].map({True: 'On Time', False: 'Delayed'})
#     fig = px.bar(return_rate_by_timeliness, x='onTimeDelivery', y='returnShipment',
#                  title='Return Rate by Delivery Timeliness',
#                  labels={'returnShipment': 'Return Rate', 'onTimeDelivery': 'Delivery Timeliness'})
#     return fig, return_rate_by_timeliness

# def plot_return_rate_by_price_range(df):
#     """Plots return rate by price range."""
#     bins = [0, 50, 100, 200, 500, 1000]
#     labels = ['<50', '50-100', '100-200', '200-500', '500+']
#     df['priceRange'] = pd.cut(df['price'], bins=bins, labels=labels)
#     return_rate_by_price = df.groupby('priceRange')['returnShipment'].mean().reset_index()
#     fig = px.bar(return_rate_by_price, x='priceRange', y='returnShipment',
#                  title='Return Rate by Price Range',
#                  labels={'returnShipment': 'Return Rate', 'priceRange': 'Price Range'})
#     return fig, return_rate_by_price

# def plot_return_rate_by_salutation(df):
#     """Plots return rate by customer salutation."""
#     return_rate_by_salutation = df.groupby('salutation')['returnShipment'].mean().reset_index()
#     fig = px.bar(return_rate_by_salutation, x='salutation', y='returnShipment',
#                  title='Return Rate by Customer Salutation',
#                  labels={'returnShipment': 'Return Rate', 'salutation': 'Salutation'})
#     return fig, return_rate_by_salutation

# def plot_return_rate_over_time(df):
#     """Plots return rate over time (monthly)."""
#     df['orderMonth'] = pd.to_datetime(df['orderDate']).dt.to_period('M')
#     return_rate_by_month = df.groupby('orderMonth')['returnShipment'].mean().reset_index()
#     fig = px.line(return_rate_by_month, x='orderMonth', y='returnShipment',
#                   title='Return Rate Over Time (Monthly)',
#                   labels={'returnShipment': 'Return Rate', 'orderMonth': 'Order Month'})
#     return fig, return_rate_by_month

# def plot_return_rate_vs_price(df):
#     """Plots return rate vs. price."""
#     fig = px.scatter(df, x='price', y='returnShipment',
#                      title='Return Rate vs. Price',
#                      labels={'returnShipment': 'Return Rate', 'price': 'Price'})
#     return fig, df[['price', 'returnShipment']]

# def plot_return_rate_vs_processing_time(df):
#     """Plots return rate vs. processing time."""
#     fig = px.scatter(df, x='processingTime', y='returnShipment',
#                      title='Return Rate vs. Processing Time',
#                      labels={'returnShipment': 'Return Rate', 'processingTime': 'Processing Time'})
#     return fig, df[['processingTime', 'returnShipment']]

# def plot_return_rate_vs_account_age_numerical(df):
#     """Plots return rate vs. account age (numerical)."""
#     fig = px.scatter(df, x='accountAge', y='returnShipment',
#                      title='Return Rate vs. Account Age (Days)',
#                      labels={'returnShipment': 'Return Rate', 'accountAge': 'Account Age (Days)'})
#     return fig, df[['accountAge', 'returnShipment']]



In [None]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

# --- Define Plotting Functions ---
def plot_return_rate_by_size(df):
    """Return a Plotly bar chart of the mean ``returnShipment`` for each ``size`` value.

    Sizes are compared as strings, so e.g. ``38`` and ``'38'`` land in the same
    bar. The cast is applied to a temporary grouping key only; the caller's
    DataFrame is left unmodified.

    Args:
        df: DataFrame with ``size`` and ``returnShipment`` columns.

    Returns:
        The figure produced by ``px.bar``.
    """
    # astype() returns a new Series that keeps the name 'size', which becomes
    # the column name after reset_index().
    size_as_text = df['size'].astype(str)
    return_rate_by_size = df['returnShipment'].groupby(size_as_text).mean().reset_index()
    return px.bar(return_rate_by_size, x='size', y='returnShipment', title='Return Rate by Product Size')

def plot_return_rate_by_color(df):
    """Return a Plotly bar chart of the mean ``returnShipment`` for each ``color`` value."""
    per_color = df.groupby('color')['returnShipment']
    chart_data = per_color.mean().reset_index()
    return px.bar(
        chart_data,
        x='color',
        y='returnShipment',
        title='Return Rate by Product Color',
    )

def plot_return_rate_by_manufacturer(df):
    """Return a Plotly bar chart of the mean ``returnShipment`` for each ``manufacturerID``."""
    mean_returns = df.groupby('manufacturerID')['returnShipment'].mean()
    chart_data = mean_returns.reset_index()
    chart_options = {
        'x': 'manufacturerID',
        'y': 'returnShipment',
        'title': 'Return Rate by Manufacturer',
    }
    return px.bar(chart_data, **chart_options)

def plot_return_rate_by_state(df):
    """Return a Plotly bar chart of the mean ``returnShipment`` for each ``state`` value."""
    by_state = df.groupby('state')
    chart_data = by_state['returnShipment'].mean().reset_index()
    return px.bar(
        chart_data,
        x='state',
        y='returnShipment',
        title='Return Rate by State',
    )

def plot_return_rate_by_age_group(df):
    """Return a Plotly bar chart of the mean ``returnShipment`` per customer age group.

    ``customerAge`` is binned into 18-30, 31-40, 41-50, 51-60 and 60+. The
    lowest edge is inclusive (so age 18 is counted in '18-30') and the last bin
    is open-ended (so ages above 100 are counted in '60+'). Ages below 18 and
    missing ages are not assigned to any group. Groups without rows are still
    listed, with a NaN rate. The caller's DataFrame is not modified.

    Args:
        df: DataFrame with ``customerAge`` and ``returnShipment`` columns.

    Returns:
        The figure produced by ``px.bar``.
    """
    bins = [18, 30, 40, 50, 60, float('inf')]
    labels = ['18-30', '31-40', '41-50', '51-60', '60+']
    # Build the grouping key as a standalone Series instead of writing an
    # 'ageGroup' column into the caller's frame.
    age_group = pd.cut(df['customerAge'], bins=bins, labels=labels, include_lowest=True).rename('ageGroup')
    # observed=False keeps every age group in the output, including empty ones.
    return_rate_by_age = df['returnShipment'].groupby(age_group, observed=False).mean().reset_index()
    return px.bar(return_rate_by_age, x='ageGroup', y='returnShipment', title='Return Rate by Customer Age Group')

def plot_return_rate_by_account_age(df):
    """Return a Plotly bar chart of the mean ``returnShipment`` per account-age bucket.

    ``accountAge`` is bucketed with the edges 0 / 30 / 90 / 180 / 365 and
    labelled in months. The lowest edge is inclusive, so an account age of 0 is
    counted in '<1 Month'. Buckets without rows are still listed, with a NaN
    rate. The caller's DataFrame is not modified.

    NOTE(review): the edges only match the month labels if ``accountAge`` is
    expressed in days -- confirm where the column is computed.

    Args:
        df: DataFrame with ``accountAge`` and ``returnShipment`` columns.

    Returns:
        The figure produced by ``px.bar``.
    """
    bins = [0, 30, 90, 180, 365, float('inf')]
    labels = ['<1 Month', '1-3 Months', '3-6 Months', '6-12 Months', '12+ Months']
    # Build the grouping key as a standalone Series instead of writing an
    # 'accountAgeGroup' column into the caller's frame.
    account_age_group = pd.cut(
        df['accountAge'], bins=bins, labels=labels, include_lowest=True
    ).rename('accountAgeGroup')
    # observed=False keeps every bucket in the output, including empty ones.
    return_rate_by_account_age = (
        df['returnShipment'].groupby(account_age_group, observed=False).mean().reset_index()
    )
    return px.bar(return_rate_by_account_age, x='accountAgeGroup', y='returnShipment', title='Return Rate by Account Age')

def plot_return_rate_by_price_range(df):
    """Return a Plotly bar chart of the mean ``returnShipment`` per price range.

    ``price`` is bucketed with the edges 0 / 50 / 100 / 200 / 500. The lowest
    edge is inclusive (so a price of 0 is counted in '<50') and the last bucket
    is open-ended (so prices above 1000 are counted in '500+'). Buckets without
    rows are still listed, with a NaN rate. The caller's DataFrame is not
    modified.

    Args:
        df: DataFrame with ``price`` and ``returnShipment`` columns.

    Returns:
        The figure produced by ``px.bar``.
    """
    bins = [0, 50, 100, 200, 500, float('inf')]
    labels = ['<50', '50-100', '100-200', '200-500', '500+']
    # Build the grouping key as a standalone Series instead of writing a
    # 'priceRange' column into the caller's frame.
    price_range = pd.cut(df['price'], bins=bins, labels=labels, include_lowest=True).rename('priceRange')
    # observed=False keeps every bucket in the output, including empty ones.
    return_rate_by_price = df['returnShipment'].groupby(price_range, observed=False).mean().reset_index()
    return px.bar(return_rate_by_price, x='priceRange', y='returnShipment', title='Return Rate by Price Range')

def plot_return_rate_by_salutation(df):
    """Return a Plotly bar chart of the mean ``returnShipment`` for each ``salutation`` value."""
    mean_returns = df.groupby('salutation')['returnShipment'].mean()
    chart_data = mean_returns.reset_index()
    return px.bar(
        chart_data,
        x='salutation',
        y='returnShipment',
        title='Return Rate by Customer Salutation',
    )

def plot_return_rate_vs_price(df):
    """Return a Plotly line chart of the mean ``returnShipment`` at each distinct ``price``."""
    mean_by_price = df.groupby('price')['returnShipment'].mean()
    chart_data = mean_by_price.reset_index()
    return px.line(
        chart_data,
        x='price',
        y='returnShipment',
        title='Return Rate vs. Price',
    )

def plot_return_rate_vs_processing_time(df):
    """Return a Plotly line chart of the mean ``returnShipment`` at each ``processingTime`` value."""
    by_processing_time = df.groupby('processingTime')
    chart_data = by_processing_time['returnShipment'].mean().reset_index()
    chart_options = {
        'x': 'processingTime',
        'y': 'returnShipment',
        'title': 'Return Rate vs. Processing Time',
    }
    return px.line(chart_data, **chart_options)

def plot_return_rate_vs_account_age_numerical(df):
    """Return a Plotly line chart of the mean ``returnShipment`` at each distinct ``accountAge``."""
    mean_by_account_age = df.groupby('accountAge')['returnShipment'].mean()
    chart_data = mean_by_account_age.reset_index()
    return px.line(
        chart_data,
        x='accountAge',
        y='returnShipment',
        title='Return Rate vs. Account Age',
    )

# --- Generate Figures ---
figures = [
    plot_return_rate_by_size(df),
    plot_return_rate_by_color(df),
    plot_return_rate_by_manufacturer(df),
    plot_return_rate_by_state(df),
    plot_return_rate_by_age_group(df),
    plot_return_rate_by_account_age(df),
    plot_return_rate_by_price_range(df),
    plot_return_rate_by_salutation(df),
    plot_return_rate_vs_price(df),
    plot_return_rate_vs_processing_time(df),
    plot_return_rate_vs_account_age_numerical(df)
]

# --- Convert Figures to HTML Divs ---
# plotly.js is loaded exactly once: the first div carries a <script> tag that
# points at the CDN build matching the installed plotly version, and the other
# divs reuse it. The to_html default (include_plotlyjs=True) would instead
# embed the whole multi-megabyte library inside every single div.
html_figures = [
    pio.to_html(fig, full_html=False, include_plotlyjs=('cdn' if position == 0 else False))
    for position, fig in enumerate(figures)
]

# One container per chart, assembled up front to keep the page template readable.
chart_divs = "\n".join(
    f'    <div class="chart-container">{fig_html}</div>' for fig_html in html_figures
)

# --- Generate HTML File ---
# No hand-written plotly <script> tag in <head>: the "plotly-latest" CDN alias
# is no longer updated and would not match the figures generated above.
html_content = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Product Returns Dashboard</title>
    <style>
        body {{ font-family: Arial, sans-serif; text-align: center; }}
        .chart-container {{ width: 80%; margin: auto; padding: 20px; }}
    </style>
</head>
<body>
    <h1>Product Returns Dashboard</h1>
{chart_divs}
</body>
</html>
"""

# --- Save to File ---
with open("dashboard.html", "w", encoding="utf-8") as f:
    f.write(html_content)

print("✅ Dashboard saved as 'dashboard.html'. Open it in a browser to view the charts!")
print("Download dashboard.html from file browser and run.")

✅ Dashboard saved as 'dashboard.html'. Open it in a browser to view the charts!
Download dashboard.html from file browser and run.
