# 1. Import libraries and load the dataset



In [12]:
import pandas as pd
import numpy as np


df_train = pd.read_csv(r'D:\Topic_13_Project\Topic_13_Retail_Store_Sales_Time_Series\data\raw\train.csv')

# 2. Display basic information about the dataset


In [13]:
print("First 5 Rows of Data Frame:\n", df_train.head(5))
print("Data Frame Shape:\n", df_train.shape)
print("Data Frame Info:\n", df_train.info())
print("Data Frame Statistics:\n", df_train.describe())
    

First 5 Rows of Data Frame:
    id        date  store_nbr      family  sales  onpromotion
0   0  2013-01-01          1  AUTOMOTIVE    0.0            0
1   1  2013-01-01          1   BABY CARE    0.0            0
2   2  2013-01-01          1      BEAUTY    0.0            0
3   3  2013-01-01          1   BEVERAGES    0.0            0
4   4  2013-01-01          1       BOOKS    0.0            0
Data Frame Shape:
 (3000888, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB
Data Frame Info:
 None
Data Frame Statistics:
                  id     store_nbr         sales   onpromotion
count  3.000888e+06  3.000888e+06  3.000888e+06  3.000888e+06
mean   1.500444e+0

# 3. Missing Values Summary

In [14]:
# Calculate missing values
missing_counts = df_train.isnull().sum()
missing_percentage = missing_counts / len(df_train) * 100

# Create a summary DataFrame
missing_df_train = pd.DataFrame({
    'Missing Values': missing_counts,
    'Percentage': missing_percentage
})

print("Missing Values Summary:\n", missing_df_train)

# Filter columns with missing values
missing_df_train = missing_df_train[missing_df_train['Missing Values'] > 0].sort_values(by='Missing Values', ascending=False)

Missing Values Summary:
              Missing Values  Percentage
id                        0         0.0
date                      0         0.0
store_nbr                 0         0.0
family                    0         0.0
sales                     0         0.0
onpromotion               0         0.0


# 4. Detailed Analyze and Statistics

### Analyze 'sales' column and calculate detailed statistics

In [15]:
# Analyze 'sales' column and calculate detailed statistics
# Total sales and basic stats
total_sales = df_train['sales'].sum()
avg_sales = df_train['sales'].mean()
median_sales = df_train['sales'].median()
std_sales = df_train['sales'].std()
zero_sales_count = (df_train['sales'] == 0.0).sum()
pct_zero_sales = zero_sales_count / len(df_train) * 100
print(f"Total sales: ${total_sales:,.2f}")
print(f"Average sales: ${avg_sales:.2f}")
print(f"Median sales: ${median_sales:.2f}")
print(f"Std dev sales: ${std_sales:.2f}")
print(f"Zero sales count: {zero_sales_count:,} ({pct_zero_sales:.2f}%)")

# Sales by store and family (sums and means)
sales_by_store = df_train.groupby('store_nbr')['sales'].sum().sort_values(ascending=False)
sales_by_family = df_train.groupby('family')['sales'].sum().sort_values(ascending=False)
sales_by_store_mean = df_train.groupby('store_nbr')['sales'].mean().sort_values(ascending=False)
sales_by_family_mean = df_train.groupby('family')['sales'].mean().sort_values(ascending=False)

print('\nTop 10 stores by sales:\n', sales_by_store.head(10))
print('\nTop 10 families by sales:\n', sales_by_family.head(10))

# Daily sales (time series) - ensure 'date' is datetime
df_train['date'] = pd.to_datetime(df_train['date'])
daily_sales = df_train.groupby('date')['sales'].sum()
print('\nDaily sales sample:\n', daily_sales.head())

# Create a summary DataFrame
sales_summary = pd.DataFrame({
    'total_sales': [total_sales],
    'avg_sales': [avg_sales],
    'median_sales': [median_sales],
    'std_sales': [std_sales],
    'zero_sales_count': [zero_sales_count],
    'pct_zero_sales': [pct_zero_sales]
})
print('\nSales summary:\n', sales_summary)

Total sales: $1,073,644,952.20
Average sales: $357.78
Median sales: $11.00
Std dev sales: $1102.00
Zero sales count: 939,130 (31.30%)

Top 10 stores by sales:
 store_nbr
44    6.208755e+07
45    5.449801e+07
47    5.094831e+07
3     5.048191e+07
49    4.342010e+07
46    4.189606e+07
48    3.593313e+07
51    3.291149e+07
8     3.049429e+07
50    2.865302e+07
Name: sales, dtype: float64

Top 10 families by sales:
 family
GROCERY I        3.434627e+08
BEVERAGES        2.169545e+08
PRODUCE          1.227047e+08
CLEANING         9.752129e+07
DAIRY            6.448771e+07
BREAD/BAKERY     4.213395e+07
POULTRY          3.187600e+07
MEATS            3.108647e+07
PERSONAL CARE    2.459205e+07
DELI             2.411032e+07
Name: sales, dtype: float64

Daily sales sample:
 date
2013-01-01      2511.618999
2013-01-02    496092.417944
2013-01-03    361461.231124
2013-01-04    354459.677093
2013-01-05    477350.121229
Name: sales, dtype: float64

Sales summary:
     total_sales   avg_sales  median_s

### Sales Distribution Analysis

Analyzing the shape and characteristics of the sales distribution to understand skewness, outliers, and spread.

In [16]:
# Sales Distribution Characteristics
from scipy import stats

# Calculate distribution metrics
min_sales = df_train['sales'].min()
max_sales = df_train['sales'].max()
q25 = df_train['sales'].quantile(0.25)
q75 = df_train['sales'].quantile(0.75)
q99 = df_train['sales'].quantile(0.99)

# Skewness and Kurtosis
skewness = stats.skew(df_train['sales'])
kurtosis_val = stats.kurtosis(df_train['sales'])

# Coefficient of Variation (std/mean)
cv = (std_sales / avg_sales) * 100

# Gap between mean and median
mean_median_gap = ((avg_sales - median_sales) / median_sales) * 100

# Create distribution summary table
distribution_stats = pd.DataFrame({
    'Metric': ['Mean', 'Median', 'Std Dev', 'Min', 'Max', '25th Percentile', '75th Percentile', '99th Percentile', 
               'Skewness', 'Kurtosis', 'Coefficient of Variation (%)', 'Mean-Median Gap (%)'],
    'Value': [f'${avg_sales:.2f}', f'${median_sales:.2f}', f'${std_sales:.2f}', f'${min_sales:.2f}', 
              f'${max_sales:,.2f}', f'${q25:.2f}', f'${q75:.2f}', f'${q99:,.2f}',
              f'{skewness:.2f}', f'{kurtosis_val:.2f}', f'{cv:.1f}%', f'{mean_median_gap:.1f}%']
})

print("Sales Distribution Statistics:")
print(distribution_stats.to_string(index=False))

print(f"\n Interpretation:")
print(f"- Mean is {mean_median_gap:.1f}% higher than median, indicating RIGHT-SKEWED distribution")
print(f"- High CV ({cv:.1f}%) indicates EXTREME VARIABILITY in sales")
print(f"- Skewness of {skewness:.2f} confirms heavy right tail (large sales are rare but impactful)")
print(f"- Maximum sales (${max_sales:,.0f}) is {max_sales/avg_sales:.0f}x the mean")

Sales Distribution Statistics:
                      Metric       Value
                        Mean     $357.78
                      Median      $11.00
                     Std Dev    $1102.00
                         Min       $0.00
                         Max $124,717.00
             25th Percentile       $0.00
             75th Percentile     $195.85
             99th Percentile   $5,507.00
                    Skewness        7.36
                    Kurtosis      154.56
Coefficient of Variation (%)      308.0%
         Mean-Median Gap (%)     3152.5%

 Interpretation:
- Mean is 3152.5% higher than median, indicating RIGHT-SKEWED distribution
- High CV (308.0%) indicates EXTREME VARIABILITY in sales
- Skewness of 7.36 confirms heavy right tail (large sales are rare but impactful)
- Maximum sales ($124,717) is 349x the mean


### Store Performance Concentration Analysis

Analyzing how sales are distributed across stores to identify concentration and top performers.

In [17]:
# Store-Level Concentration Metrics

# Calculate cumulative percentages
store_sales_sorted = sales_by_store.sort_values(ascending=False)
store_cumsum = store_sales_sorted.cumsum()
store_cumsum_pct = (store_cumsum / total_sales) * 100

# Create top 10 stores with concentration metrics
top_10_stores = pd.DataFrame({
    'Store #': store_sales_sorted.head(10).index,
    'Total Sales': [f'${x:,.0f}' for x in store_sales_sorted.head(10).values],
    '% of Total': [f'{(x/total_sales)*100:.1f}%' for x in store_sales_sorted.head(10).values],
    'Cumulative %': [f'{store_cumsum_pct.iloc[i]:.1f}%' for i in range(10)]
})

print("Top 10 Stores by Sales (with Concentration Metrics):")
print(top_10_stores.to_string(index=False))

# Calculate concentration statistics
top_4_stores_pct = (store_sales_sorted.head(4).sum() / total_sales) * 100
top_10_stores_pct = (store_sales_sorted.head(10).sum() / total_sales) * 100
total_stores = df_train['store_nbr'].nunique()

# Store heterogeneity: max vs min
max_store_sales = store_sales_sorted.iloc[0]
min_store_sales = store_sales_sorted.iloc[-1]
store_variance_ratio = max_store_sales / min_store_sales

print(f"\n Store Concentration Insights:")
print(f"- Top 4 stores (7.4% of locations) generate {top_4_stores_pct:.1f}% of revenue")
print(f"- Top 10 stores ({10/total_stores*100:.1f}% of locations) generate {top_10_stores_pct:.1f}% of revenue")
print(f"- Highest performing store (#{store_sales_sorted.index[0]}) generates {store_variance_ratio:.1f}x more than lowest")
print(f"- This indicates EXTREME HETEROGENEITY across store locations")

Top 10 Stores by Sales (with Concentration Metrics):
 Store # Total Sales % of Total Cumulative %
      44 $62,087,553       5.8%         5.8%
      45 $54,498,010       5.1%        10.9%
      47 $50,948,310       4.7%        15.6%
       3 $50,481,910       4.7%        20.3%
      49 $43,420,096       4.0%        24.4%
      46 $41,896,062       3.9%        28.3%
      48 $35,933,130       3.3%        31.6%
      51 $32,911,490       3.1%        34.7%
       8 $30,494,287       2.8%        37.5%
      50 $28,653,021       2.7%        40.2%

 Store Concentration Insights:
- Top 4 stores (7.4% of locations) generate 20.3% of revenue
- Top 10 stores (18.5% of locations) generate 40.2% of revenue
- Highest performing store (#44) generates 23.0x more than lowest
- This indicates EXTREME HETEROGENEITY across store locations


### Product Family Concentration Analysis

Analyzing which product families drive the most revenue and their contribution to total sales.

In [18]:
# Product Family Concentration Metrics

# Calculate cumulative percentages for families
family_sales_sorted = sales_by_family.sort_values(ascending=False)
family_cumsum = family_sales_sorted.cumsum()
family_cumsum_pct = (family_cumsum / total_sales) * 100

# Create top 10 families with concentration metrics
top_10_families = pd.DataFrame({
    'Product Family': family_sales_sorted.head(10).index,
    'Total Sales': [f'${x:,.0f}' for x in family_sales_sorted.head(10).values],
    '% of Total': [f'{(x/total_sales)*100:.1f}%' for x in family_sales_sorted.head(10).values],
    'Cumulative %': [f'{family_cumsum_pct.iloc[i]:.1f}%' for i in range(10)]
})

print("Top 10 Product Families by Sales (with Concentration Metrics):")
print(top_10_families.to_string(index=False))

# Calculate concentration statistics
top_3_families_pct = (family_sales_sorted.head(3).sum() / total_sales) * 100
top_5_families_pct = (family_sales_sorted.head(5).sum() / total_sales) * 100
top_10_families_pct = (family_sales_sorted.head(10).sum() / total_sales) * 100
total_families = df_train['family'].nunique()

print(f"\n Product Family Concentration Insights:")
print(f"- Top 3 families (GROCERY I, BEVERAGES, PRODUCE) account for {top_3_families_pct:.1f}% of total revenue")
print(f"- Top 5 families ({5/total_families*100:.1f}% of categories) generate {top_5_families_pct:.1f}% of revenue")
print(f"- Top 10 families ({10/total_families*100:.1f}% of categories) generate {top_10_families_pct:.1f}% of revenue")
print(f"- This shows HEAVY CONCENTRATION in daily necessities categories")

Top 10 Product Families by Sales (with Concentration Metrics):
Product Family  Total Sales % of Total Cumulative %
     GROCERY I $343,462,735      32.0%        32.0%
     BEVERAGES $216,954,486      20.2%        52.2%
       PRODUCE $122,704,685      11.4%        63.6%
      CLEANING  $97,521,289       9.1%        72.7%
         DAIRY  $64,487,709       6.0%        78.7%
  BREAD/BAKERY  $42,133,946       3.9%        82.6%
       POULTRY  $31,876,004       3.0%        85.6%
         MEATS  $31,086,468       2.9%        88.5%
 PERSONAL CARE  $24,592,051       2.3%        90.8%
          DELI  $24,110,322       2.2%        93.0%

 Product Family Concentration Insights:
- Top 3 families (GROCERY I, BEVERAGES, PRODUCE) account for 63.6% of total revenue
- Top 5 families (15.2% of categories) generate 78.7% of revenue
- Top 10 families (30.3% of categories) generate 93.0% of revenue
- This shows HEAVY CONCENTRATION in daily necessities categories


### Analyze 'onpromotion' column and calculate detailed statistics

In [19]:

# Total promotion occurrences and percentage
total_onpromo = df_train['onpromotion'].sum()
pct_onpromo = total_onpromo / len(df_train) * 100
zero_onpromo_count = (df_train['onpromotion'] == 0).sum()
pct_zero_onpromo = zero_onpromo_count / len(df_train) * 100
print(f"Total 'onpromotion' occurrences: {total_onpromo}")
print(f"Percentage of rows with promotion: {pct_onpromo:.2f}%")
print(f"Zero 'onpromotion' count: {zero_onpromo_count} ({pct_zero_onpromo:.2f}%)")

# Promotions by store and family (counts and percent)
onpromo_by_store = df_train.groupby('store_nbr')['onpromotion'].sum().sort_values(ascending=False)
onpromo_by_family = df_train.groupby('family')['onpromotion'].sum().sort_values(ascending=False)
onpromo_by_store_pct = (df_train.groupby('store_nbr')['onpromotion'].mean()*100).sort_values(ascending=False)
onpromo_by_family_pct = (df_train.groupby('family')['onpromotion'].mean()*100).sort_values(ascending=False)

print('\nTop 10 stores by promotion count:\n', onpromo_by_store.head(10))
print('\nTop 10 families by promotion count:\n', onpromo_by_family.head(10))

# Daily promotion counts (time series) - ensure 'date' is datetime
df_train['date'] = pd.to_datetime(df_train['date'])
daily_onpromo = df_train.groupby('date')['onpromotion'].sum()
print('\nDaily promotion sample:\n', daily_onpromo.head())

# Create a summary DataFrame
promotion_summary = pd.DataFrame({
    'total_onpromotion': [total_onpromo],
    'pct_onpromotion': [pct_onpromo],
    'unique_promo_days': [daily_onpromo[daily_onpromo>0].shape[0]]
})
print('\nPromotion summary:\n', promotion_summary)

Total 'onpromotion' occurrences: 7810622
Percentage of rows with promotion: 260.28%
Zero 'onpromotion' count: 2389559 (79.63%)

Top 10 stores by promotion count:
 store_nbr
53    204016
47    192725
44    192449
45    191503
46    190697
48    185566
49    184736
9     177356
3     177075
50    174115
Name: onpromotion, dtype: int64

Top 10 families by promotion count:
 family
GROCERY I        1914801
PRODUCE          1117921
BEVERAGES         906958
DAIRY             728707
CLEANING          661157
DELI              583316
BREAD/BAKERY      331289
MEATS             304028
PERSONAL CARE     246928
POULTRY           226421
Name: onpromotion, dtype: int64

Daily promotion sample:
 date
2013-01-01    0
2013-01-02    0
2013-01-03    0
2013-01-04    0
2013-01-05    0
Name: onpromotion, dtype: int64

Promotion summary:
    total_onpromotion  pct_onpromotion  unique_promo_days
0            7810622       260.277025               1230


### Promotional Strategy Analysis

Analyzing how promotions are distributed across stores and product families.

In [20]:
# Promotional Concentration Analysis

# Top stores by promotional activity
promo_sorted_stores = onpromo_by_store.sort_values(ascending=False)
promo_cumsum_stores = promo_sorted_stores.cumsum()
promo_cumsum_pct_stores = (promo_cumsum_stores / total_onpromo) * 100

top_5_promo_stores = pd.DataFrame({
    'Store #': promo_sorted_stores.head(5).index,
    'Total Promo Items': [f'{x:,}' for x in promo_sorted_stores.head(5).values],
    '% of All Promos': [f'{(x/total_onpromo)*100:.1f}%' for x in promo_sorted_stores.head(5).values],
    'Cumulative %': [f'{promo_cumsum_pct_stores.iloc[i]:.1f}%' for i in range(5)]
})

print("Top 5 Stores by Promotional Activity:")
print(top_5_promo_stores.to_string(index=False))

# Top families by promotional activity  
promo_sorted_families = onpromo_by_family.sort_values(ascending=False)
promo_cumsum_families = promo_sorted_families.cumsum()
promo_cumsum_pct_families = (promo_cumsum_families / total_onpromo) * 100

top_5_promo_families = pd.DataFrame({
    'Product Family': promo_sorted_families.head(5).index,
    'Total Promo Items': [f'{x:,}' for x in promo_sorted_families.head(5).values],
    '% of All Promos': [f'{(x/total_onpromo)*100:.1f}%' for x in promo_sorted_families.head(5).values],
    'Cumulative %': [f'{promo_cumsum_pct_families.iloc[i]:.1f}%' for i in range(5)]
})

print("\nTop 5 Product Families by Promotional Activity:")
print(top_5_promo_families.to_string(index=False))

# Promotional insights
top_5_promo_families_pct = (promo_sorted_families.head(5).sum() / total_onpromo) * 100
days_with_promos = (daily_onpromo > 0).sum()
total_days = daily_onpromo.shape[0]
days_with_promos_pct = (days_with_promos / total_days) * 100

print(f"\n Promotional Strategy Insights:")
print(f"- {pct_zero_onpromo:.1f}% of records have NO promotions, while {100-pct_zero_onpromo:.1f}% have at least one")
print(f"- Promotions active on {days_with_promos} out of {total_days} days ({days_with_promos_pct:.1f}%)")
print(f"- Top 5 families receive {top_5_promo_families_pct:.1f}% of all promotional activity")
print(f"- Promotions are CATEGORY-SPECIFIC, targeting high-revenue families")

Top 5 Stores by Promotional Activity:
 Store # Total Promo Items % of All Promos Cumulative %
      53           204,016            2.6%         2.6%
      47           192,725            2.5%         5.1%
      44           192,449            2.5%         7.5%
      45           191,503            2.5%        10.0%
      46           190,697            2.4%        12.4%

Top 5 Product Families by Promotional Activity:
Product Family Total Promo Items % of All Promos Cumulative %
     GROCERY I         1,914,801           24.5%        24.5%
       PRODUCE         1,117,921           14.3%        38.8%
     BEVERAGES           906,958           11.6%        50.4%
         DAIRY           728,707            9.3%        59.8%
      CLEANING           661,157            8.5%        68.2%

 Promotional Strategy Insights:
- 79.6% of records have NO promotions, while 20.4% have at least one
- Promotions active on 1230 out of 1684 days (73.0%)
- Top 5 families receive 68.2% of all promotional 