# 1. Import libraries and load the dataset



In [3]:
import pandas as pd
import numpy as np


df_train = pd.read_csv(r'D:\Topic_13_Retail_Store_Sales_Time_Series\Topic_13_Retail_Store_Sales_Time_Series\data\raw\train.csv')

# 2. Display basic information about the dataset


In [5]:
print("First 5 Rows of Data Frame:\n", df_train.head(5))
print("Data Frame Shape:\n", df_train.shape)
print("Data Frame Info:\n", df_train.info())
print("Data Frame Statistics:\n", df_train.describe())
    

First 5 Rows of Data Frame:
    id        date  store_nbr      family  sales  onpromotion
0   0  2013-01-01          1  AUTOMOTIVE    0.0            0
1   1  2013-01-01          1   BABY CARE    0.0            0
2   2  2013-01-01          1      BEAUTY    0.0            0
3   3  2013-01-01          1   BEVERAGES    0.0            0
4   4  2013-01-01          1       BOOKS    0.0            0
Data Frame Shape:
 (3000888, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB
Data Frame Info:
 None
Data Frame Statistics:
                  id     store_nbr         sales   onpromotion
count  3.000888e+06  3.000888e+06  3.000888e+06  3.000888e+06
mean   1.500444e+0

# 3. Missing Values Summary

In [13]:
# Calculate missing values
missing_counts = df_train.isnull().sum()
missing_percentage = missing_counts / len(df_train) * 100

# Create a summary DataFrame
missing_df_train = pd.DataFrame({
    'Missing Values': missing_counts,
    'Percentage': missing_percentage
})

print("Missing Values Summary:\n", missing_df_train)

# Filter columns with missing values
missing_df_train = missing_df_train[missing_df_train['Missing Values'] > 0].sort_values(by='Missing Values', ascending=False)

Missing Values Summary:
              Missing Values  Percentage
id                        0         0.0
date                      0         0.0
store_nbr                 0         0.0
family                    0         0.0
sales                     0         0.0
onpromotion               0         0.0


# 4. Detailed Analyze and Statistics

### Analyze 'sales' column and calculate detailed statistics

In [10]:
# Analyze 'sales' column and calculate detailed statistics
# Total sales and basic stats
total_sales = df_train['sales'].sum()
avg_sales = df_train['sales'].mean()
median_sales = df_train['sales'].median()
std_sales = df_train['sales'].std()
zero_sales_count = (df_train['sales'] == 0.0).sum()
pct_zero_sales = zero_sales_count / len(df_train) * 100
print(f"Total sales: {total_sales:.2f}")
print(f"Average sales: {avg_sales:.2f}")
print(f"Median sales: {median_sales:.2f}")
print(f"Std dev sales: {std_sales:.2f}")
print(f"Zero sales count: {zero_sales_count} ({pct_zero_sales:.2f}%)")

# Sales by store and family (sums and means)
sales_by_store = df_train.groupby('store_nbr')['sales'].sum().sort_values(ascending=False)
sales_by_family = df_train.groupby('family')['sales'].sum().sort_values(ascending=False)
sales_by_store_mean = df_train.groupby('store_nbr')['sales'].mean().sort_values(ascending=False)
sales_by_family_mean = df_train.groupby('family')['sales'].mean().sort_values(ascending=False)

print('\nTop 10 stores by sales:\n', sales_by_store.head(10))
print('\nTop 10 families by sales:\n', sales_by_family.head(10))

# Daily sales (time series) - ensure 'date' is datetime
df_train['date'] = pd.to_datetime(df_train['date'])
daily_sales = df_train.groupby('date')['sales'].sum()
print('\nDaily sales sample:\n', daily_sales.head())

# Create a summary DataFrame
sales_summary = pd.DataFrame({
    'total_sales': [total_sales],
    'avg_sales': [avg_sales],
    'median_sales': [median_sales],
    'std_sales': [std_sales],
    'zero_sales_count': [zero_sales_count],
    'pct_zero_sales': [pct_zero_sales]
})
print('\nSales summary:\n', sales_summary)

Total sales: 1073644952.20
Average sales: 357.78
Median sales: 11.00
Std dev sales: 1102.00
Zero sales count: 939130 (31.30%)

Top 10 stores by sales:
 store_nbr
44    6.208755e+07
45    5.449801e+07
47    5.094831e+07
3     5.048191e+07
49    4.342010e+07
46    4.189606e+07
48    3.593313e+07
51    3.291149e+07
8     3.049429e+07
50    2.865302e+07
Name: sales, dtype: float64

Top 10 families by sales:
 family
GROCERY I        3.434627e+08
BEVERAGES        2.169545e+08
PRODUCE          1.227047e+08
CLEANING         9.752129e+07
DAIRY            6.448771e+07
BREAD/BAKERY     4.213395e+07
POULTRY          3.187600e+07
MEATS            3.108647e+07
PERSONAL CARE    2.459205e+07
DELI             2.411032e+07
Name: sales, dtype: float64

Daily sales sample:
 date
2013-01-01      2511.618999
2013-01-02    496092.417944
2013-01-03    361461.231124
2013-01-04    354459.677093
2013-01-05    477350.121229
Name: sales, dtype: float64

Sales summary:
     total_sales   avg_sales  median_sales    

### Analyze 'onpromotion' column and calculate detailed statistics

In [None]:

# Total promotion occurrences and percentage
total_onpromo = df_train['onpromotion'].sum()
pct_onpromo = total_onpromo / len(df_train) * 100
zero_onpromo_count = (df_train['onpromotion'] == 0).sum()
pct_zero_onpromo = zero_onpromo_count / len(df_train) * 100
print(f"Total 'onpromotion' occurrences: {total_onpromo}")
print(f"Percentage of rows with promotion: {pct_onpromo:.2f}%")
print(f"Zero 'onpromotion' count: {zero_onpromo_count} ({pct_zero_onpromo:.2f}%)")

# Promotions by store and family (counts and percent)
onpromo_by_store = df_train.groupby('store_nbr')['onpromotion'].sum().sort_values(ascending=False)
onpromo_by_family = df_train.groupby('family')['onpromotion'].sum().sort_values(ascending=False)
onpromo_by_store_pct = (df_train.groupby('store_nbr')['onpromotion'].mean()*100).sort_values(ascending=False)
onpromo_by_family_pct = (df_train.groupby('family')['onpromotion'].mean()*100).sort_values(ascending=False)

print('\nTop 10 stores by promotion count:\n', onpromo_by_store.head(10))
print('\nTop 10 families by promotion count:\n', onpromo_by_family.head(10))

# Daily promotion counts (time series) - ensure 'date' is datetime
df_train['date'] = pd.to_datetime(df_train['date'])
daily_onpromo = df_train.groupby('date')['onpromotion'].sum()
print('\nDaily promotion sample:\n', daily_onpromo.head())

# Create a summary DataFrame
promotion_summary = pd.DataFrame({
    'total_onpromotion': [total_onpromo],
    'pct_onpromotion': [pct_onpromo],
    'unique_promo_days': [daily_onpromo[daily_onpromo>0].shape[0]]
})
print('\nPromotion summary:\n', promotion_summary)

Total 'onpromotion' occurrences: 7810622
Percentage of rows with promotion: 260.28%
Zero 'onpromotion' count: 2389559 (79.63%)

Top 10 stores by promotion count:
 store_nbr
53    204016
47    192725
44    192449
45    191503
46    190697
48    185566
49    184736
9     177356
3     177075
50    174115
Name: onpromotion, dtype: int64

Top 10 families by promotion count:
 family
GROCERY I        1914801
PRODUCE          1117921
BEVERAGES         906958
DAIRY             728707
CLEANING          661157
DELI              583316
BREAD/BAKERY      331289
MEATS             304028
PERSONAL CARE     246928
POULTRY           226421
Name: onpromotion, dtype: int64

Daily promotion sample:
 date
2013-01-01    0
2013-01-02    0
2013-01-03    0
2013-01-04    0
2013-01-05    0
Name: onpromotion, dtype: int64

Promotion summary:
    total_onpromotion  pct_onpromotion  unique_promo_days
0            7810622       260.277025               1230


# 5. Handling Missong Values