# 1. Import libraries and load dataset


In [None]:
import numpy as np 
import pandas as pd 

# Location of the raw oil-price CSV. This is an absolute, machine-specific
# path, so it has to be adjusted when the notebook runs on another machine.
oil_csv_path = r"D:\Topic_13_Project\Topic_13_Retail_Store_Sales_Time_Series\data\raw\oil.csv"
df_oil = pd.read_csv(oil_csv_path)

# 2. Display basic information about the dataset


In [3]:
# Quick structural overview of the raw oil-price frame.
print("First 5 Rows of Data Frame:\n", df_oil.head(5))
print("Data Frame Shape:\n", df_oil.shape)

# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own line. Nesting it inside print() emits the report
# first and then prints the heading followed by a stray "None".
print("Data Frame Info:")
df_oil.info()

print("Data Frame Statistics:\n", df_oil.describe())

First 5 Rows of Data Frame:
          date  dcoilwtico
0  2013-01-01         NaN
1  2013-01-02       93.14
2  2013-01-03       92.97
3  2013-01-04       93.12
4  2013-01-07       93.20
Data Frame Shape:
 (1218, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        1218 non-null   object 
 1   dcoilwtico  1175 non-null   float64
dtypes: float64(1), object(1)
memory usage: 19.2+ KB
Data Frame Info:
 None
Data Frame Statistics:
         dcoilwtico
count  1175.000000
mean     67.714366
std      25.630476
min      26.190000
25%      46.405000
50%      53.190000
75%      95.660000
max     110.620000


# 3. Missing Values Summary

In [4]:
# Calculate missing values
# Number of missing entries per column, and the share of all rows they represent
missing_counts = df_oil.isna().sum()
missing_percentage = missing_counts.div(len(df_oil)).mul(100)

# Put both measures side by side, one row per column of df_oil
missing_df_oil = pd.DataFrame(
    {'Missing Values': missing_counts, 'Percentage': missing_percentage}
)

print("Missing Values Summary:\n", missing_df_oil)

# Keep only the columns that actually contain gaps, largest count first
missing_df_oil = (
    missing_df_oil
    .loc[missing_df_oil['Missing Values'].gt(0)]
    .sort_values(by='Missing Values', ascending=False)
)

Missing Values Summary:
             Missing Values  Percentage
date                     0    0.000000
dcoilwtico              43    3.530378


# 4. Handling Missing Values

In [5]:
# Handle missing values for oil price (time-series data)
# Oil prices are a time series: parse the dates and order rows chronologically
df_oil['date'] = pd.to_datetime(df_oil['date'])
df_oil = df_oil.sort_values(by='date')

# Carry the last known price forward across gaps, then back-fill whatever is
# still empty at the start of the series (where there is no earlier price)
df_oil['dcoilwtico'] = df_oil['dcoilwtico'].ffill().bfill()

# Confirm that no missing values remain after filling
print("Missing Values After Handling:\n", df_oil.isnull().sum())


Missing Values After Handling:
 date          0
dcoilwtico    0
dtype: int64


## 5. Oil Price Distribution Analysis

This section analyzes the statistical characteristics of oil prices to understand their volatility and regime shifts.

In [None]:
# Oil Price Distribution Statistics
# Oil Price Distribution Statistics
# Summarises location, spread and shape of the 'dcoilwtico' price column and
# prints an interpretation that follows the computed values.
from scipy import stats

# Location and spread
mean_oil = df_oil['dcoilwtico'].mean()
median_oil = df_oil['dcoilwtico'].median()
std_oil = df_oil['dcoilwtico'].std()
min_oil = df_oil['dcoilwtico'].min()
max_oil = df_oil['dcoilwtico'].max()
q25_oil = df_oil['dcoilwtico'].quantile(0.25)
q75_oil = df_oil['dcoilwtico'].quantile(0.75)

# Distribution shape metrics.
# stats.kurtosis defaults to Fisher's definition, i.e. excess kurtosis
# (a normal distribution scores 0).
skewness_oil = stats.skew(df_oil['dcoilwtico'])
kurtosis_oil = stats.kurtosis(df_oil['dcoilwtico'])
cv_oil = (std_oil / mean_oil) * 100

# Mean-median gap, as a percentage of the median
mean_median_gap_oil = ((mean_oil - median_oil) / median_oil) * 100

# Price range, also expressed relative to the minimum price
price_range = max_oil - min_oil
price_range_pct = (price_range / min_oil) * 100

print("Oil Price Distribution Statistics:")
print("=" * 60)
print(f"Mean price: ${mean_oil:.2f}/barrel")
print(f"Median price: ${median_oil:.2f}/barrel")
print(f"Std deviation: ${std_oil:.2f}")
print(f"Min price: ${min_oil:.2f}/barrel")
print(f"Max price: ${max_oil:.2f}/barrel")
print(f"25th percentile: ${q25_oil:.2f}/barrel")
print(f"75th percentile: ${q75_oil:.2f}/barrel")

print("\nDistribution Characteristics:")
print("=" * 60)
print(f"Coefficient of Variation: {cv_oil:.1f}%")
print(f"Skewness: {skewness_oil:.2f}")
print(f"Kurtosis: {kurtosis_oil:.2f}")
print(f"Mean-Median gap: {mean_median_gap_oil:.1f}%")
print(f"Price range: ${price_range:.2f} ({price_range_pct:.0f}% swing)")

print("\n💰 Interpretation:")
# The skew direction is reported from the sign of the computed gap instead of
# being asserted unconditionally, so the text stays true if the data changes.
if mean_median_gap_oil > 0:
    print(f"- Mean {mean_median_gap_oil:.1f}% higher than median indicates RIGHT-SKEWED distribution")
elif mean_median_gap_oil < 0:
    print(f"- Mean {abs(mean_median_gap_oil):.1f}% lower than median indicates LEFT-SKEWED distribution")
else:
    print("- Mean equals median, indicating a roughly SYMMETRIC distribution")
print(f"- High CV ({cv_oil:.1f}%) shows EXTREME VOLATILITY in oil prices")
print(f"- Price swung {price_range_pct:.0f}% from min to max (${min_oil:.2f} to ${max_oil:.2f})")
# The "high-price period" reading only applies when the skewness is positive.
if skewness_oil > 0:
    print(f"- Positive skewness ({skewness_oil:.2f}) reflects 2013-2014 high-price period")
else:
    print(f"- Skewness ({skewness_oil:.2f}) is not positive: no long right tail of high prices")

## 6. Oil Price Regime Analysis

This section identifies distinct oil price regimes to understand the macroeconomic shocks that may have affected retail sales.

In [None]:
# Oil Price Regime Classification
# Define regimes based on price levels
# Oil Price Regime Classification
# Price thresholds (USD/barrel) separating the regimes. They are defined once
# so the regime bins, their labels and the crash analysis cannot drift apart.
crash_threshold = 40
high_threshold = 70

# Bin edges are chosen so each regime matches its label exactly:
#   Low    : price <  crash_threshold
#   Medium : crash_threshold <= price <= high_threshold
#   High   : price >  high_threshold
# Bins are left-closed (right=False). np.nextafter gives the smallest float
# above high_threshold, which keeps a price of exactly high_threshold in
# "Medium". The outer edges are infinite so no price falls outside the bins
# and silently becomes NaN.
regime_bins = [
    -np.inf,
    crash_threshold,
    np.nextafter(float(high_threshold), np.inf),
    np.inf,
]
regime_labels = [
    f'Low (<${crash_threshold})',
    f'Medium (${crash_threshold}-${high_threshold})',
    f'High (>${high_threshold})',
]
df_oil['regime'] = pd.cut(df_oil['dcoilwtico'],
                          bins=regime_bins,
                          labels=regime_labels,
                          right=False)

# observed=False is passed explicitly: every regime is reported even when it
# has no rows, and the result does not depend on the pandas default for
# categorical groupers.
regime_stats = (
    df_oil.groupby('regime', observed=False)['dcoilwtico']
    .agg(['count', 'mean', 'min', 'max'])
)
regime_pct = (regime_stats['count'] / len(df_oil)) * 100

print("Oil Price Regime Distribution:")
print("=" * 60)
for regime in regime_stats.index:
    count = regime_stats.loc[regime, 'count']
    mean_price = regime_stats.loc[regime, 'mean']
    min_price = regime_stats.loc[regime, 'min']
    max_price = regime_stats.loc[regime, 'max']
    pct = regime_pct.loc[regime]
    print(f"{regime}:")
    print(f"  Days: {count} ({pct:.1f}%)")
    print(f"  Avg: ${mean_price:.2f}, Range: ${min_price:.2f}-${max_price:.2f}")
    print()

# Count the days priced strictly below the crash threshold. With the bins
# above this is the same set of rows as the "Low" regime.
crash_days = (df_oil['dcoilwtico'] < crash_threshold).sum()
crash_pct = (crash_days / len(df_oil)) * 100

print(f"🚨 Oil Crash Analysis (prices <${crash_threshold}):")
print(f"  - {crash_days} days ({crash_pct:.1f}%) in crash territory")
print("  - This represents the 2015-2016 oil market collapse")
print("  - Ecuador's oil-dependent economy likely experienced severe retail impact")