# 1 Data Exploration
    a. Explore the dataset by displaying the first few rows, summary statistics, and data types of each column.
    b. Identify missing values, outliers, and unique values in categorical columns.

## 1.1 store sales
### 1.1.a Explore the dataset



In [None]:
from pathlib import Path
from matplotlib.colors import ListedColormap, BoundaryNorm
import seaborn as sns
import matplotlib.ticker as mtick
import pandas as pd
import matplotlib.pyplot as plt
import math

#### Oil

In [None]:
# 1. Overview: load the daily oil price series and print a first summary.

DATA_DIR = Path("data/assigment_1/store-sales-item-time-series")

# header=0 together with names= replaces the header row of the file with
# our own column names; the date column is parsed to datetime while reading.
oil_df = pd.read_csv(
    DATA_DIR / "oil.csv",
    names=["date", "oil_price"],
    header=0,
    parse_dates=["date"],
)

print("=== Head: ===")
print(oil_df.head())

print("=== Info: ===")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
oil_df.info()

print("=== Description: ===")
print(oil_df.describe())

In [None]:
# Date range covered by the oil series and the number of distinct dates.
first_day = oil_df["date"].min()
last_day = oil_df["date"].max()
distinct_days = oil_df["date"].nunique()

print("=== Time Period: ===")
print("Begin:", first_day, "\nEnd:", last_day)
# The label below is German for "days in total".
print("Tage insgesamt:", distinct_days)

#### holiday

In [None]:
# Load the holidays/events table and print a first summary.
# header=0 together with names= replaces the header row of the file with
# the column names given here.
holidays_df = pd.read_csv(
    DATA_DIR / "holidays_events.csv",
    names=["date", "type", "local", "local-name", "description", "transferred"],
    header=0,
    parse_dates=["date"],
)

print("=== Head: ===")
print(holidays_df.head())

print("=== Info: ===")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
holidays_df.info()

print("=== Description: ===")
print(holidays_df.describe())

In [None]:
# 2. Missing Values: number of NaN entries in each holidays column.
na_per_column = holidays_df.isna().sum()

print("=== How much NAs per Col: ===")
print(na_per_column)

#### Sample submission


In [None]:
# Load the sample submission file and print a first summary.
# header=0 together with names= replaces the header row of the file with
# the column names given here.
sample_submission = pd.read_csv(
    DATA_DIR / "sample_submission.csv",
    names=["id", "sales"],
    header=0,
)

print("=== Head: ===")
print(sample_submission.head())

print("=== Info: ===")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
sample_submission.info()

print("=== Description: ===")
print(sample_submission.describe())

#### Stores

In [None]:
# Load the store metadata table and print a first summary.
# header=0 together with names= replaces the header row of the file with
# the column names given here.
stores = pd.read_csv(
    DATA_DIR / "stores.csv",
    names=["store_nbr", "city", "state", "type", "cluster"],
    header=0,
)

print("=== Head: ===")
print(stores.head())

print("=== Info: ===")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
stores.info()

print("=== Description: ===")
print(stores.describe())

In [None]:

# Bar chart of the number of stores per store type, types in sorted order.
store_counts = stores["type"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 5))
store_counts.plot(kind="bar", ax=ax)
ax.set_title("Distribution of Stores by Type")
ax.set_xlabel("Store Type")
ax.set_ylabel("Number of Stores")
# Keep the category labels horizontal.
plt.xticks(rotation=0)
plt.show()


In [None]:


# Heatmap of how many stores fall into each (cluster, store type) pair.
cluster_type_ct = pd.crosstab(index=stores["cluster"], columns=stores["type"])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cluster_type_ct, annot=True, fmt="d", cmap="Greens", ax=ax)
ax.set_title("Cluster vs. Store Type")
ax.set_xlabel("Store Type")
ax.set_ylabel("Cluster")
plt.show()


In [None]:


# Heatmap of the number of stores per (city, cluster) pair, drawn with a
# discrete colour scale.
city_cluster_ct = pd.crosstab(index=stores["city"], columns=stores["cluster"])

# One palette colour per integer count from 0 up to the largest cell value.
max_count = int(city_cluster_ct.values.max())
values = range(max_count + 1)
palette = sns.color_palette("YlGnBu", len(values))
cmap = ListedColormap(palette)
norm = BoundaryNorm(values, cmap.N)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    city_cluster_ct,
    annot=True,
    fmt="d",
    cmap=cmap,
    norm=norm,
    cbar=True,
    linewidths=0.5,
    linecolor="gray",
    ax=ax,
)
ax.set_title("City vs. Cluster Distribution (Discrete)")
ax.set_xlabel("Cluster")
ax.set_ylabel("City")
plt.show()


#### transaction


In [None]:
# Load the per-store daily transaction counts and print a first summary.
# header=0 together with names= replaces the header row of the file with
# the column names given here.
transaction_df = pd.read_csv(
    DATA_DIR / "transactions.csv",
    names=["date", "store_nbr", "transaction"],
    header=0,
    parse_dates=["date"],
)

print("=== Head: ===")
print(transaction_df.head())

print("=== Info: ===")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
transaction_df.info()

print("=== Description: ===")
print(transaction_df.describe())

In [None]:
# Number of distinct stores that appear in the transactions table
# (last expression of the cell, so the notebook displays the value).
transaction_df["store_nbr"].nunique()

In [None]:
# Sum the transactions of all stores for each date and plot the series.
transactions_per_day = transaction_df.groupby("date")["transaction"].sum()

fig, ax = plt.subplots(figsize=(12, 5))
transactions_per_day.plot(ax=ax)
ax.set_title("Total Transactions per Day")
ax.set_xlabel("Date")
ax.set_ylabel("Transactions")
plt.show()

In [None]:
# Daily totals overlaid with rolling means over 7, 14 and 30 rows.
fig, ax = plt.subplots(figsize=(20, 10))

# Draw the raw series faded so the smoothed lines stand out.
transactions_per_day.plot(ax=ax, alpha=0.4, label="Daily")
for window in (7, 14, 30):
    smoothed = transactions_per_day.rolling(window).mean()
    smoothed.plot(ax=ax, label=f"{window}-day Avg")

ax.set_title("Total Transactions per Day with Rolling Averages")
ax.set_xlabel("Date")
ax.set_ylabel("Transactions")
ax.legend()
plt.show()


#### Train/ Test

In [None]:
# Load the training data and print a first summary.
# header=0 together with names= replaces the header row of the file with
# the column names given here.
train_df = pd.read_csv(
    DATA_DIR / "train.csv",
    names=["id", "date", "store_nbr", "family", "sales", "onpromotion"],
    header=0,
    parse_dates=["date"],
)

print("=== Head: ===")
print(train_df.head())

print("=== Info: ===")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
train_df.info()

print("=== Description: ===")
print(train_df.describe())

In [None]:
# Number of distinct values in the "family" column
# (last expression of the cell, so the notebook displays the value).
train_df["family"].nunique()

In [None]:
# Number of missing values in each column of the training data
# (last expression of the cell, so the notebook displays the result).
train_df.isna().sum()

In [None]:

# Total sales per date, summed over all rows that share the date.
daily_sales = train_df.groupby("date")["sales"].sum()

# Tick formatter that shows y values in millions with an "M" suffix.
millions = mtick.FuncFormatter(lambda x, _: f'{float(x/1e6)}M')

fig, ax = plt.subplots(figsize=(12, 5))
daily_sales.plot(ax=ax)
ax.yaxis.set_major_formatter(millions)
ax.set_title("Total Sales per Day")
ax.set_xlabel("Date")
ax.set_ylabel("Sales")
plt.show()



In [None]:

# Total sales per weekday name.
day_names = train_df["date"].dt.day_name()

# Define the order for the weekdays (Monday first) instead of the
# alphabetical order produced by the groupby.
order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
weekday_sales = train_df["sales"].groupby(day_names).sum().reindex(order)

# Bar chart with y values shown in millions.
ax = weekday_sales.plot(kind="bar")
ax.yaxis.set_major_formatter(mtick.FuncFormatter(lambda x, _: f'{float(x/1e6)}M'))
ax.tick_params(axis="x", rotation=45)


### 1.1.b Identify missing data and outliers

#### missing data

In [None]:
# Missing oil prices: number of NaN entries in each column of the oil table.
oil_na_per_column = oil_df.isna().sum()

print("=== How much NAs per Col: ===")
print(oil_na_per_column)

In [None]:
# Visualize the oil price and mark every date whose price is missing.
import matplotlib.pyplot as plt

# Price series indexed by date.
s = oil_df.set_index("date")["oil_price"]
missing_dates = s.index[s.isna()]

fig, ax = plt.subplots(figsize=(12, 5))
s.plot(ax=ax)

# One faint dashed vertical line per date without a price.
for missing_date in missing_dates:
    ax.axvline(
        missing_date,
        color="tab:orange",
        linestyle="--",
        linewidth=0.8,
        alpha=0.3,
    )

ax.set_title("Daily Oil Price")
plt.show()

In [None]:
# Per-store view of calendar days without any recorded transactions.

# Normalize timestamps to midnight BEFORE aggregating, so that every
# (store, day) pair ends up in exactly one row; the reindex further down
# needs a duplicate-free date index per store.
tpd = (transaction_df
       .assign(date=lambda df: pd.to_datetime(df['date']).dt.normalize())
       .groupby(['store_nbr', 'date'], as_index=False)['transaction']
       .sum()
       .sort_values(['store_nbr', 'date']))

# Every calendar day between the first and the last observed date.
full_idx = pd.date_range(tpd['date'].min(), tpd['date'].max(), freq='D')

# Number of calendar days without a row, per store.
dates_by_store = tpd.groupby('store_nbr')['date'].unique()
missing_counts = {s: len(full_idx.difference(pd.DatetimeIndex(dates)))
                  for s, dates in dates_by_store.items()}
stores_with_gaps = [s for s, cnt in missing_counts.items() if cnt > 0]

# Stores with the most missing days come first.
stores_to_plot = sorted(stores_with_gaps, key=missing_counts.get, reverse=True)

n = len(stores_to_plot)
if n == 0:
    # plt.subplots() rejects a grid with zero rows, so there is nothing to draw.
    print("No store has missing days in the observed date range.")
else:
    # Subplots: three columns unless only a single store is shown.
    cols = 3 if n > 1 else 1
    rows = math.ceil(n / cols)

    # squeeze=False always returns a 2-D array of Axes, so no special case
    # is needed for a 1x1 grid.
    fig, axes = plt.subplots(rows, cols, figsize=(14, 3.2*rows),
                             sharex=True, squeeze=False)
    axes = axes.ravel()

    for ax, s in zip(axes, stores_to_plot):
        # Reindexing to the full calendar inserts NaN rows for missing days.
        sdf = (tpd[tpd['store_nbr'] == s]
               .set_index('date')
               .reindex(full_idx))
        ax.plot(sdf.index, sdf['transaction'], linewidth=1.1)
        # Mark the missing days with red triangles at the lower y-limit.
        miss = sdf['transaction'].isna()
        if miss.any():
            ymin, ymax = ax.get_ylim()
            ax.scatter(sdf.index[miss], [ymin] * int(miss.sum()),
                       marker='v', s=16, alpha=0.7, color="red")
        ax.set_title(f"Store {s}  (missing days: {missing_counts[s]})", fontsize=10)

    # Hide the unused cells of the grid.
    for ax in axes[n:]:
        ax.axis('off')

    fig.tight_layout()
    plt.show()


Most stores are missing 10-12 days, clustered around Christmas and New Year, but some stores have considerably more missing days.

In [None]:
# One boxplot of daily transaction counts per store.

# Work on a copy so the date normalization does not touch transaction_df.
tpd_df = transaction_df.copy()
tpd_df['date'] = pd.to_datetime(tpd_df['date']).dt.normalize()

# One row per (store, day) with the summed transactions.
grouped = tpd_df.groupby(['store_nbr', 'date'], as_index=False)['transaction']
tpd = grouped.sum().sort_values(['store_nbr', 'date'])

stores_to_plot = sorted(tpd['store_nbr'].unique().tolist())

# Grid layout: more columns the more stores there are.
n = len(stores_to_plot)
if n >= 12:
    cols = 4
elif n >= 6:
    cols = 3
elif n > 1:
    cols = 2
else:
    cols = 1
rows = math.ceil(n / cols)

# squeeze=False always returns a 2-D array of Axes, which is then flattened.
fig, axes = plt.subplots(rows, cols, figsize=(3.2*cols, 3.8*rows), squeeze=False)
axes = axes.ravel()

for ax, store in zip(axes, stores_to_plot):
    is_store = tpd['store_nbr'] == store
    svals = tpd.loc[is_store, 'transaction']
    sns.boxplot(y=svals, ax=ax, fliersize=2)
    ax.set_title(f"Store {store} (n={svals.notna().sum()})", fontsize=9)
    ax.grid(True, alpha=0.2)

# Hide the unused cells of the grid.
for unused_ax in axes[n:]:
    unused_ax.axis('off')

fig.tight_layout()
plt.show()


It looks like every store's daily transaction counts contain outliers (points beyond the boxplot whiskers).

# 2 Data Cleaning
    a. Handling Missing Values
    b. Choose appropriate methods to handle missing values (e.g., mean/median imputation for numerical data, mode imputation for categorical data, or deletion of rows/columns).
    c. Justify your choices for handling missing data.

# 3 Handling Outliers
    a. Detect outliers using methods such as the IQR method or Z-score.
    b. Decide whether to remove, cap, or transform the outliers. Justify your decisions.

# 4 Data Transformation
    a. Encoding Categorical Data
        i. Apply label encoding or one-hot encoding to transform categorical data into numerical form.
        ii. Justify your choice of encoding method.
    b. Feature Scaling
        i. Apply feature scaling techniques such as normalization (Min-Max scaling) or standardization (Z-score normalization) to the dataset.
        ii. Explain why feature scaling is necessary and how it impacts the model.


# 5 Data Splitting
    a. Split the preprocessed dataset into training and testing sets. Typically, an 80-20 or 70-30 split is used.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training dataset and put it in chronological order. The stable
# sort keeps the file order of rows that share the same date.
train_df = pd.read_csv("train.csv", parse_dates=["date"])
train_df = train_df.sort_values("date", kind="stable")

# Features and target
X = train_df.drop("sales", axis=1)
y = train_df["sales"]

# Split 80% train, 20% test in time order: with shuffle=False the first 80%
# of the (date-sorted) rows become the training set and the last 20% the
# test set. A random shuffle would mix later dates into the training data
# and make the test score overly optimistic for a time series.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Check the shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (383753, 5)
X_test shape: (95939, 5)
y_train shape: (383753,)
y_test shape: (95939,)


    b. Explain the importance of splitting the data and how it prevents overfitting.

We split the dataset into training (80%) and testing (20%) sets. The training set is used to teach the model, while the testing set evaluates its performance on unseen data. This prevents overfitting, ensuring the model generalizes well rather than just memorizing the training data.

# 6 Bonus
Apply dimensionality reduction techniques such as Principal
Component Analysis (PCA) and discuss how it affects the dataset.