# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset

In [None]:
# Read the sales CSV (path is relative to the working directory) into a
# DataFrame, then preview the first rows as the cell output.
df = pd.read_csv("sales_data_with_discounts.csv")
df.head()

In [None]:
# Dimensions of the loaded table as (rows, columns).
df.shape

# Detect numeric and categorical columns

In [None]:
# Sub-frame holding only the numeric-dtype columns of df.
# NOTE: despite its name this is a DataFrame, not a list of column names;
# iterating over it yields the column labels.
numeric_cols = df.select_dtypes(include=[np.number])
numeric_cols.head()

In [None]:
# Sub-frame holding only the object-dtype columns of df, which this notebook
# treats as categorical. Like numeric_cols, this is a DataFrame.
categorical_cols = df.select_dtypes(include=["object"])
categorical_cols.head()

# Descriptive statistics (numeric columns)

In [None]:
# Summary statistics for the numeric columns, transposed so that each row is
# one column of the data and each statistic is a column of the table.
desc = numeric_cols.describe().transpose()
desc

# Histograms and boxplots for numeric columns

In [None]:

# Draw one histogram per numeric column, each on its own figure.
# Missing values are dropped before binning; no KDE curve is overlaid.
for col_name in numeric_cols.columns:
    fig, ax = plt.subplots()
    sns.histplot(df[col_name].dropna(), kde=False, ax=ax)
    ax.set_title(f"Histogram - {col_name}")
    ax.set_xlabel(col_name)
    ax.set_ylabel("Frequency")

In [None]:
# Draw one horizontal boxplot per numeric column, each on its own figure.
for col_name in numeric_cols.columns:
    fig, ax = plt.subplots()
    sns.boxplot(x=df[col_name], ax=ax)
    ax.set_title(f"Boxplot - {col_name}")

# Skewness, kurtosis, and correlation of numeric columns

In [None]:
# Per-column skewness of the numeric data.
numeric_cols.skew()

In [None]:
# Per-column kurtosis. pandas reports Fisher's (excess) kurtosis, so a normal
# distribution scores 0.
numeric_cols.kurtosis()

In [None]:
# Pairwise correlation matrix of the numeric columns (pandas defaults to the
# Pearson method).
numeric_cols.corr()

# Bar charts for categorical columns (top 20)

In [None]:
# One bar chart per categorical column showing its 20 most frequent values.
for col_name in categorical_cols.columns:
    top_counts = df[col_name].value_counts().head(20)
    category_labels = top_counts.index.astype(str)  # labels as strings for the x axis

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(x=category_labels, y=top_counts.values, ax=ax)
    plt.xticks(rotation=45)  # tilt labels so long category names stay readable
    ax.set_title(f"Bar Chart - {col_name}")

# Standardization (Z-score)

In [None]:
# Z-score the numeric columns. Missing values are first replaced by the
# column mean because the scaler is fitted on the filled frame.
numeric_frame = df[numeric_cols.columns]
imputed_frame = numeric_frame.fillna(numeric_frame.mean())

scaler = StandardScaler()
scaled_arr = scaler.fit_transform(imputed_frame)

# Wrap the array back into a DataFrame with the original column names.
# No index is passed, so scaled_df gets a fresh default index.
scaled_df = pd.DataFrame(data=scaled_arr, columns=numeric_cols.columns)

In [None]:
# Preview the first rows of the standardized numeric columns.
scaled_df.head()


# Summary of numeric columns before and after scaling

In [None]:
# Side-by-side check of every numeric column before and after scaling.
#
# StandardScaler divides by the population standard deviation (ddof=0), while
# pandas' .std() defaults to the sample estimator (ddof=1). Both std columns
# therefore use ddof=0 explicitly, so "std_after" reads 1 for every
# non-constant column instead of sqrt(n / (n - 1)).
#
# The "before" statistics are taken from the raw columns of df (NaNs are
# skipped), not from the mean-imputed copy that was fed to the scaler.
summary_df = pd.DataFrame({
    "mean_before": df[numeric_cols.columns].mean(),
    "std_before": df[numeric_cols.columns].std(ddof=0),
    "mean_after": scaled_df.mean(),
    "std_after": scaled_df.std(ddof=0),
})

In [None]:
# Display the before/after scaling summary table.
summary_df

# One-hot encoding (dummy variables) of categorical columns

In [None]:
# Expand every categorical column into indicator (dummy) columns.
# Missing values are replaced by the literal "<<NA>>" first, so they get an
# indicator of their own. Indicator names have the form "<column>__<value>",
# and no level is dropped.
filled_categoricals = categorical_cols.fillna("<<NA>>")
df_dummies = pd.get_dummies(
    filled_categoricals,
    prefix=categorical_cols.columns,
    prefix_sep="__",
    drop_first=False,
)

# Give both pieces a fresh 0..n-1 index so they line up row by row, then
# join them side by side into the final feature table.
aligned_parts = [part.reset_index(drop=True) for part in (scaled_df, df_dummies)]
transformed_df = pd.concat(aligned_parts, axis=1)

In [None]:
# Display the combined table: scaled numeric columns followed by the
# one-hot indicator columns.
transformed_df

# Outlier detection using IQR

In [None]:
# Outlier detection with the 1.5 * IQR rule.
# For each numeric column, a value counts as an outlier when it lies strictly
# below q1 - 1.5 * IQR or strictly above q3 + 1.5 * IQR. Missing values are
# dropped before the quartiles are computed.
outlier_report = []
for name in numeric_cols.columns:
    values = df[name].dropna()
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # between() is inclusive on both ends, so its negation is exactly
    # "value < lower or value > upper".
    is_outlier = ~values.between(lower, upper)

    outlier_report.append(dict(
        column=name,
        q1=q1,
        q3=q3,
        iqr=iqr,
        lower=lower,
        upper=upper,
        n_outliers=int(is_outlier.sum()),
    ))

# One row per numeric column. To persist the report, call
# outlier_df.to_csv(<path>, index=False).
outlier_df = pd.DataFrame(outlier_report)
outlier_df