# In [None]:
'''
INFO_511_ Application Exercise 02: Diwali-Sales
Author: Todd Adams
Date: 04/06/2025
Description: We are evaluating the dataset data/diwali_sales_data.csv
Note: I used VS Code and ChatGPT to help me write this code.
'''
# Exercise 1: Reading and examining the data

# Load Packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
df = pd.read_csv('data/diwali_sales_data.csv', encoding='iso-8859-1')

# Display the first 5 rows
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell, so print it explicitly to make the preview actually appear.
print(df.head())

# Basic information about the dataset
print("Basic information about the dataset:")
df.info()  # info() writes its report to stdout itself, so no print() is needed

# Summary statistics of the dataset
# describe() returns a DataFrame; print it so the statistics are not discarded.
print("Summary statistics of the dataset:")
print(df.describe())

'''
The dataset contains sales information from a Diwali campaign.  
Using .info(), we observed that some columns have missing values (notably Amount and Product_Category, among others).  
The .describe() method showed that while some numeric fields like Amount and Age have wide ranges, others are more constrained.  
This suggests we should check for potential outliers or anomalies in these fields.
'''

# Exercise 2: Exploring unique levels, outliers, and missing values

# 1. Exploring unique levels
# Columns stored with the 'object' dtype are treated as the categorical ones;
# for each of them, list every distinct value followed by a divider line.
categorical_cols = df.select_dtypes(include='object').columns

separator = '-' * 40
for col in categorical_cols:
    # One print call per column: heading, distinct values, divider (one per line)
    print(f"Unique values in {col}:", df[col].unique(), separator, sep='\n')

# 2. Identifying and visualizing outliers

# Draw a horizontal box plot of the 'Amount' column so that outliers show up
# as individual points beyond the whiskers
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x=df['Amount'], ax=ax)
ax.set_title('Boxplot of Purchase Amounts')
plt.show()


# Identify outliers using the IQR method and count the number of outliers for each numerical column
# A value is flagged when it lies more than 1.5 * IQR below the first quartile
# or more than 1.5 * IQR above the third quartile.
numeric_cols = df.select_dtypes(include='number').columns

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # Summing the boolean mask counts the flagged rows directly, without
    # materialising a filtered copy of the whole DataFrame first.
    outlier_count = int(((df[col] < lower) | (df[col] > upper)).sum())
    print(f"{col}: {outlier_count} outliers")

# 3. Handle missing values:
# Tally the null entries in every column and report the per-column totals
missing_values = df.isna().sum(axis=0)
print(missing_values)

# Optional: visualizing missing values
# seaborn is already imported as `sns` at the top of the file, so it is not
# re-imported here.
# Each cell of the heatmap is one (row, column) entry of df.isnull(), so null
# and non-null entries are drawn in two different colours.
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Value Heatmap")
plt.show()

'''
Upon checking for missing values, we noticed that columns such as Amount and Product_Category contain some nulls.  
These will need to be addressed before further analysis.  
For example, we might choose to drop rows with missing Amount values as they are critical for sales analysis.  
Unique value exploration revealed that some columns (e.g., Age, City_Category)  
have a limited number of unique values, which is useful for grouping and segmenting customers.
'''
