#### 1.

In [None]:
import pandas as pd

# Load the sales data into `df`.
# Each expected failure mode prints a friendly message and is then re-raised:
# every statement that follows needs `df`, so carrying on after a failed load
# would only resurface later as a confusing NameError (or, in a notebook,
# silently reuse a stale `df` left over from an earlier run).
file_path = "sales_data.csv"

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found.")
    raise
except pd.errors.EmptyDataError:
    print("Error: File is empty.")
    raise
except pd.errors.ParserError:
    print("Error: File could not be parsed (check delimiter/format).")
    raise
except Exception as e:
    print(f"Unexpected error: {e}")
    raise
else:
    # Success path lives in `else` so that only read_csv itself is guarded
    print("File loaded successfully!")

# 1. 

# Per-category summary built with named aggregation:
# output column name -> (source column, aggregation)
category_metrics = {
    'Total_Quantity': ('Quantity', 'sum'),
    'Average_Price': ('Price', 'mean'),
    'Max_Quantity': ('Quantity', 'max'),
}
by_category = df.groupby('Category')
result = by_category.agg(**category_metrics)
result = result.reset_index()  # turn the 'Category' index back into a regular column

# Display the summary table
result

# 2.

# Top product per category, in three steps.
# Step 1: sum 'Quantity' for every (Category, Product) pair
quantity_per_product = df.groupby(['Category', 'Product'])['Quantity'].sum().reset_index()

# Step 2: order the pairs from the highest to the lowest total
ranked_totals = quantity_per_product.sort_values('Quantity', ascending=False)

# Step 3: keep only the first (i.e. highest-ranked) row of each category
top_products = ranked_totals.drop_duplicates('Category')

# Display the resulting table (dataframe)
top_products

# 3.

# New 'Revenue' column on df: 'Quantity' multiplied by 'Price', row by row
df['Revenue'] = df['Quantity'] * df['Price']

# Sum 'Revenue' per 'Date' and flatten into a two-column table ('Date', 'Revenue')
revenue_by_date = df.groupby('Date')['Revenue']
highest_sales = revenue_by_date.sum().reset_index()

# Find the row label of the largest daily total, then pull that row out
peak_label = highest_sales['Revenue'].idxmax()
max_sales = highest_sales.loc[peak_label]

# Display the date with the highest revenue
max_sales


#### 2.

In [None]:
import pandas as pd

# Load the customer orders into `df`.
# Each expected failure mode prints a friendly message and is then re-raised:
# `df` is also assigned by the sales-data section above, so swallowing the
# error here would let the analysis below run against the wrong dataframe
# (or hit a NameError if nothing was loaded before).
file_path = "customer_orders.csv"

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found.")
    raise
except pd.errors.EmptyDataError:
    print("Error: File is empty.")
    raise
except pd.errors.ParserError:
    print("Error: File could not be parsed (check delimiter/format).")
    raise
except Exception as e:
    print(f"Unexpected error: {e}")
    raise
else:
    # Success path lives in `else` so that only read_csv itself is guarded
    print("File loaded successfully!")


# 1a. Version 1

# Group once by customer; the grouped object is kept in its own variable for reuse
customer_group = df.groupby('CustomerID')

# Count the 'OrderID' entries of each customer into an 'Orders' column,
# then move 'CustomerID' from the index back into a regular column
order_count_spec = {'Orders': ('OrderID', 'count')}
orders = customer_group.agg(**order_count_spec).reset_index()

# Keep only the customers with fewer than 20 orders
is_low_volume = orders['Orders'] < 20
less_20_orders = orders[is_low_volume]

# Display the filtered dataframe
less_20_orders

# 1b. Version 2

# Alternative to version 1: count directly on the grouped 'OrderID' Series
# instead of going through agg(), then filter with query()

order_counts = df.groupby('CustomerID')['OrderID'].count()
order_counts = order_counts.reset_index(name='Orders')  # Series -> ('CustomerID', 'Orders') table
low_volume_customers = order_counts.query('Orders < 20')
# Display the result
low_volume_customers

# 2.

# Mean 'Price' per customer, exposed as an 'AvgPrice' column
avg_price_by_customer = df.groupby('CustomerID')['Price'].mean()
avg_price_table = avg_price_by_customer.reset_index(name='AvgPrice')

# Keep only the customers whose average price is above 120
customers_with_avgPrice = avg_price_table.query('AvgPrice > 120')

# Display the resulting dataframe
customers_with_avgPrice

# 3.

# Per-product sums of 'Quantity' and 'Price':
# output column name -> (source column, aggregation)
product_metrics = {
    'Total_quantity': ('Quantity', 'sum'),
    'Total_price': ('Price', 'sum'),
}
per_product = df.groupby('Product').agg(**product_metrics)

# Keep the products whose summed quantity is below 5, then move 'Product'
# from the index back into a regular column
rarely_sold = per_product.query('Total_quantity < 5')
total_products = rarely_sold.reset_index()

total_products

#### 3. 

In [None]:
import pandas as pd
import numpy as np
import sqlite3
from contextlib import closing

# 1.
# Load the whole `population` table from the SQLite file into the `db` dataframe.
#
# The database is opened read-only through a SQLite URI (mode=ro): a plain
# sqlite3.connect('population.db') silently creates an empty file when the
# database is missing, and the real problem then shows up only as a
# misleading "no such table" error. closing() releases the connection even
# if the read fails. No transaction block is needed for a single SELECT.
# Errors are reported and then re-raised because the analysis below needs `db`.

try:
    with closing(sqlite3.connect('file:population.db?mode=ro', uri=True)) as conn:
        query = "SELECT * FROM population"
        db = pd.read_sql(query, conn)
except sqlite3.Error as e:
    print(f"SQLite error: {e}")
    raise
except Exception as e:
    print(f"Unexpected error: {e}")
    raise
else:
    print("DB connected successfully!")


# 2.

# Load the Excel workbook with the salary bands into `band`.
# Each failure mode prints a friendly message and is then re-raised: the
# salary-band analysis below reads `band`, so continuing after a failed load
# would only resurface later as a confusing NameError (or reuse a stale
# `band` from an earlier run of the notebook).
file_path = "population_salary_analysis.xlsx"

try:
    band = pd.read_excel(file_path)
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found.")
    raise
except pd.errors.EmptyDataError:
    print("Error: File is empty.")
    raise
except pd.errors.ParserError:
    print("Error: File could not be parsed (check delimiter/format).")
    raise
except Exception as e:
    print(f"Unexpected error: {e}")
    raise
else:
    # Success path lives in `else` so that only read_excel itself is guarded
    print("File loaded successfully!")


# Labels (categories) for the salary bands, taken from the 'Salary Band' column.
# pd.cut needs exactly one label per interval, i.e. len(salary_interval) - 1 labels.
salary_cat = list(band['Salary Band'])

# Lower edge of every salary band, plus an open-ended upper limit
salary_interval = [0, 200_001.0, 400_001.0, 600_001.0, 800_001.0, 1_000_001.0, 1_200_001.0, 1_400_001.0, 1_600_001.0, 1_800_001.0, np.inf]

# Assign each row of db (the population table) to a band based on its 'salary'.
# right=False makes the bands left-closed: [0, 200_001), [200_001, 400_001), ...
# With the default right=True the same edges would give (0, 200_001], ...,
# which leaves a salary of 0 uncategorised (NaN) and puts a salary of exactly
# 200_001 into the lowest band.
# NOTE(review): assumes the sheet's labels describe bands such as
# "0 - 200,000", "200,001 - 400,000", ... - confirm against 'Salary Band'.
db['salary_cat'] = pd.cut(db['salary'], bins=salary_interval, labels=salary_cat, right=False)

# Per-band statistics:
#   percentage     - the band's rows as a percentage of all rows in db
#   average_sal    - mean salary
#   median_sal     - median salary
#   population_num - number of non-null 'id' values
# observed=False keeps bands that contain no rows in the output.
db_stats_sal = (
    db.groupby('salary_cat', observed=False)
    .agg(
        percentage=('id', lambda x: x.size / len(db) * 100),
        average_sal=('salary', 'mean'),
        median_sal=('salary', 'median'),
        population_num=('id', 'count'),
    )
    .round(2)
    .reset_index()
)

# The result dataframe
db_stats_sal

# 3. 

# Per-state statistics, declared as output column -> (source column, aggregation):
#   percentage     - the state's rows as a percentage of all rows in db
#   average_sal    - mean salary
#   median_sal     - median salary
#   population_num - number of non-null 'id' values
state_metrics = {
    'percentage': ('id', lambda ids: ids.size / len(db) * 100),
    'average_sal': ('salary', 'mean'),
    'median_sal': ('salary', 'median'),
    'population_num': ('id', 'count'),
}
by_state = db.groupby('state', observed=False)

# Aggregate, round every statistic to 2 decimals, and move 'state' back into a column
db_stats_state = by_state.agg(**state_metrics).round(2).reset_index()

# Display the resulting dataframe
db_stats_state


DB connected successfully!
File loaded successfully!


Unnamed: 0,state,percentage,average_sal,median_sal,population_num
0,Alabama,2.11,975074.14,986050.5,246
1,Alaska,0.46,1004373.19,970124.0,54
2,Arizona,2.17,1044694.76,1096324.0,253
3,Arkansas,0.51,1130035.15,1136530.0,60
4,California,11.43,998989.4,1017228.0,1332
5,Colorado,2.46,1030554.8,987728.0,287
6,Connecticut,1.56,1004350.45,1025583.0,182
7,Delaware,0.3,922087.8,987908.0,35
8,District of Columbia,3.18,1002836.82,961365.5,370
9,Florida,7.93,994446.22,972755.5,924
