### Sales data

In [3]:
import pandas as pd

# Load the raw sales data. A forward-slash relative path is used because it is
# accepted on Windows as well as on Linux/macOS, whereas the previous
# backslash-separated path only resolved on Windows.
df_sales = pd.read_csv('task/sales_data.csv')
df_sales.head()

Unnamed: 0,Date,Product,Category,Quantity,Price
0,2023-01-01,Laptop,Electronics,10,800
1,2023-01-01,T-Shirt,Clothing,5,20
2,2023-01-02,Smartphone,Electronics,8,400
3,2023-01-02,Coffee Maker,Home,12,50
4,2023-01-03,Jeans,Clothing,15,30


In [8]:
# 1. Group the sales rows by Category and compute, for each category:
#    - Total_quan_sold: total quantity sold
#    - Avg_price:       average price per unit (plain mean of the Price column)
#    - Max_quan_sold:   largest quantity found in a single row (transaction)
category_sales = (
    df_sales
    .groupby('Category')
    .agg(
        Total_quan_sold=pd.NamedAgg(column='Quantity', aggfunc='sum'),
        Avg_price=pd.NamedAgg(column='Price', aggfunc='mean'),
        Max_quan_sold=pd.NamedAgg(column='Quantity', aggfunc='max'),
    )
    .reset_index()
)
category_sales

Unnamed: 0,Category,Total_quan_sold,Avg_price,Max_quan_sold
0,Clothing,157,31.176471,15
1,Electronics,183,276.764706,15
2,Home,144,55.0,14


In [18]:
# 2. Top-selling product in each category, ranked by total quantity sold.
# Sum the quantity per (Category, Product) pair, order every category's products
# from highest to lowest total, then keep the first row of each category.
top_product = (
    df_sales
    .groupby(['Category', 'Product'])['Quantity']
    .sum()
    .reset_index()
    .sort_values(['Category', 'Quantity'], ascending=[True, False])
)
top_selling_prod = top_product.groupby('Category').first()
top_selling_prod

Unnamed: 0_level_0,Product,Quantity
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Clothing,Jeans,15
Electronics,Smart TV,15
Home,Pressure Cooker,14


In [22]:
# 3. Date with the highest total sales, where the sales of a row are Quantity * Price.
# The per-row value is stored on df_sales as the 'Total_sales' column.
df_sales['Total_sales'] = df_sales['Quantity'].mul(df_sales['Price'])
# Sum the row-level sales per Date and pick the Date label of the largest total.
daily_sales = df_sales.groupby('Date')['Total_sales'].sum()
highest_sales_date = daily_sales.idxmax()
print("\nDate with Highest Sales:", highest_sales_date)


Date with Highest Sales: 2023-01-07


In [24]:
# Load the customer orders. A forward-slash relative path is used because it is
# accepted on Windows as well as on Linux/macOS, whereas the previous
# backslash-separated path only resolved on Windows.
df_customer = pd.read_csv('task/customer_orders.csv')
df_customer

Unnamed: 0,OrderID,CustomerID,Product,Quantity,Price
0,1,101,Laptop,2,800
1,2,102,Headphones,1,150
2,3,103,Smartphone,3,400
3,4,101,External Hard Drive,2,80
4,5,102,Backpack,1,40
...,...,...,...,...,...
95,96,103,Blender,1,60
96,97,104,Sport Shoes,2,40
97,98,105,Comforter Set,1,45
98,99,101,Wireless Earbuds,2,120


### Customer data

In [35]:
# 1. Count the orders of each customer, then filter out customers with fewer
#    than 20 orders (i.e. keep the ones with 20 or more).
order_count = (
    df_customer
    .groupby('CustomerID')
    .size()
    .reset_index(name='OrderCount')
)
# Despite its name, `less_20order` holds the customers that survive the filter.
has_min_orders = order_count['OrderCount'] >= 20
less_20order = order_count.loc[has_min_orders]
less_20order

Unnamed: 0,CustomerID,OrderCount
0,101,21
1,102,21
2,103,20
3,104,20


In [32]:
# 2. Customers whose ordered products have an average unit price above $120.
# The average is the plain mean of the Price column over the customer's rows.
avg_price = (
    df_customer
    .groupby('CustomerID')['Price']
    .mean()
    .reset_index()
)
above_threshold = avg_price['Price'] > 120
# Rounding happens after filtering, so the comparison uses the unrounded mean.
avg_price_cust = avg_price.loc[above_threshold].round(2)
avg_price_cust

Unnamed: 0,CustomerID,Price
1,102,138.1
3,104,169.75


In [36]:
# 3. Total quantity and total price for each product ordered, filtering out
#    products whose total quantity is less than 5 units.
# Total_price is the sum of the unit Price column over the product's rows.
# NOTE(review): if order revenue was intended, it would be sum(Quantity * Price) — confirm.
prod_sales = (
    df_customer
    .groupby('Product')
    .agg(Total_quan=('Quantity', 'sum'), Total_price=('Price', 'sum'))
    .reset_index()
)
# Only totals *below* 5 are filtered out, so a product with exactly 5 units must
# be kept; the previous `> 5` comparison wrongly dropped it.
le5_sales = prod_sales[prod_sales['Total_quan'] >= 5]
le5_sales

Unnamed: 0,Product,Total_quan,Total_price
5,Cargo Pants,6,120
19,Formal Shirt,6,105
41,Wireless Earbuds,6,360


### Population data

In [41]:
import sqlite3
import pandas as pd

# Read the whole `population` table from the SQLite database into a DataFrame.
# The connection is closed as soon as the query has been materialised (it was
# previously left open for the lifetime of the kernel), and a forward-slash path
# is used so the notebook also runs outside Windows.
# NOTE(review): assumes no other cell reuses `conn` after this point — confirm.
conn = sqlite3.connect("task/population.db")
try:
    population_df = pd.read_sql_query("SELECT * FROM population", conn)
finally:
    conn.close()
population_df.head()

Unnamed: 0,id,first_name,last_name,email,gender,salary,state
0,1,Armin,Coltart,acoltart0@abc.net.au,Male,368693,District of Columbia
1,2,Mia,Tuddenham,mtuddenham1@addthis.com,Female,154398,Florida
2,3,Kirsteni,Brafield,kbrafield2@arizona.edu,Female,1230304,Georgia
3,4,Phylis,Furlong,pfurlong3@merriam-webster.com,Female,1567795,California
4,5,Wandis,Loveredge,wloveredge4@hatena.ne.jp,Female,1136950,Alabama


In [50]:
# Load the salary band definitions (the 'Salary Band' column is parsed in a
# later cell). A forward-slash relative path is used because it is accepted on
# Windows as well as on Linux/macOS, unlike the previous backslash-separated path.
salary_bands = pd.read_excel("task/population_salary_analysis.xlsx")

In [49]:
# Force the salary column to a numeric dtype; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
population_df['salary'] = population_df['salary'].pipe(pd.to_numeric, errors='coerce')

In [53]:
# Parse the numeric lower/upper edge out of every 'Salary Band' label and store
# them as the float columns 'Min' and 'Max'.
#
# The previous pattern r'(\d+)-(\d+)' only matched bare digits directly touching
# an ASCII hyphen. Labels such as "$10,000 - $20,000" or "10000 – 20000" then
# produced NaN (or wrong) edges without any error, and every salary compared
# against NaN edges ends up in the 'Other' band.
# The pattern below additionally accepts thousands separators, decimals,
# surrounding spaces, en/em dashes or the word "to" as separator, and an optional
# currency symbol in front of the upper edge. Plain "10-20" labels parse as before.
# NOTE(review): the real label format in the workbook is not visible here —
# confirm that it is covered by this pattern.
salary_band_edges = (
    salary_bands['Salary Band']
    .str.extract(r'(\d[\d,]*(?:\.\d+)?)\s*(?:-|–|—|to)\s*[$€£]?\s*(\d[\d,]*(?:\.\d+)?)')
    .replace(',', '', regex=True)  # strip thousands separators before the float cast
    .astype(float)
)
salary_bands['Min'] = salary_band_edges[0]
salary_bands['Max'] = salary_band_edges[1]

# Categorize salaries
def categorize_salary(salary, bands=None):
    """Return the label of the first salary band whose [Min, Max] range contains ``salary``.

    Parameters
    ----------
    salary : number
        The salary to classify. NaN never satisfies a range comparison, so it
        is reported as 'Other'.
    bands : pandas.DataFrame, optional
        Table with 'Salary Band', 'Min' and 'Max' columns. When omitted, the
        notebook-level ``salary_bands`` frame is used (looked up at call time),
        which keeps existing ``categorize_salary(salary)`` calls working.

    Returns
    -------
    The 'Salary Band' value of the first row (in table order) with
    ``Min <= salary <= Max`` (both edges inclusive), or 'Other' if no row matches.
    """
    if bands is None:
        bands = salary_bands
    # Walk the three columns in lockstep instead of using iterrows(): iterrows
    # builds a new Series per band row, and this function runs once per salary.
    for label, lower, upper in zip(bands['Salary Band'], bands['Min'], bands['Max']):
        if lower <= salary <= upper:
            return label
    return 'Other'

In [55]:
# Label every person with a salary band by running categorize_salary on each
# salary value; salaries that match no band are labelled 'Other'.
population_df['Salary Band'] = population_df['salary'].map(categorize_salary)

In [61]:
# 1. Stats by salary band: head count, mean and median salary per band.
# 'count' only counts non-null salaries.
band_stats = (
    population_df
    .groupby('Salary Band')['salary']
    .agg(Population_Count='count', Average_Salary='mean', Median_Salary='median')
    .reset_index()
)
# Share of each band relative to the total number of rows in population_df.
total_population = len(population_df)
band_stats['Percentage_of_Population'] = band_stats['Population_Count'].div(total_population).mul(100)
band_stats

Unnamed: 0,Salary Band,Population_Count,Average_Salary,Median_Salary,Percentage_of_Population
0,Other,11651,992628.157411,990301.0,100.0


In [62]:
# 2. Stats by state and salary band: head count, mean and median salary for
#    every (state, band) pair, with the state column exposed as 'State'.
state_band_stats = (
    population_df
    .groupby(['state', 'Salary Band'])
    .agg(
        Population_Count=('salary', 'count'),
        Average_Salary=('salary', 'mean'),
        Median_Salary=('salary', 'median'),
    )
    .reset_index()
    .rename(columns={'state': 'State'})
)
# Percentage of each band within its own state: the band's count divided by the
# sum of the counts of all bands of that state.
state_population = state_band_stats.groupby('State')['Population_Count'].transform('sum')
state_band_stats['Percentage_of_Population'] = (
    state_band_stats['Population_Count'] / state_population
) * 100
state_band_stats.head()

Unnamed: 0,State,Salary Band,Population_Count,Average_Salary,Median_Salary,Percentage_of_Population
0,Alabama,Other,246,975074.1,986050.5,100.0
1,Alaska,Other,54,1004373.0,970124.0,100.0
2,Arizona,Other,253,1044695.0,1096324.0,100.0
3,Arkansas,Other,60,1130035.0,1136530.0,100.0
4,California,Other,1332,998989.4,1017228.0,100.0
