In [1]:
import pandas as pd
df = pd.read_csv('sales_data.csv')

In [2]:
# Group the data by the Category column and calculate the following aggregate statistics for each category:
# Total quantity sold.
# Average price per unit.
# Maximum quantity sold in a single transaction.

category_stats = df.groupby('Category').agg(
    total_quantity_sold=('Quantity', 'sum'),
    average_price_per_unit=('Price', 'mean'),
    max_quantity_sold=('Quantity', 'max')
)
category_stats

Unnamed: 0_level_0,total_quantity_sold,average_price_per_unit,max_quantity_sold
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Clothing,157,31.176471,15
Electronics,183,276.764706,15
Home,144,55.0,14


In [3]:
#Identify the top-selling product in each category based on the total quantity sold.


top_selling_products = df.groupby(['Category', 'Product'])['Quantity'].sum().reset_index()
top_selling_products = top_selling_products.sort_values(['Category', 'Quantity'], ascending=[True, False])
top_selling_products = top_selling_products.groupby('Category').first().reset_index()


top_selling_products

Unnamed: 0,Category,Product,Quantity
0,Clothing,Jeans,15
1,Electronics,Smart TV,15
2,Home,Pressure Cooker,14


In [4]:
# Find the date on which the highest total sales (quantity * price) occurred.



df['Total_Sales'] = df['Quantity'] * df['Price']
highest_sales_date = df.groupby('Date')['Total_Sales'].sum().idxmax()
highest_sales_date

'2023-01-07'

In [5]:
df = pd.read_csv('customer_orders.csv')

In [6]:
# Group the data by CustomerID and filter out customers who have made less than 20 orders.


customer_order_counts = df.groupby('CustomerID').size().reset_index(name='Order_Count')
frequent_customers = customer_order_counts[customer_order_counts['Order_Count'] >= 20]
frequent_customers

Unnamed: 0,CustomerID,Order_Count
0,101,21
1,102,21
2,103,20
3,104,20


In [7]:
# Identify customers who have ordered products with an average price per unit greater than $120.

customer_avg_price = df.groupby('CustomerID')['Price'].mean().reset_index()
high_value_customers = customer_avg_price[customer_avg_price['Price'] > 120]
high_value_customers

Unnamed: 0,CustomerID,Price
1,102,138.095238
3,104,169.75


In [8]:
#  Find the total quantity and total price for each product ordered, and filter out products that have a total quantity less than 5 units.


product_totals = df.groupby('Product').agg(
    total_quantity=('Quantity', 'sum'),
    total_price=('Price', 'sum')
).reset_index()

filtered_products = product_totals[product_totals['total_quantity'] >= 5]
filtered_products

Unnamed: 0,Product,total_quantity,total_price
5,Cargo Pants,6,120
15,Dress Shirt,5,50
19,Formal Shirt,6,105
30,Smartphone,5,800
32,Sport Shoes,5,120
35,Sunglasses,5,60
41,Wireless Earbuds,6,360


In [9]:
import sqlalchemy as sa
import pandas as pd


connection = 'sqlite:///population.db' 
engine = sa.create_engine(connection)
con = engine.connect()

population = pd.read_sql("SELECT * FROM population", con=con)

con.close()

In [12]:
population_salary = pd.read_excel('population_salary_analysis.xlsx')

In [None]:
import numpy as np
print(population.columns)


bins = [0, 200000, 400000, 600000, 800000, 1000000, 1200000, 1400000, 1600000, 1800000, np.inf]
labels = [
    "till $200,000",
    "$200,001 - $400,000",
    "$400,001 - $600,000",
    "$600,001 - $800,000",
    "$800,001 - $1,000,000",
    "$1,000,001 - $1,200,000",
    "$1,200,001 - $1,400,000",
    "$1,400,001 - $1,600,000",
    "$1,600,001 - $1,800,000",
    "$1,800,001 and over"
]

population['Salary_Category'] = pd.cut(population['salary'], bins=bins, labels=labels, right=False)

salary_stats = population.groupby('Salary_Category').agg(
    percentage_population=('salary', lambda x: len(x) / len(population) * 100),
    average_salary=('salary', 'mean'),
    median_salary=('salary', 'median'),
    population_count=('salary', 'size')
).reset_index()

salary_stats

Index(['id', 'first_name', 'last_name', 'email', 'gender', 'salary', 'state'], dtype='object')


  salary_stats = population.groupby('Salary_Category').agg(


Unnamed: 0,Salary_Category,percentage_population,average_salary,median_salary,population_count
0,"till $200,000",9.87898,99283.99,98800.0,1151
1,"$200,001 - $400,000",10.042056,299558.1,299882.0,1170
2,"$400,001 - $600,000",10.591366,499164.0,497925.5,1234
3,"$600,001 - $800,000",9.921895,699680.9,701317.0,1156
4,"$800,001 - $1,000,000",10.084971,901152.3,899845.0,1175
5,"$1,000,001 - $1,200,000",10.531285,1098524.0,1097765.0,1227
6,"$1,200,001 - $1,400,000",9.707321,1300685.0,1300430.0,1131
7,"$1,400,001 - $1,600,000",9.715904,1499606.0,1500623.0,1132
8,"$1,600,001 - $1,800,000",9.612909,1698519.0,1697481.5,1120
9,"$1,800,001 and over",9.913312,1902892.0,1906451.0,1155


In [None]:
print(population.columns)

# Calculate measures for each state
state_salary_stats = population.groupby('state').agg(
    percentage_population=('salary', lambda x: len(x) / len(population) * 100),
    average_salary=('salary', 'mean'),
    median_salary=('salary', 'median'),
    population_count=('salary', 'size')
).reset_index()

state_salary_stats

Index(['id', 'first_name', 'last_name', 'email', 'gender', 'salary', 'state',
       'Salary_Category'],
      dtype='object')


Unnamed: 0,state,percentage_population,average_salary,median_salary,population_count
0,Alabama,2.111407,975074.1,986050.5,246
1,Alaska,0.46348,1004373.0,970124.0,54
2,Arizona,2.171487,1044695.0,1096324.0,253
3,Arkansas,0.514977,1130035.0,1136530.0,60
4,California,11.432495,998989.4,1017228.0,1332
5,Colorado,2.463308,1030555.0,987728.0,287
6,Connecticut,1.562098,1004350.0,1025583.0,182
7,Delaware,0.300403,922087.8,987908.0,35
8,District of Columbia,3.175693,1002837.0,961365.5,370
9,Florida,7.93065,994446.2,972755.5,924
