In [None]:
import pandas as pd
import numpy as np

**1. Sales Data Analysis**

Use Case: Analyze sales data for trends, customer segmentation, or performance.

Operations:
Count total sales per product category.

Calculate the total revenue generated by each sales representative.

Find the product with the highest sales.

Group data by sales regions and calculate average sales.

In [None]:
# Toy sales ledger: one row per product, recording its category, units sold,
# revenue, the sales representative and the sales region.
data = dict(
    product_category=['Electronics', 'Furniture', 'Electronics', 'Toys', 'Furniture'],
    product=['Laptop', 'Sofa', 'Phone', 'Toy Car', 'Table'],
    units_sold=[100, 50, 200, 80, 60],
    revenue=[1000, 1500, 2000, 1200, 1800],
    sales_rep=['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
    region=['North', 'South', 'North', 'West', 'South'],
)

# Each dict key becomes a column; the lists supply the row values.
df = pd.DataFrame.from_dict(data)

# Show the full five-row frame.
print(df)


  product_category  product  units_sold  revenue sales_rep region
0      Electronics   Laptop         100     1000     Alice  North
1        Furniture     Sofa          50     1500       Bob  South
2      Electronics    Phone         200     2000     Alice  North
3             Toys  Toy Car          80     1200   Charlie   West
4        Furniture    Table          60     1800       Bob  South


In [None]:
# Total units sold within each product category, kept as a flat two-column
# frame (category, summed units) instead of a category-indexed Series.
total_sales_per_category = (
    df.groupby('product_category', as_index=False)['units_sold'].sum()
)

print(total_sales_per_category)


  product_category  units_sold
0      Electronics         300
1        Furniture         110
2             Toys          80


In [None]:
# Revenue summed per sales representative, returned as a flat frame with
# one row per representative.
total_revenue_per_rep = (
    df.groupby('sales_rep', as_index=False)['revenue'].sum()
)

print(total_revenue_per_rep)

  sales_rep  revenue
0     Alice     3000
1       Bob     3300
2   Charlie     1200


In [None]:
# idxmax gives the index label of the largest units_sold value (the first
# such row if several tie); .loc then pulls that complete record.
top_seller_label = df['units_sold'].idxmax()
best_selling_product = df.loc[top_seller_label]

print(best_selling_product)

product_category    Electronics
product                   Phone
units_sold                  200
revenue                    2000
sales_rep                 Alice
region                    North
Name: 2, dtype: object


In [None]:
# Mean revenue per sales region, flattened back into ordinary columns.
avg_sales_per_region = (
    df.groupby('region', as_index=False)['revenue'].mean()
)

print(avg_sales_per_region)

  region  revenue
0  North   1500.0
1  South   1650.0
2   West   1200.0


**2. Employee Data Analysis**

Use Case: Manage and analyze employee-related data like salaries, departments, and performance.

Operations:
Count the number of employees per department.

Find the employee with the highest salary.

Calculate average salary per department.

Sort employees based on their performance score or salary.

In [None]:
# Small employee roster: id, department, salary and a performance score.
data = dict(
    employee_id=[1, 2, 3, 4, 5],
    department=['HR', 'IT', 'HR', 'Finance', 'IT'],
    salary=[60000, 90000, 65000, 75000, 80000],
    performance_score=[85, 92, 88, 76, 95],
)

df = pd.DataFrame.from_dict(data)
print(df.head())

# One grouping object serves both per-department summaries below.
by_department = df.groupby('department')

# Headcount: number of employee_id entries in each department.
employee_count_per_dept = by_department['employee_id'].count()
print(employee_count_per_dept)

# Full record of the top earner (first row wins if salaries tie).
top_salary_label = df['salary'].idxmax()
highest_salary_employee = df.loc[top_salary_label]
print(highest_salary_employee)

# Mean salary within each department.
avg_salary_per_dept = by_department['salary'].mean()
print(avg_salary_per_dept)

# Employees ranked from highest to lowest performance score.
sorted_employees = df.sort_values('performance_score', ascending=False)
print(sorted_employees)

   employee_id department  salary  performance_score
0            1         HR   60000                 85
1            2         IT   90000                 92
2            3         HR   65000                 88
3            4    Finance   75000                 76
4            5         IT   80000                 95
department
Finance    1
HR         2
IT         2
Name: employee_id, dtype: int64
employee_id              2
department              IT
salary               90000
performance_score       92
Name: 1, dtype: object
department
Finance    75000.0
HR         62500.0
IT         85000.0
Name: salary, dtype: float64
   employee_id department  salary  performance_score
4            5         IT   80000                 95
1            2         IT   90000                 92
2            3         HR   65000                 88
0            1         HR   60000                 85
3            4    Finance   75000                 76


**3. Financial Data (Stock Market)**

Use Case: Perform financial data analysis for stocks, assets, and investments.

Operations:
Calculate daily, weekly, or monthly stock returns.

Find the stock with the highest or lowest closing price.

Compute moving averages for stock prices.

Group stock data by industry or sector and calculate key metrics like P/E ratio or market cap.

In [None]:
# Five consecutive daily observations starting 2023-09-01.
# NOTE(review): every row carries a different stock_id, yet the return
# calculations below treat `close` as one continuous price series —
# confirm that this is the intended reading of the sample data.
data = {
    'date': pd.date_range(start='2023-09-01', periods=5, freq='D'),
    'close': [100, 102, 101, 103, 105],
    'stock_id': [1, 2, 3, 4, 5],
    'industry': ['Tech', 'Finance', 'Tech', 'Retail', 'Finance'],
    'price': [150, 200, 170, 80, 220],
    'earnings': [10, 20, 15, 8, 25],
    'market_cap': [1e9, 2e9, 1.5e9, 500e6, 2.2e9]
}

# Index the frame by date: resample() below needs a datetime-like index.
df = pd.DataFrame(data).set_index('date')

# Daily return: percentage change between consecutive rows (first row is NaN).
daily_returns = df['close'].pct_change()

# Weekly return: take the close as of each week-end label (forward-filled
# from the last observation at or before that date), then the percentage
# change between consecutive week-ends.
weekly_returns = df['close'].resample('W').ffill().pct_change()

# Monthly return: same idea at month-end. The offset object is used instead
# of a string alias because 'M' is deprecated since pandas 2.2 (in favour of
# 'ME'), while older releases do not recognise 'ME'; pd.offsets.MonthEnd()
# works on both. With a single month of data the only value is NaN.
monthly_returns = df['close'].resample(pd.offsets.MonthEnd()).ffill().pct_change()

print(daily_returns)
print(weekly_returns)
print(monthly_returns)

date
2023-09-01         NaN
2023-09-02    0.020000
2023-09-03   -0.009804
2023-09-04    0.019802
2023-09-05    0.019417
Name: close, dtype: float64
date
2023-09-03         NaN
2023-09-10    0.039604
Freq: W-SUN, Name: close, dtype: float64
date
2023-09-30   NaN
Freq: M, Name: close, dtype: float64


In [None]:
# Peek at the first five rows of the current frame.
df.head(5)

Unnamed: 0_level_0,close,stock_id,industry,price,earnings,market_cap
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-09-01,100,1,Tech,150,10,1000000000.0
2023-09-02,102,2,Finance,200,20,2000000000.0
2023-09-03,101,3,Tech,170,15,1500000000.0
2023-09-04,103,4,Retail,80,8,500000000.0
2023-09-05,105,5,Finance,220,25,2200000000.0


In [None]:
# Rows holding the highest and the lowest closing price. idxmax/idxmin
# return the index label of the extreme value; .loc fetches that record.
highest_close_label = df['close'].idxmax()
highest_closing_stock = df.loc[highest_close_label]
print(highest_closing_stock)

lowest_close_label = df['close'].idxmin()
lowest_closing_stock = df.loc[lowest_close_label]
print(lowest_closing_stock)

close                  105
stock_id                 5
industry           Finance
price                  220
earnings                25
market_cap    2200000000.0
Name: 2023-09-05 00:00:00, dtype: object
close                  100
stock_id                 1
industry              Tech
price                  150
earnings                10
market_cap    1000000000.0
Name: 2023-09-01 00:00:00, dtype: object


In [None]:
#Compute moving averages for stock prices.
df['moving_average'] = df['close'].rolling(window=3).mean()
print(df)

            close  stock_id industry  price  earnings    market_cap  \
date                                                                  
2023-09-01    100         1     Tech    150        10  1.000000e+09   
2023-09-02    102         2  Finance    200        20  2.000000e+09   
2023-09-03    101         3     Tech    170        15  1.500000e+09   
2023-09-04    103         4   Retail     80         8  5.000000e+08   
2023-09-05    105         5  Finance    220        25  2.200000e+09   

            moving_average  
date                        
2023-09-01             NaN  
2023-09-02             NaN  
2023-09-03           101.0  
2023-09-04           102.0  
2023-09-05           103.0  


In [None]:
#Group stock data by industry or sector and calculate key metrics like P/E ratio or market cap.
industry_metrics = df.groupby('industry').agg({'price': 'mean', 'earnings': 'sum', 'market_cap': 'sum'})
print(industry_metrics)

          price  earnings    market_cap
industry                               
Finance   210.0        45  4.200000e+09
Retail     80.0         8  5.000000e+08
Tech      160.0        25  2.500000e+09


**4. Customer Data Analysis**

Use Case: Analyze customer behavior or segmentation data.

Operations:
Group customers by location and calculate total purchases per city.

Find the customer who made the highest number of purchases.

Calculate the average amount spent per customer.

Sort customers by their total purchase value.

In [None]:
# Five customers, each with a home city and a total_purchase figure.
data = dict(
    customer_id=[1, 2, 3, 4, 5],
    city=['New York', 'Los Angeles', 'New York', 'Chicago', 'Los Angeles'],
    total_purchase=[250, 150, 300, 200, 180],
)

df = pd.DataFrame.from_dict(data)
print(df.head())

# Sum of total_purchase for the customers of each city.
total_purchases_per_city = df['total_purchase'].groupby(df['city']).sum()
print(total_purchases_per_city)

# Record of the customer with the largest total_purchase
# (the first such row if several tie).
top_customer_label = df['total_purchase'].idxmax()
customer_with_most_purchases = df.loc[top_customer_label]
print(customer_with_most_purchases)

# Customers ranked from largest to smallest total_purchase.
sorted_customers = df.sort_values('total_purchase', ascending=False)
print(sorted_customers)

# Mean total_purchase across all customers.
avg_purchase_per_customer = df['total_purchase'].mean()
print(avg_purchase_per_customer)

   customer_id         city  total_purchase
0            1     New York             250
1            2  Los Angeles             150
2            3     New York             300
3            4      Chicago             200
4            5  Los Angeles             180
city
Chicago        200
Los Angeles    330
New York       550
Name: total_purchase, dtype: int64
customer_id              3
city              New York
total_purchase         300
Name: 2, dtype: object
   customer_id         city  total_purchase
2            3     New York             300
0            1     New York             250
3            4      Chicago             200
4            5  Los Angeles             180
1            2  Los Angeles             150
216.0


**5. Healthcare Data**

Use Case: Analyze patient data or medical records to gain insights.

Operations:
Count the number of patients per hospital or department.

Calculate the average hospital stay per disease.

Identify the doctor with the most patients.

Sort patients based on age, condition severity, or treatment costs.

In [None]:
# Five patient records: where and by whom each was treated, the diagnosis,
# length of stay, age, a severity rating and the treatment cost.
data = dict(
    patient_id=[1, 2, 3, 4, 5],
    hospital=['City Hospital', 'County Hospital', 'City Hospital', 'General Hospital', 'County Hospital'],
    department=['Cardiology', 'Oncology', 'Cardiology', 'Neurology', 'Oncology'],
    disease=['Heart Disease', 'Cancer', 'Heart Disease', 'Stroke', 'Cancer'],
    hospital_stay_days=[5, 10, 7, 12, 9],
    doctor=['Dr. Smith', 'Dr. Johnson', 'Dr. Smith', 'Dr. Brown', 'Dr. Johnson'],
    age=[45, 60, 50, 70, 65],
    condition_severity=[3, 5, 4, 2, 4],
    treatment_cost=[10000, 20000, 15000, 18000, 12000],
)

df = pd.DataFrame.from_dict(data)

print(df.head())

# Number of patient_id entries in each department.
patient_count_per_dept = df['patient_id'].groupby(df['department']).count()
print(patient_count_per_dept)

# Mean length of stay (in days) for each disease.
avg_hospital_stay_per_disease = df['hospital_stay_days'].groupby(df['disease']).mean()
print(avg_hospital_stay_per_disease)

# Doctor appearing on the most patient rows. When several doctors share the
# top count, idxmax keeps whichever of them value_counts lists first.
patients_per_doctor = df['doctor'].value_counts()
doctor_with_most_patients = patients_per_doctor.idxmax()
print(doctor_with_most_patients)

# Patients ordered from oldest to youngest.
sorted_patients = df.sort_values('age', ascending=False)
print(sorted_patients)


   patient_id          hospital  department        disease  \
0           1     City Hospital  Cardiology  Heart Disease   
1           2   County Hospital    Oncology         Cancer   
2           3     City Hospital  Cardiology  Heart Disease   
3           4  General Hospital   Neurology         Stroke   
4           5   County Hospital    Oncology         Cancer   

   hospital_stay_days       doctor  age  condition_severity  treatment_cost  
0                   5    Dr. Smith   45                   3           10000  
1                  10  Dr. Johnson   60                   5           20000  
2                   7    Dr. Smith   50                   4           15000  
3                  12    Dr. Brown   70                   2           18000  
4                   9  Dr. Johnson   65                   4           12000  
department
Cardiology    2
Neurology     1
Oncology      2
Name: patient_id, dtype: int64
disease
Cancer            9.5
Heart Disease     6.0
Stroke           

**6. E-commerce Data Analysis**

Use Case: Analyze online sales, product views, and customer engagement data.

Operations:
Count total orders per product category.

Find the most viewed or most purchased product.

Calculate the average cart value of customers.

Sort products by total sales or views.

In [None]:
# Five orders: the product and its category, plus view count, purchase
# count and cart value recorded for each.
data = dict(
    order_id=[1, 2, 3, 4, 5],
    product_category=['Electronics', 'Clothing', 'Electronics', 'Home', 'Clothing'],
    product=['Phone', 'Shirt', 'Laptop', 'Sofa', 'Jacket'],
    views=[150, 200, 300, 100, 250],
    purchases=[20, 35, 40, 10, 25],
    cart_value=[1000, 200, 1500, 500, 300],
)

df = pd.DataFrame.from_dict(data)
print(df.head())

# Number of order_id entries in each product category.
order_count_per_category = df['order_id'].groupby(df['product_category']).count()
print(order_count_per_category)

   order_id product_category product  views  purchases  cart_value
0         1      Electronics   Phone    150         20        1000
1         2         Clothing   Shirt    200         35         200
2         3      Electronics  Laptop    300         40        1500
3         4             Home    Sofa    100         10         500
4         5         Clothing  Jacket    250         25         300
product_category
Clothing       2
Electronics    2
Home           1
Name: order_id, dtype: int64


In [None]:
#Find the most viewed or most purchased product.
most_viewed_product = df.loc[df['views'].idxmax()]
print(most_viewed_product)

most_purchased_product = df.loc[df['purchases'].idxmax()]
print(most_purchased_product)

order_id                      3
product_category    Electronics
product                  Laptop
views                       300
purchases                    40
cart_value                 1500
Name: 2, dtype: object
order_id                      3
product_category    Electronics
product                  Laptop
views                       300
purchases                    40
cart_value                 1500
Name: 2, dtype: object


In [None]:
#Calculate the average cart value of customers.
avg_cart_value = df['cart_value'].mean()
print(avg_cart_value)

700.0


In [None]:
#Sort products by total sales or views.
sorted_products = df.sort_values(by='purchases', ascending=False)
print(sorted_products)
sorted_products_views= df.sort_values(by='views', ascending=False)
print(sorted_products_views)

   order_id product_category product  views  purchases  cart_value
2         3      Electronics  Laptop    300         40        1500
1         2         Clothing   Shirt    200         35         200
4         5         Clothing  Jacket    250         25         300
0         1      Electronics   Phone    150         20        1000
3         4             Home    Sofa    100         10         500
   order_id product_category product  views  purchases  cart_value
2         3      Electronics  Laptop    300         40        1500
4         5         Clothing  Jacket    250         25         300
1         2         Clothing   Shirt    200         35         200
0         1      Electronics   Phone    150         20        1000
3         4             Home    Sofa    100         10         500
