In [1]:
import pandas as pd
import numpy as np

In [2]:
products = pd.read_csv("products.csv")
transactions = pd.read_csv("transactions.csv")

print(products.head())
print(transactions.head())


   product_id     brand        model    segment      price
0        1000     Honda        Civic      Sedan  1365313.0
1        1001  Mahindra       XUV300        SUV  2131266.0
2        1002      Tata  Safari ZXi+   Electric        NaN
3        1003  Mahindra    XUV700 LX  Hatchback  1370746.0
4        1004  Mahindra      Scorpio  Hatchback  1413168.0
   transaction_id  product_id  quantity transaction_date
0               1        1085         4       2024-06-16
1               2        1056         3       2024-06-12
2               3        1028         4       2024-02-14
3               4        1077         5       2024-05-30
4               5        1091         1       2024-02-16


In [3]:
products.info()
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   product_id  120 non-null    int64  
 1   brand       120 non-null    object 
 2   model       120 non-null    object 
 3   segment     120 non-null    object 
 4   price       110 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   transaction_id    850 non-null    int64 
 1   product_id        850 non-null    int64 
 2   quantity          850 non-null    int64 
 3   transaction_date  850 non-null    object
dtypes: int64(3), object(1)
memory usage: 26.7+ KB


In [4]:
# Fill missing prices with mean price
products['price'] = products['price'].fillna(products['price'].mean())


In [5]:
transactions = transactions.drop_duplicates()


In [6]:
merged_data = pd.merge(transactions, products, on="product_id")
merged_data.head()


Unnamed: 0,transaction_id,product_id,quantity,transaction_date,brand,model,segment,price
0,1,1085,4,2024-06-16,Honda,WR-V,Hatchback,1960828.0
1,2,1056,3,2024-06-12,Hyundai,Creta,Hatchback,973616.0
2,3,1028,4,2024-02-14,Toyota,Urban Cruiser LX,SUV,2608187.0
3,4,1077,5,2024-05-30,Hyundai,i10,Hatchback,2352290.0
4,5,1091,1,2024-02-16,Mahindra,Scorpio-N,Hatchback,2307923.0


In [7]:
segment_sales = merged_data.groupby('segment')['quantity'].sum()
segment_sales


segment
Electric     490
Hatchback    766
SUV          597
Sedan        543
Name: quantity, dtype: int64

In [8]:
highest_segment = segment_sales.idxmax()
highest_segment


'Hatchback'

In [17]:
# Total quantity sold per brand
brand_sales = merged_data.groupby('brand')['quantity'].sum()

# Least sold brand
least_sold_brand = brand_sales.idxmin()
least_sold_brand


'Tata'

In [9]:
# Convert transaction_date to datetime
merged_data['transaction_date'] = pd.to_datetime(merged_data['transaction_date'])

# Extract month
merged_data['month'] = merged_data['transaction_date'].dt.month

top_months = merged_data.groupby('month')['quantity'].sum().sort_values(ascending=False).head(3)
top_months


month
5    467
1    439
4    413
Name: quantity, dtype: int64

In [16]:
# Calculate total price for each transaction
merged_data['total_price'] = merged_data['price'] * merged_data['quantity']

# Total revenue
total_revenue = merged_data['total_price'].sum()
total_revenue


np.float64(4406120483.1)

In [10]:
missing_prices = products['price'].isnull().sum()
missing_prices


np.int64(0)

In [11]:
missing_percentage = (missing_prices / len(products)) * 100
missing_percentage

np.float64(0.0)

In [12]:
min_price = products['price'].min()
min_price


671295.0

In [13]:
max_price = products['price'].max()
max_price


2979126.0

In [14]:
# Calculate mean price
mean_price = products['price'].mean()
print("Mean Price:", mean_price)

Mean Price: 1808323.5727272728


In [15]:
# Calculate median price
median_price = products['price'].median()
print("Median Price:", median_price)

Median Price: 1808323.5727272728
