In [1]:
import pandas as pd
import numpy as np

# Load the sales records from the CSV file and preview the first five rows
csv_path = 'sales_dataset.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,OrderID,Date,Region,CustomerName,Product,Quantity,UnitPrice,TotalSales,PaymentMethod
0,1001,2023-01-15,North,Alice Johnson,Laptop,2.0,700.0,1400.0,Credit Card
1,1002,2023-01-16,South,Rahul Mehta,Mobile Phone,5.0,300.0,1500.0,UPI
2,1003,2023-01-17,East,Fatima Noor,Headphones,10.0,50.0,500.0,Debit Card
3,1004,2023-01-18,West,,Laptop,1.0,720.0,720.0,Credit Card
4,1005,2023-01-19,North,Zoe Carter,Mobile Phone,3.0,,,UPI


In [2]:
# Summarise the DataFrame: row count, column names, non-null counts per column,
# dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   OrderID        20 non-null     int64  
 1   Date           20 non-null     object 
 2   Region         20 non-null     object 
 3   CustomerName   18 non-null     object 
 4   Product        20 non-null     object 
 5   Quantity       19 non-null     float64
 6   UnitPrice      18 non-null     float64
 7   TotalSales     18 non-null     float64
 8   PaymentMethod  20 non-null     object 
dtypes: float64(3), int64(1), object(5)
memory usage: 1.5+ KB


In [3]:
# Count the missing values in each column.
# This cell only inspects the data; nothing is removed here.
df.isnull().sum()

OrderID          0
Date             0
Region           0
CustomerName     2
Product          0
Quantity         1
UnitPrice        2
TotalSales       2
PaymentMethod    0
dtype: int64


In [4]:
# Drop every row that has at least one missing value, then re-check that no
# nulls remain. The result is rebound to `df` instead of mutating the frame in
# place, so the cell produces the same result each time it is run.
# NOTE(review): dropna() without `subset` also removes rows that are missing
# only CustomerName (e.g. order 1004 in the preview above), which excludes
# their TotalSales from the aggregates below -- confirm this is intended.
df = df.dropna()
df.isnull().sum()

OrderID          0
Date             0
Region           0
CustomerName     0
Product          0
Quantity         0
UnitPrice        0
TotalSales       0
PaymentMethod    0
dtype: int64

In [5]:
# Total sales for each region
df.groupby('Region')['TotalSales'].agg('sum')

Region
East     2520.0
North    3765.0
South    3830.0
West      585.0
Name: TotalSales, dtype: float64

In [6]:
# Mean of TotalSales within each product group
df['TotalSales'].groupby(df['Product']).mean()

Product
Headphones       415.0
Keyboard         342.5
Laptop          1170.0
Mobile Phone     897.5
Monitor          492.5
Smart Watch      200.0
Tablet           900.0
Name: TotalSales, dtype: float64

In [7]:
# Calculating total sales for each product, keeping Product as a regular column
product_sales = df.groupby('Product', as_index=False)['TotalSales'].sum()
product_sales

Unnamed: 0,Product,TotalSales
0,Headphones,830.0
1,Keyboard,685.0
2,Laptop,3510.0
3,Mobile Phone,3590.0
4,Monitor,985.0
5,Smart Watch,200.0
6,Tablet,900.0


In [8]:
# Highest & lowest selling product, looked up by total sales.
# idxmax()/idxmin() return the row label of the largest/smallest TotalSales;
# .loc then fetches that product's full row as a Series.
sales_totals = product_sales['TotalSales']
Highest_Selling_Product = product_sales.loc[sales_totals.idxmax()]
Lowest_Selling_Product = product_sales.loc[sales_totals.idxmin()]

# Each Series starts directly after the newline (no leading space) so the
# first line of its printed form stays aligned with the lines below it.
print(
    "The highest selling product is:\n"
    f"{Highest_Selling_Product}\n\n"
    "The lowest selling product is:\n"
    f"{Lowest_Selling_Product}"
)

The highest selling product is:
 Product       Mobile Phone
TotalSales          3590.0
Name: 3, dtype: object

The lowest selling product is:
 Product       Smart Watch
TotalSales          200.0
Name: 5, dtype: object


**We can also do it this way:**

In [10]:
# Total sales per product, then keep only the single largest entry
sales_per_product = df.groupby('Product')['TotalSales'].sum()
highest_selling_product = sales_per_product.nlargest(n=1)
highest_selling_product

Product
Mobile Phone    3590.0
Name: TotalSales, dtype: float64

In [11]:
# Total sales per product, then keep only the single smallest entry
sales_per_product = df.groupby('Product')['TotalSales'].sum()
Lowest_selling_product = sales_per_product.nsmallest(n=1)
Lowest_selling_product

Product
Smart Watch    200.0
Name: TotalSales, dtype: float64

In [34]:
# Use NumPy to calculate mean, median, standard deviation of numerical fields.

# Select numerical columns
numerical_columns = df.select_dtypes(include=np.number).columns

# OrderID is an identifier rather than a measurement, so it is excluded.
# errors="ignore" keeps the cell working if OrderID is not among the numeric columns.
numerical_columns = numerical_columns.drop("OrderID", errors="ignore")

# Plain float matrix (rows x selected columns) for the NumPy reductions
numeric_values = df[numerical_columns].to_numpy(dtype=float, na_value=np.nan)

# The NaN-aware reductions skip missing values instead of propagating them.
# Each result is wrapped in a Series so it prints labelled by column name.
mean_values = pd.Series(np.nanmean(numeric_values, axis=0), index=numerical_columns)
print(f"Mean:\n{mean_values}")
print("------------------------")

median_values = pd.Series(np.nanmedian(numeric_values, axis=0), index=numerical_columns)
print(f"\nMedian:\n{median_values}")
print("------------------------")

# ddof=1 gives the sample standard deviation; NumPy's default (ddof=0) would
# compute the population standard deviation and yield smaller values.
std_dev_values = pd.Series(
    np.nanstd(numeric_values, axis=0, ddof=1), index=numerical_columns
)
print(f"\nStandard Deviation:\n{std_dev_values}")
print("------------------------")

Mean:
Quantity        3.200000
UnitPrice     310.000000
TotalSales    713.333333
dtype: float64
------------------------

Median:
Quantity        2.0
UnitPrice     290.0
TotalSales    585.0
dtype: float64
------------------------

Standard Deviation:
Quantity        2.396426
UnitPrice     230.666735
TotalSales    451.559467
dtype: float64
------------------------
