# Python Projeect using Numpy & Pandas 

In [4]:
import pandas as pd
import numpy as np

# <font color="lightblue">PART 0 â€” Dataset Understanding</font> 

## <font color="#FFA500">1. Load the dataset using Pandas.</font>
 

In [5]:
df = pd.read_csv("Dataset/ecommerce_realistic_dataset.csv",encoding = "latin1")
print(df)

       order_id  customer_id  product_id     category   price  quantity  \
0             1         1175         649  Electronics    7.13         3   
1             2         2220         523     Clothing   69.07         4   
2             3         1537         569     Clothing  245.49         5   
3             4         3550         606       Beauty   28.22         2   
4             5         2603         226         Home  111.32         1   
...         ...          ...         ...          ...     ...       ...   
11995     11996         2108         309       Sports   76.22         2   
11996     11997         1824         685        Books   18.28         1   
11997     11998         2682         320       Beauty  188.70         3   
11998     11999         1003         547  Electronics   59.84         3   
11999     12000         4449         565       Sports   58.23         2   

       order_date    country payment_method  rating  returned  revenue  
0      2022-08-23        U

## <font color="#FFA500">2. Display the first 10 and last 10 rows.</font>


In [6]:
print("Head Of Data : ")
print(df.head(10))
print("\nTail Of Data : ")
print(df.tail(10))

Head Of Data : 
   order_id  customer_id  product_id     category   price  quantity  \
0         1         1175         649  Electronics    7.13         3   
1         2         2220         523     Clothing   69.07         4   
2         3         1537         569     Clothing  245.49         5   
3         4         3550         606       Beauty   28.22         2   
4         5         2603         226         Home  111.32         1   
5         6         1211         387     Clothing  117.18         3   
6         7         1919         362         Home  112.31         2   
7         8         2639         665         Home   68.70         4   
8         9         2372         657  Electronics   90.18         4   
9        10         4257         767         Home   43.24         1   

   order_date    country payment_method  rating  returned  revenue  
0  2022-08-23        USA            UPI     5.0         0    21.39  
1  2022-10-01        USA         PayPal     1.0         1   276.

##  <font color="green">3. Answer:</font>
   - <font color="#FFA500">Number of rows and columns</font>
   - <font color="#FFA500">Meaning of each column</font>


In [7]:
df.shape # Number OF ROWS & COLUMNS

# print(df.head(4)) # used for data columns ...

# order_id         ---- unique id of the orders (int)
# customer_id      ---- unique id of the customers (int)
# product_id       ---- unique id of the product (int)
# category         ---- order category 
# price            ---- amount of order/product (float)
# quantity         ---- number of products (int)
# order_date       ---- date of the order done (date yyyy-mm-dd)
# country          ---- origin 
# payment_method   ---- tranaction method
# rating           ---- like or dislikeing of the product (float)
# returned         ---- items returned (int)
# revenue          ---- total amount (float)



(12000, 12)

## <font color="#FFA500">4. Identify numerical, categorical, and date columns.</font>


""" 

Numerical Data - based Columns ((Used for calculations, averages, sums, etc.)) : price , rating , 
revenue , quantity

Categorical Columns ((Used for grouping, filtering, counting)): order_id , customer_id , 
product_id , category , countru , payment_method , returned 

Date Column : order_date â†’ date when order was place


"""


# <font color="lightblue">PART 1 â€” Data Cleaning & Preparation</font>
## <font color="FFA500">5. Identify missing values in each column.</font>


In [8]:
print(df.isnull().sum())   # helps to find out how many  missing values are there in each columns 

order_id            0
customer_id         0
product_id          0
category            0
price               0
quantity            0
order_date          0
country             0
payment_method      0
rating            611
returned            0
revenue             0
dtype: int64


## <font color="#FFA500">6. Handle missing `rating` values using mean and median.</font>


In [9]:
df["rating"].fillna(df["rating"].mean(),inplace=True)  # this we handled the missing values 
print(df.isnull().sum())

order_id          0
customer_id       0
product_id        0
category          0
price             0
quantity          0
order_date        0
country           0
payment_method    0
rating            0
returned          0
revenue           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["rating"].fillna(df["rating"].mean(),inplace=True)  # this we handled the missing values


## <font color="#FFA500">7. Convert `order_date` to datetime format.</font>


In [10]:
print(df["order_date"].dtype)  # in this it first says the object 
df["order_date"] = pd.to_datetime(df["order_date"])  # by this we converted the column datatype
print(df["order_date"].dtype)

object
datetime64[ns]


## <font color="#FFA500">8. Detect invalid prices or quantities.</font>


In [25]:
invalidPrice = df[df["price"]<=0]
invalidQuantity = df[df["quantity"]<=0]
print(invalidPrice)
print(invalidQuantity)         
# print(invalidPrice.count())          ---- we can also print the count 
# print(invalidQuantity.count())       ---- we can also print the count
# """ SO there is no invalid price and quantity in the dataset. """

Empty DataFrame
Columns: [order_id, customer_id, product_id, category, price, quantity, order_date, country, payment_method, rating, returned, revenue]
Index: []
Empty DataFrame
Columns: [order_id, customer_id, product_id, category, price, quantity, order_date, country, payment_method, rating, returned, revenue]
Index: []


## <font color="#FFA500">9. Verify and correct revenue calculations.</font>


In [26]:
# print(df.head())
calcRevenue = df["price"] * df["quantity"]
# print(calcRevenue)
revenue = df["revenue"]

checkRevenue = df[round(df["revenue"],2)!=round(df["price"]*df["quantity"],2)]
print(checkRevenue)


"""Some revenue values appeared mismatched due to floating-point precision.
This is a common issue when working with decimal values in programming.


Revenue values were recalculated using price and quantity.
Decimal precision was handled to ensure monetary values are consistent.
This avoids floating-point comparison issues.

"""

Empty DataFrame
Columns: [order_id, customer_id, product_id, category, price, quantity, order_date, country, payment_method, rating, returned, revenue]
Index: []


'Some revenue values appeared mismatched due to floating-point precision.\nThis is a common issue when working with decimal values in programming.\n\n\nRevenue values were recalculated using price and quantity.\nDecimal precision was handled to ensure monetary values are consistent.\nThis avoids floating-point comparison issues.\n\n'

# <font color="lightblue">PART 2 â€” NumPy Practice</font>
## <font color="#FFA500">10. Convert price, quantity, and revenue to NumPy arrays.</font>

In [32]:
# print(df.head())
priceArr = np.array(df["price"])
quantityArr = np.array(df["quantity"])
revenueArr = np.array(df["revenue"])

# priceARR2 = df["price"].to_numpy()  we can also use this method to create the numpy array.
# print(priceARR2)
print(priceArr)
print(quantityArr)
print(revenueArr)


[  7.13  69.07 245.49 ... 188.7   59.84  58.23]
[3 4 5 ... 3 3 2]
[  21.39  276.28 1227.45 ...  566.1   179.52  116.46]


## <font color="#FFA500">11. Calculate total revenue, mean, median, and standard deviation.</font>


In [42]:
totalRev = np.sum(revenueArr)
meanRev = np.mean(revenueArr)
medianRev = np.median(revenueArr)
stdRev = np.std(revenueArr)

print(totalRev)
print(meanRev)
print(medianRev)
print(stdRev)

""" ðŸ”¹ Mean vs Median

Mean (238) > Median (170)

This tells us the data is right-skewed

ðŸ‘‰ Interpretation (important):

A small number of high-value orders are increasing the average revenue.

This is real business insight, not math.
"""

""" Standard Deviation

Std (218) is almost as large as the mean

This indicates high variability in order values

ðŸ‘‰ Interpretation:

Order values vary widely; customers do not spend a consistent amount.

"""

2858025.87
238.1688225
170.885
218.30060781390836


' Standard Deviation\n\nStd (218) is almost as large as the mean\n\nThis indicates high variability in order values\n\nðŸ‘‰ Interpretation:\n\nOrder values vary widely; customers do not spend a consistent amount.\n\n'

## <font color="#FFA500">12. Apply boolean masking to identify high-value orders.</font>

## <font color="#FFA500">13. Find top 10 revenue orders using NumPy.</font>

## <font color="#FFA500">14. Calculate percentage of returned orders.</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

<font color="#FFA500">Number of rows and columns</font>

# TEXT md colors 
<font color="#FFA500">Number of rows and columns</font>
<font color="green">This text is green!</font>
