In [1]:
import pandas as pd
import numpy as np

In [2]:
# step no 1 create a sales data
data = {
    "order_id":[1,2,3,4,5,6,7,8],
    "customer":['john','joy','rahul','rohan','riya','tiya','heena','nishant'],
    "product":['laptop','phone','mouse','tablet','phone','tablet','laptop','laptop'],
    "quantity":[4,5,2,1,2,1,1,3],
    "price": [1000,300,4500,500,760,980,780,7000],
   "date":['2023-01-03','2023-01-04','2023-01-01','2023-01-04','2023-01-05','2023-01-8','2023-01-23','2023-01-12']    
}

In [28]:
# step no 2 create a dataframe
df = pd.DataFrame(data)

df.to_csv("dataset.csv")

In [4]:
# step no 3 print first 5 row
print(df.head())

   order_id customer product  quantity  price        date
0         1     john  laptop         4   1000  2023-01-03
1         2      joy   phone         5    300  2023-01-04
2         3    rahul   mouse         2   4500  2023-01-01
3         4    rohan  tablet         1    500  2023-01-04
4         5     riya   phone         2    760  2023-01-05


In [5]:
# step no 4 data cleaning
print(df.describe())

       order_id  quantity        price
count   8.00000  8.000000     8.000000
mean    4.50000  2.375000  1977.500000
std     2.44949  1.505941  2433.467544
min     1.00000  1.000000   300.000000
25%     2.75000  1.000000   695.000000
50%     4.50000  2.000000   880.000000
75%     6.25000  3.250000  1875.000000
max     8.00000  5.000000  7000.000000


In [6]:
print(df.dtypes)

order_id     int64
customer    object
product     object
quantity     int64
price        int64
date        object
dtype: object


In [7]:
# convert date column to datetime
# print(df['date'])

df['date'] = pd.to_datetime(df['date'])
print(df.dtypes)

order_id             int64
customer            object
product             object
quantity             int64
price                int64
date        datetime64[ns]
dtype: object


In [8]:
# check for missing values
print(df.isnull().sum())

order_id    0
customer    0
product     0
quantity    0
price       0
date        0
dtype: int64


In [9]:
# step no. 5 Basic data analysis
# calculate total revenue per order

df['total_revenue'] = df['quantity'] * df['price']
print(df.head())



   order_id customer product  quantity  price       date  total_revenue
0         1     john  laptop         4   1000 2023-01-03           4000
1         2      joy   phone         5    300 2023-01-04           1500
2         3    rahul   mouse         2   4500 2023-01-01           9000
3         4    rohan  tablet         1    500 2023-01-04            500
4         5     riya   phone         2    760 2023-01-05           1520


In [10]:
#step no. 6  total revenue by customer (grouping and aggregation)
customer_revenue =  df.groupby('customer')['total_revenue'].sum().reset_index()
print(customer_revenue)

  customer  total_revenue
0    heena            780
1     john           4000
2      joy           1500
3  nishant          21000
4    rahul           9000
5     riya           1520
6    rohan            500
7     tiya            980


In [12]:
# step no. 7 Product analysis
# most popular product by quantity sold 
product_sales = df.groupby('product')['quantity'].sum().reset_index()
print('\n product sales = \n ',product_sales)


 product sales = 
    product  quantity
0  laptop         8
1   mouse         2
2   phone         7
3  tablet         2


In [None]:
# step no. 8 time based analysis
daily_sales = df.groupby('date')['total_revenue'].sum().reset_index()
print(daily_sales)

        date  total_revenue
0 2023-01-01           9000
1 2023-01-03           4000
2 2023-01-04           2000
3 2023-01-05           1520
4 2023-01-08            980
5 2023-01-12          21000
6 2023-01-23            780


In [15]:
# step no. 9 Customer behavior analysis
# avg order value per customers
avg_order_value = df.groupby('customer')['total_revenue'].mean().reset_index()
avg_order_value.columns = ['customer' , 'avg_order_value']
print(f'Average Order Values \n{avg_order_value}')

Average Order Values 
  customer  avg_order_value
0    heena            780.0
1     john           4000.0
2      joy           1500.0
3  nishant          21000.0
4    rahul           9000.0
5     riya           1520.0
6    rohan            500.0
7     tiya            980.0


In [17]:
# step no . 10 filtering 
# high value orders (total_revenue > 1000)

high_value_order = df[df['total_revenue']> 20000]
print(high_value_order)

   order_id customer product  quantity  price       date  total_revenue
7         8  nishant  laptop         3   7000 2023-01-12          21000


In [24]:
h = df['total_revenue']
print(max(h))

21000


In [25]:
print(df.describe())

       order_id  quantity        price                 date  total_revenue
count   8.00000  8.000000     8.000000                    8        8.00000
mean    4.50000  2.375000  1977.500000  2023-01-07 12:00:00     4910.00000
min     1.00000  1.000000   300.000000  2023-01-01 00:00:00      500.00000
25%     2.75000  1.000000   695.000000  2023-01-03 18:00:00      930.00000
50%     4.50000  2.000000   880.000000  2023-01-04 12:00:00     1510.00000
75%     6.25000  3.250000  1875.000000  2023-01-09 00:00:00     5250.00000
max     8.00000  5.000000  7000.000000  2023-01-23 00:00:00    21000.00000
std     2.44949  1.505941  2433.467544                  NaN     7086.34905


In [26]:
print(df)

   order_id customer product  quantity  price       date  total_revenue
0         1     john  laptop         4   1000 2023-01-03           4000
1         2      joy   phone         5    300 2023-01-04           1500
2         3    rahul   mouse         2   4500 2023-01-01           9000
3         4    rohan  tablet         1    500 2023-01-04            500
4         5     riya   phone         2    760 2023-01-05           1520
5         6     tiya  tablet         1    980 2023-01-08            980
6         7    heena  laptop         1    780 2023-01-23            780
7         8  nishant  laptop         3   7000 2023-01-12          21000


In [27]:
# Step no . 11 Data Export 
customer_revenue.to_csv('customer.csv',index=False)
product_sales.to_csv('product_sales.csv',index=False)

In [29]:
data = pd.read_csv('update_dataset.csv')
print(data.head())

   order_id customer product  quantity   price        date
0         1     john  laptop       4.0  1000.0  03-01-2023
1         2      joy   phone       5.0   300.0  04-01-2023
2         3    rahul   mouse       2.0  4500.0  01-01-2023
3         4    rohan  tablet       1.0   500.0  04-01-2023
4         5     riya   phone       2.0   760.0  05-01-2023


In [30]:
print(data.describe())

         order_id    quantity         price
count  199.000000  197.000000    196.000000
mean   100.000000   24.395939  24502.142857
std     57.590508   15.648094  13874.917010
min      1.000000    1.000000    300.000000
25%     50.500000   10.000000  13718.000000
50%    100.000000   25.000000  23761.500000
75%    149.500000   37.000000  35575.750000
max    199.000000   50.000000  49942.000000


In [31]:
print(data.isnull().sum())

order_id    0
customer    2
product     3
quantity    2
price       3
date        6
dtype: int64


In [32]:
print(data.dtypes)

order_id      int64
customer     object
product      object
quantity    float64
price       float64
date         object
dtype: object


In [33]:
df = pd.DataFrame(data)
print(df.head())

   order_id customer product  quantity   price        date
0         1     john  laptop       4.0  1000.0  03-01-2023
1         2      joy   phone       5.0   300.0  04-01-2023
2         3    rahul   mouse       2.0  4500.0  01-01-2023
3         4    rohan  tablet       1.0   500.0  04-01-2023
4         5     riya   phone       2.0   760.0  05-01-2023


In [34]:
print(df.dtypes)

order_id      int64
customer     object
product      object
quantity    float64
price       float64
date         object
dtype: object


In [None]:
# missing value clear 
df_with_na = df.copy()
df_with_na.loc['quantity','price'] = np.nan
print(df_with_na)

          order_id customer product  quantity    price        date
0              1.0     john  laptop       4.0   1000.0  03-01-2023
1              2.0      joy   phone       5.0    300.0  04-01-2023
2              3.0    rahul   mouse       2.0   4500.0  01-01-2023
3              4.0    rohan  tablet       1.0    500.0  04-01-2023
4              5.0     riya   phone       2.0    760.0  05-01-2023
...            ...      ...     ...       ...      ...         ...
195          196.0     tiya  tablet      36.0  34994.0  08-01-2023
196          197.0    heena  laptop      24.0  41464.0  23-01-2023
197          198.0  nishant  laptop      10.0  32911.0  12-01-2023
198          199.0     john  laptop      23.0  17602.0  01-01-2023
quantity       NaN      NaN     NaN       NaN      NaN         NaN

[200 rows x 6 columns]


In [36]:
print(data.isnull().sum())

order_id    0
customer    2
product     3
quantity    2
price       3
date        6
dtype: int64


In [38]:
#filling missing value 
price = df['price']
print(price.isnull().sum())

3


In [43]:
df['price'] = df['price'].fillna(df['price'].mean())
print(df)

     order_id customer product  quantity    price        date  1000.0  300.0  \
0           1     john  laptop       4.0   1000.0  03-01-2023  1000.0  300.0   
1           2      joy   phone       5.0    300.0  04-01-2023  1000.0  300.0   
2           3    rahul   mouse       2.0   4500.0  01-01-2023  1000.0  300.0   
3           4    rohan  tablet       1.0    500.0  04-01-2023  1000.0  300.0   
4           5     riya   phone       2.0    760.0  05-01-2023  1000.0  300.0   
..        ...      ...     ...       ...      ...         ...     ...    ...   
194       195     riya   phone       1.0  46435.0  05-01-2023  1000.0  300.0   
195       196     tiya  tablet      36.0  34994.0  08-01-2023  1000.0  300.0   
196       197    heena  laptop      24.0  41464.0  23-01-2023  1000.0  300.0   
197       198  nishant  laptop      10.0  32911.0  12-01-2023  1000.0  300.0   
198       199     john  laptop      23.0  17602.0  01-01-2023  1000.0  300.0   

     4500.0  500.0  ...  19613.0  13616

In [44]:
print(df['price'].isnull().sum())

0


In [45]:
print(df['quantity'].isnull().sum())

2


In [60]:
df = df.dropna()

In [63]:
print(df.isnull().sum())

order_id    0
customer    0
product     0
quantity    0
price       0
           ..
46435.0     0
34994.0     0
41464.0     0
32911.0     0
17602.0     0
Length: 203, dtype: int64


In [65]:
df.to_csv("data.csv")

In [64]:
# read csv 

data = pd.read_csv('https://raw.githubusercontent.com/Pankaj-Str/Complete-Python-Mastery/refs/heads/main/53%20DataSet/credit_card_transactions.csv')

print(data.head())

   Transaction_ID  Customer_ID Transaction_Date Transaction_Type Merchant  \
0          100000         4452       2023-01-01           Online  Walmart   
1          100001         2775       2023-01-01              ATM  BestBuy   
2          100002         2259       2023-01-01   Mobile Payment     Uber   
3          100003         4545       2023-01-01           Online  BestBuy   
4          100004         2137       2023-01-01              ATM   Amazon   

   Category   Amount Payment_Mode Transaction_Status       Location  
0    Travel  4520.70   Debit Card           Approved      Jonesport  
1    Travel  1437.85   Debit Card           Approved  Port Jennifer  
2  Clothing  3320.52       PayPal           Approved     Port James  
3    Travel  2659.96   Debit Card           Approved     Hawkinston  
4    Travel  2517.07   Debit Card           Approved    Matthewland  


In [66]:
data.shape

(5500, 10)

In [70]:
data.describe()

Unnamed: 0,Transaction_ID,Customer_ID,Amount
count,5500.0,5500.0,5500.0
mean,102749.5,2989.954182,2492.109224
std,1587.857571,1153.928878,1444.493842
min,100000.0,1000.0,6.55
25%,101374.75,1988.0,1236.935
50%,102749.5,3000.0,2502.405
75%,104124.25,3990.0,3767.9575
max,105499.0,4998.0,4997.49
