In [2]:
import pandas as pd
import numpy as np
from random import random

In [32]:
df = pd.read_csv("electronics_sales.csv", parse_dates=["date"])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   order_id    60 non-null     int64         
 1   date        60 non-null     datetime64[ns]
 2   customer    60 non-null     object        
 3   region      60 non-null     object        
 4   product     60 non-null     object        
 5   units       60 non-null     int64         
 6   unit_price  60 non-null     float64       
 7   returned    60 non-null     bool          
 8   notes       50 non-null     object        
 9   revenue     60 non-null     float64       
dtypes: bool(1), datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 4.4+ KB


pandas stores each column as a NumPy array internally, so dtype matters for performance.

In [10]:
print(df.describe()  )     # numeric columns summary
df.describe(include='all')  # includes categorical columns

          order_id                 date      units    unit_price  \
count    60.000000                   60  60.000000     60.000000   
mean   2030.500000  2025-01-30 12:00:00   4.850000  24610.381500   
min    2001.000000  2025-01-01 00:00:00   1.000000   1248.020000   
25%    2015.750000  2025-01-15 18:00:00   2.000000  12071.017500   
50%    2030.500000  2025-01-30 12:00:00   5.000000  27570.460000   
75%    2045.250000  2025-02-14 06:00:00   7.000000  35151.340000   
max    2060.000000  2025-03-01 00:00:00   9.000000  46899.770000   
std      17.464249                  NaN   2.729779  14234.553392   

             revenue  
count      60.000000  
mean   118680.795667  
min      1248.020000  
25%     44169.992500  
50%     89284.000000  
75%    171809.347500  
max    416789.910000  
std    100570.565700  


Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue
count,60.0,60,60,60,60,60.0,60.0,60,50,60.0
unique,,,28,4,6,,,2,4,
top,,,Ira,South,Laptop,,,False,Urgent,
freq,,,4,22,15,,,56,15,
mean,2030.5,2025-01-30 12:00:00,,,,4.85,24610.3815,,,118680.795667
min,2001.0,2025-01-01 00:00:00,,,,1.0,1248.02,,,1248.02
25%,2015.75,2025-01-15 18:00:00,,,,2.0,12071.0175,,,44169.9925
50%,2030.5,2025-01-30 12:00:00,,,,5.0,27570.46,,,89284.0
75%,2045.25,2025-02-14 06:00:00,,,,7.0,35151.34,,,171809.3475
max,2060.0,2025-03-01 00:00:00,,,,9.0,46899.77,,,416789.91


In [13]:
df.loc[:5]

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue
0,2001,2025-01-01,Diya,East,Headphones,5,38319.36,False,,191596.8
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88509.68
2,2003,2025-01-03,Sai,North,Headphones,7,36682.6,False,Gift,256778.2
3,2004,2025-01-04,Sara,North,Smartphone,4,19021.37,False,Promo Applied,76085.48
4,2005,2025-01-05,Aditi,East,Laptop,7,31982.99,False,Urgent,223880.93
5,2006,2025-01-06,Anika,South,Smartwatch,3,32042.96,True,Urgent,96128.88


##### Columns

In [19]:
df["customer"].head()       # returns a Series
df[["customer","region"]].head()  # returns a DataFrame
# Columns are internally Series objects, can be manipulated like Series.



Unnamed: 0,customer,region
0,Diya,East
1,Reyansh,East
2,Sai,North
3,Sara,North
4,Aditi,East


##### Rows



In [None]:
# By position: .iloc
df.iloc[0:3]

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue
0,2001,2025-01-01,Diya,East,Headphones,5,38319.36,False,,191596.8
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88509.68
2,2003,2025-01-03,Sai,North,Headphones,7,36682.6,False,Gift,256778.2


In [25]:
# By label/index: .loc

df.loc[1:3]
df.loc[0]

order_id                     2001
date          2025-01-01 00:00:00
customer                     Diya
region                       East
product                Headphones
units                           5
unit_price               38319.36
returned                    False
notes                         NaN
revenue                  191596.8
Name: 0, dtype: object

##### Cells 

In [33]:
print(df.at[0, "customer"])     # faster for scalar
print(df.iat[0, 2])     # by position
print(df.loc[0, "revenue"])     # label-based
df.head()

Diya
Diya
191596.8


Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue
0,2001,2025-01-01,Diya,East,Headphones,5,38319.36,False,,191596.8
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88509.68
2,2003,2025-01-03,Sai,North,Headphones,7,36682.6,False,Gift,256778.2
3,2004,2025-01-04,Sara,North,Smartphone,4,19021.37,False,Promo Applied,76085.48
4,2005,2025-01-05,Aditi,East,Laptop,7,31982.99,False,Urgent,223880.93


Adding / Modifying / Deleting Columns

In [20]:
# Adding a column
df['Profit'] = df['revenue'] * 0.15
df.head(2)

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,Profit
0,2001,2025-01-01,Diya,East,Headphones,5,38319.36,False,,191596.8,28739.52
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88509.68,13276.452


In [21]:
# Modifying a column
df["units"] = df["units"] + 1
df.rename(columns={'Profit': 'profit'}, inplace=True)
df.head()

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,profit
0,2001,2025-01-01,Diya,East,Headphones,6,38319.36,False,,191596.8,28739.52
1,2002,2025-01-02,Reyansh,East,Smartphone,8,12644.24,False,Promo Applied,88509.68,13276.452
2,2003,2025-01-03,Sai,North,Headphones,8,36682.6,False,Gift,256778.2,38516.73
3,2004,2025-01-04,Sara,North,Smartphone,5,19021.37,False,Promo Applied,76085.48,11412.822
4,2005,2025-01-05,Aditi,East,Laptop,8,31982.99,False,Urgent,223880.93,33582.1395


In [None]:
# Deleting a column
# del df["notes"]
df.drop("returned", axis=1, inplace=False)
# df.head()

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,Profit
0,2001,2025-01-01,Diya,East,Headphones,5,38319.36,False,,191596.8,28739.52
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88509.68,13276.452
2,2003,2025-01-03,Sai,North,Headphones,7,36682.6,False,Gift,256778.2,38516.73
3,2004,2025-01-04,Sara,North,Smartphone,4,19021.37,False,Promo Applied,76085.48,11412.822
4,2005,2025-01-05,Aditi,East,Laptop,7,31982.99,False,Urgent,223880.93,33582.1395


##### Filtering / Boolean Indexing

In [None]:
df[df['units'] > 8]

df[(df["region"]=="North") & (df["units"]>=6)]
df.query('region=="South" and revenue>200000')

# query() is cleaner for complex conditions
# Internal working: creates a boolean mask and applies vectorized filtering

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,Profit
13,2014,2025-01-14,Aditi,South,Laptop,9,34200.65,False,Promo Applied,307805.85,46170.8775
26,2027,2025-01-27,Ishaan,South,Smartphone,5,43989.63,False,Promo Applied,219948.15,32992.2225
29,2030,2025-01-30,Pari,South,Laptop,8,41043.89,False,Bulk Order,328351.12,49252.668
41,2042,2025-02-11,Raghav,South,Tablet,5,44467.23,False,Bulk Order,222336.15,33350.4225
57,2058,2025-02-27,Yuvraj,South,Smartwatch,7,35896.78,False,Gift,251277.46,37691.619


##### Sorting

In [94]:
df.sort_values(['region','revenue'], ascending=[True, False]).head()

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,Profit
25,2026,2025-01-26,Ishita,East,Smartphone,9,46309.99,False,Bulk Order,416789.91,62518.4865
4,2005,2025-01-05,Aditi,East,Laptop,7,31982.99,False,Urgent,223880.93,33582.1395
0,2001,2025-01-01,Diya,East,Headphones,5,38319.36,False,,191596.8,28739.52
20,2021,2025-01-21,Krishna,East,Tablet,9,19950.03,True,Promo Applied,179550.27,26932.5405
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88509.68,13276.452


##### Handling Missing Data

In [98]:
df.isna().sum()

order_id       0
date           0
customer       0
region         0
product        0
units          0
unit_price     0
returned       0
notes         10
revenue        0
Profit         0
dtype: int64

In [105]:
df['notes'].fillna('None').head(10)

0             None
1    Promo Applied
2             Gift
3    Promo Applied
4           Urgent
5           Urgent
6             Gift
7             None
8           Urgent
9       Bulk Order
Name: notes, dtype: object

In [73]:
df.iloc()


<pandas.core.indexing._iLocIndexer at 0x7971c5f2c3c0>

In [None]:
dafr = pd.DataFrame(
    {
        "a": [1,2,3,4, None],
        "b": [1,2,None, 3,4],
        "c": [None, 1,2,3,4]
    }
)

print((dafr + dafr == 2 * dafr).all().all()) 
print((dafr + dafr).equals(dafr * 2))
     

False
True


#### Pandas – Function Application

##### Table-wise Function Application with pipe()

**pipe()** allows you to **chain multiple operations** in a clean and functional style, applying them to the whole DataFrame.

In [33]:
def add_profit(df):
    df['profit'] = df['revenue'] * 0.1  # vectoriztion
    return df

def add_discount(df, threshold=20000):
    df['discount'] = df['revenue'].apply(lambda x: x * 0.05 if x > threshold else 0)
    return df


result = (df
          .pipe(add_profit)
          .pipe(add_discount, threshold=25000)
          .pipe(lambda d: d.sort_values("customer", ascending=True))
         )

# pipe() passes the whole DataFrame to each function.
# You can chain multiple transformations without nested parentheses.
# Improves readability in complex data processing pipelines.

result.head(10)

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,profit,discount
31,2032,2025-02-01,Aarav,West,Smartphone,8,26952.88,False,Promo Applied,215623.04,21562.304,10781.152
4,2005,2025-01-05,Aditi,East,Laptop,7,31982.99,False,Urgent,223880.93,22388.093,11194.0465
12,2013,2025-01-13,Aditi,North,Laptop,7,29953.75,False,,209676.25,20967.625,10483.8125
13,2014,2025-01-14,Aditi,South,Laptop,9,34200.65,False,Promo Applied,307805.85,30780.585,15390.2925
58,2059,2025-02-28,Aditya,West,Headphones,4,12625.21,False,Promo Applied,50500.84,5050.084,2525.042
19,2020,2025-01-20,Aditya,West,Laptop,4,34855.95,False,Bulk Order,139423.8,13942.38,6971.19
59,2060,2025-03-01,Ananya,North,Smartwatch,7,16944.59,False,Urgent,118612.13,11861.213,5930.6065
5,2006,2025-01-06,Anika,South,Smartwatch,3,32042.96,True,Urgent,96128.88,9612.888,4806.444
17,2018,2025-01-18,Anika,West,Tablet,10,32613.47,False,Bulk Order,326521.23,32652.123,16326.0615
38,2039,2025-02-08,Anvi,South,Tablet,2,18111.27,False,Urgent,36222.54,3622.254,1811.127


##### Row or Column-wise Function Application

🔹 apply() on DataFrame

- apply() applies a function along an axis (rows or columns).

`DataFrame.apply(func, axis=0, raw=False, result_type=None)`
- axis=0 → function applied column-wise

- axis=1 → function applied row-wise

In [None]:
# Column-wise Aggregation
df[["units", "revenue"]].apply(sum)

units          295.00
revenue    8053847.74
dtype: float64

In [18]:
# Row-wise Custom Function
def total_values(row):
    return row["units"] * row["unit_price"]

df["Computed Revenue"] = df.apply(total_values, axis=1)
df.head()

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,Computed Revenue
0,2001,2025-01-01,Diya,East,Headphones,5,38319.36,False,,191596.8,191596.8
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88509.68,88509.68
2,2003,2025-01-03,Sai,North,Headphones,7,36682.6,False,Gift,256778.2,256778.2
3,2004,2025-01-04,Sara,North,Smartphone,4,19021.37,False,Promo Applied,76085.48,76085.48
4,2005,2025-01-05,Aditi,East,Laptop,7,31982.99,False,Urgent,223880.93,223880.93


In [None]:
df["revenue"] = df["revenue"].apply(lambda x: round(x, 1))
df.head()

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,Computed Revenue
0,2001,2025-01-01,Diya,East,Headphones,5,38319.36,False,,191597.0,191596.8
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88510.0,88509.68
2,2003,2025-01-03,Sai,North,Headphones,7,36682.6,False,Gift,256778.0,256778.2
3,2004,2025-01-04,Sara,North,Smartphone,4,19021.37,False,Promo Applied,76085.0,76085.48
4,2005,2025-01-05,Aditi,East,Laptop,7,31982.99,False,Urgent,223881.0,223880.93


In [None]:
result = df[["units", "unit_price"]].map(lambda x: x * 2)
print(result.head())

# result = df["region"].map(lambda x: x.upper())
# print(result.head())

   units  unit_price
0     10    76638.72
1     14    25288.48
2     14    73365.20
3      8    38042.74
4     14    63965.98
0     East
1     East
2    North
3    North
4     East
Name: region, dtype: object


In [37]:
discount_map = {"North": 0.05, "South": 0.1, "East": 0.08, "West": 0.06}
df["region_discount"] = df["region"].map(discount_map)
df.head()

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,profit,discount,region_discount
0,2001,2025-01-01,Diya,East,Headphones,5,38319.36,False,,191596.8,19159.68,9579.84,0.08
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88509.68,8850.968,4425.484,0.08
2,2003,2025-01-03,Sai,North,Headphones,7,36682.6,False,Gift,256778.2,25677.82,12838.91,0.05
3,2004,2025-01-04,Sara,North,Smartphone,4,19021.37,False,Promo Applied,76085.48,7608.548,3804.274,0.05
4,2005,2025-01-05,Aditi,East,Laptop,7,31982.99,False,Urgent,223880.93,22388.093,11194.0465,0.08


#### Pandas - Reindexing

Reindexing means changing the row or column labels of a DataFrame or Series to a new set of labels.

It’s useful when:

- You want to align two datasets by index
- You want to fill in missing indices
- You want to reorder or rename labels

`DataFrame.reindex(labels=None, index=None, columns=None, method=None,
                  fill_value=None, limit=None, tolerance=None, copy=True)
`

In [52]:
new_index = [ 1, 2, 3, 4, 5, 6]
df_reindexed = df.reindex(new_index)
df_reindexed
# df.head()

Unnamed: 0,order_id,date,customer,region,product,units,unit_price,returned,notes,revenue,profit,discount,region_discount
1,2002,2025-01-02,Reyansh,East,Smartphone,7,12644.24,False,Promo Applied,88509.68,8850.968,4425.484,0.08
2,2003,2025-01-03,Sai,North,Headphones,7,36682.6,False,Gift,256778.2,25677.82,12838.91,0.05
3,2004,2025-01-04,Sara,North,Smartphone,4,19021.37,False,Promo Applied,76085.48,7608.548,3804.274,0.05
4,2005,2025-01-05,Aditi,East,Laptop,7,31982.99,False,Urgent,223880.93,22388.093,11194.0465,0.08
5,2006,2025-01-06,Anika,South,Smartwatch,3,32042.96,True,Urgent,96128.88,9612.888,4806.444,0.1
6,2007,2025-01-07,Sai,West,Smartwatch,6,27252.96,False,Gift,163517.76,16351.776,8175.888,0.06


In [None]:
df1 = pd.DataFrame({"revenue": [1000, 2000, 3000]}, index=["A", "B", "C"])
df2 = pd.DataFrame({"profit": [100, 200]}, index=["A", "D"])

aligned = df1.reindex(df2.index)
print(aligned)
# reindex(df2.index) aligns df1’s data to match df2’s index.

# Where df2 has “D” (missing in df1), pandas inserts NaN.

   revenue
A   1000.0
D      NaN


🔹 When to use apply():

| Goal | Best Method |
|---|---|
|Entire DataFrame	| .pipe()
|Each Row or Column	| .apply()
|Each Element	  | .applymap() or .map()









##### Element-wise Function Application

Element-wise means each **individual cell** or value is processed by a function.

- applymap() → applies to every element of the entire DataFrame
-  map() → applies to each element of a Series

DataFrame.at
- Access a single value for a row/column pair by label.

DataFrame.iat
- Access a single value for a row/column pair by integer position.

DataFrame.loc
- Access a group of rows and columns by label(s).

DataFrame.iloc
- Access a group of rows and columns by integer position(s).

Series.at
- Access a single value by label.

Series.iat
- Access a single value by integer position.

Series.loc
- Access a group of rows by label(s).

Series.iloc
- Access a group of rows by integer position(s).