# Imports

In [2]:
import pandas as pd

# Topics

## Chaining operations in Pandas

In [2]:
# Sample DataFrame
data = {'Name': ['John', 'Anna', 'Peter', 'Linda'],
        'Age': [28, 34, 45, 32],
        'Income': [50000, 60000, 80000, 75000]}
df = pd.DataFrame(data)

# Method chaining example: Filtering and sorting data
result = (
    df
    .loc[df['Age'] > 30]  # Filter rows where Age > 30
    .sort_values(by='Income', ascending=False)  # Sort by Income in descending order
)



In [3]:
result

Unnamed: 0,Name,Age,Income
2,Peter,45,80000
3,Linda,32,75000
1,Anna,34,60000


## Groupby operations in Pandas

In [4]:
data = {'Category': ['A', 'B', 'A', 'B', 'A'],
        'Value': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# Grouping data by 'Category' and calculating the sum of 'Value'
grouped_data = df.groupby('Category').sum()

In [5]:
grouped_data

Unnamed: 0_level_0,Value
Category,Unnamed: 1_level_1
A,90
B,60


## Commanding Time Series Data with Finesse

In [5]:

# Sample time series data
date_range = pd.date_range(start='1/1/2023', end='12/31/2028', freq='D')
traffic_data = pd.Series(range(len(date_range)), index=date_range)
print(traffic_data[:5])


2023-01-01    0
2023-01-02    1
2023-01-03    2
2023-01-04    3
2023-01-05    4
Freq: D, dtype: int64


In [6]:
# Resampling and frequency conversion for monthly analysis
yearly_traffic = traffic_data.resample('YE').sum()
print(yearly_traffic)

2023-12-31     66430
2024-12-31    200385
2025-12-31    333245
2026-12-31    466470
2027-12-31    599695
2028-12-31    735111
Freq: YE-DEC, dtype: int64


*we resampled the data into monthly intervals using the resample function*
*having calculated the sum of traffic for each month*

## Multi-level indexing in Pandas

In [7]:
index = pd.MultiIndex.from_tuples(
    [('A', 1), ('A', 2), ('B', 1), ('B', 2)],
    names=['Category', 'Number']
)

df = pd.DataFrame({
    'Value': [10, 20, 30, 40]
}, index=index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Category,Number,Unnamed: 2_level_1
A,1,10
A,2,20
B,1,30
B,2,40


In [9]:
# or

data = {
    'Category': ['A', 'A', 'B', 'B'],
    'Number': [1, 2, 1, 2],
    'Value': [10, 20, 30, 40]
}
df = pd.DataFrame(data)
df.set_index(['Category', 'Number'], inplace=True)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Category,Number,Unnamed: 2_level_1
A,1,10
A,2,20
B,1,30
B,2,40


In [10]:
print(df.swaplevel())

                 Value
Number Category       
1      A            10
2      A            20
1      B            30
2      B            40


In [11]:
print(df.reset_index())

  Category  Number  Value
0        A       1     10
1        A       2     20
2        B       1     30
3        B       2     40


In [14]:
print(df.loc['A']) 


print(df.loc[('A', 1)])

        Value
Number       
1          10
2          20
Value    10
Name: (A, 1), dtype: int64


In [13]:
print(df.groupby(level=['Category']).sum())

          Value
Category       
A            30
B            70


In [15]:
print(df.groupby(level=['Number']).sum())

        Value
Number       
1          40
2          60


## Pandas to Markdown 

In [3]:
df = pd.DataFrame({'a': [1, 2, 3, 4],
                   'b': [5, 6, 7, 8]})

# You can control the printing of the index column by using the flag index.
# make sure to pip install tabulate
print(df.to_markdown(index=True))

|    |   a |   b |
|---:|----:|----:|
|  0 |   1 |   5 |
|  1 |   2 |   6 |
|  2 |   3 |   7 |
|  3 |   4 |   8 |


How it looks like in MD 

|    |   a |   b |
|---:|----:|----:|
|  0 |   1 |   5 |
|  1 |   2 |   6 |
|  2 |   3 |   7 |
|  3 |   4 |   8 |

## Group Rows into list

In [6]:
df = pd.DataFrame(
    {
        "col1": [1, 2, 3, 4, 3],
        "col2": ["a", "a", "b", "b", "c"],
        "col3": ["d", "e", "f", "g", "h"],
    }
)

# Group by col2
print(df.groupby(["col2"]).agg(
    {
        "col1": "mean",           # get mean
        "col3": lambda x: list(x) # get list
    }
))

      col1    col3
col2              
a      1.5  [d, e]
b      3.5  [f, g]
c      3.0     [h]


## Dataframe.explode()

In [7]:
df = pd.DataFrame({"a": ["1,2", "4,5"],
                   "b": [11, 13]})

# Turn strings into lists
df.a = df.a.str.split(",")
print(df)

        a   b
0  [1, 2]  11
1  [4, 5]  13


In [8]:
df.explode("a", ignore_index=False)

Unnamed: 0,a,b
0,1,11
0,2,11
1,4,13
1,5,13


## Groupby().count vs Groupby( ).size

In [9]:
df = pd.DataFrame(
    {
        "col1": ["a", "b", "b", "c", "c", "d"],
        "col2": ["S", "S", "M", "L", "L", "L"]
    }
)

# get the count of elements in one column
df.groupby(["col1"]).count()

Unnamed: 0_level_0,col2
col1,Unnamed: 1_level_1
a,1
b,2
c,2
d,1


In [10]:
df.groupby(["col1", "col2"]).size()

col1  col2
a     S       1
b     M       1
      S       1
c     L       2
d     L       1
dtype: int64

## More on groupby

In [14]:
df = pd.read_csv("data/Dummy_Sales_Data_v1.csv")

In [15]:
df_group = df.groupby('Product_Category')
df_group.get_group('Healthcare')
df.groupby("Status").get_group('Not Delivered')
# same as : df[df["Status"]=='Not Delivered']

Unnamed: 0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
0,2951110000999929511,92,238,Not Delivered,2021-08-08,Healthcare,Pablo,21,25.0,Singapore,HC-188,444116
1,2181910000999921819,61,136,Not Delivered,2021-10-03,Office,Pablo,34,14.0,UK,O-555,444772
2,3239110000999932391,67,235,Not Delivered,2021-09-27,Office,Kristen,25,11.0,Kenya,O-188,444666
4,1548310000999915483,13,189,Not Delivered,2021-08-15,Fashion,Stella,24,19.0,Kenya,F-555,444223
5,4934810000999949348,51,204,Not Delivered,2021-11-13,Entertainment,Abdul,32,18.0,UK,ENT-188,445113
...,...,...,...,...,...,...,...,...,...,...,...,...
9974,4241810000999942418,79,220,Not Delivered,2021-08-18,Home,Pablo,32,17.0,Singapore,H-555,444226
9983,4107210000999941072,57,162,Not Delivered,2021-08-26,Healthcare,Abdul,23,21.0,India,HC-188,444334
9986,1868610000999918686,30,162,Not Delivered,2021-12-12,Healthcare,Pablo,35,11.0,Italy,HC-630,445442
9988,4264110000999942641,4,187,Not Delivered,2021-11-20,Healthcare,Maria,34,15.0,UK,HC-203,445220


In [16]:
function_dictionary = {'OrderID':'count','Quantity':'mean'}
df.groupby("Product_Category").aggregate(function_dictionary)

Unnamed: 0_level_0,OrderID,Quantity
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Entertainment,1968,49.851118
Fashion,1971,51.2552
Healthcare,1953,50.905274
Home,2060,50.70534
Office,2011,50.913476


In [17]:
df.groupby("Product_Category")[["Quantity"]].aggregate(['min',
                                                        'max',
                                                        'sum',
                                                        'mean'])

Unnamed: 0_level_0,Quantity,Quantity,Quantity,Quantity
Unnamed: 0_level_1,min,max,sum,mean
Product_Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Entertainment,1,100,98107,49.851118
Fashion,1,100,101024,51.2552
Healthcare,1,100,99418,50.905274
Home,1,100,104453,50.70534
Office,1,100,102387,50.913476


## DF.corwith()

In [11]:
df1 = pd.DataFrame({
    "a": [1, 2, 3, 4],
    "b": [2, 3, 4, 6]
})

df2 = pd.DataFrame({
    "a": [1, 2, 3, 3],
    "b": [2, 2, 5, 4]
})

df1.corrwith(df2)

a    0.94388
b    0.68313
dtype: float64

## Cross Tabulation 2 

N.B : crosstab 1 is in ['Notebook Number 2'](./2-%20Decorators_crosstab_knnimp.ipynb)

In [12]:
network = [
    ("Ben", "Smith"),
    ("Ben", "Patrick"),
    ("Warren", "Jone"),
    ("Warren", "Smith"),
    ("Smith", "Patrick"),
]

# Create a dataframe of the network
friends1 = pd.DataFrame(
    network, columns=["person1", "person2"]
)

# Create the order of the columns
friends2 = pd.DataFrame(
    network, columns=["person2", "person1"]
)

# Create a symmetric dataframe
friends = pd.concat([friends1, friends2])

# Create a cross tabulation
pd.crosstab(friends.person1, friends.person2)

person2,Ben,Jone,Patrick,Smith,Warren
person1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ben,0,0,1,1,0
Jone,0,0,0,0,1
Patrick,1,0,0,1,0
Smith,1,0,1,0,1
Warren,0,1,0,1,0


## Unpivot (melt)

In [13]:
df = pd.DataFrame({
    "fruit": ["apple", "orange"],
    "Aldi": [4, 5],
    "Walmart": [6, 7],
    "Costco": [1, 2]
})

df

Unnamed: 0,fruit,Aldi,Walmart,Costco
0,apple,4,6,1
1,orange,5,7,2


In [14]:
df.melt(id_vars=["fruit"],
        value_vars=["Aldi", "Walmart", "Costco"],
        var_name='store')

Unnamed: 0,fruit,store,value
0,apple,Aldi,4
1,orange,Aldi,5
2,apple,Walmart,6
3,orange,Walmart,7
4,apple,Costco,1
5,orange,Costco,2


## Rename aggregated column

In [15]:
df = pd.DataFrame({"size": ["S", "S", "M", "L"],
                   "price": [44, 29.99, 10, 19]})

df.groupby('size').agg({'price': 'mean'})

Unnamed: 0_level_0,price
size,Unnamed: 1_level_1
L,19.0
M,10.0
S,36.995


## Normalized value counts

In [16]:
size = pd.Series(["S", "S", "M", "L", "S", "XL", "S", "M",])

# Get count of each value
size.value_counts(normalize=True)

S     0.500
M     0.250
L     0.125
XL    0.125
Name: proportion, dtype: float64

## Fill in NULL Values

In [17]:
store1 = pd.DataFrame({
    "orange": [None, 5, 9],
    "apple": [4, None, 12]
})

store2 = pd.DataFrame({
    "orange": [31, 52, 91],
    "apple": [11, 71, 21]
})

# Fill null values of the store1 with values at the same
# locations from store2
store1.combine_first(store2)

Unnamed: 0,orange,apple
0,31.0,4.0
1,5.0,71.0
2,9.0,12.0


## Value_Count missing values

In [18]:
size = pd.Series(["S", "S", None, "M", "L", "S", None, "XL", "S", "M",])

# Get count of each value, it does not count missing values
size.value_counts()


# pass dropna=False to get missing value count
size.value_counts(dropna=False)

S       4
None    2
M       2
L       1
XL      1
Name: count, dtype: int64

## Filter Columns 

In [20]:
df = pd.DataFrame({'Temp': ['Hot', 'Cold', 'Warm', 'Cold'],
                   'Degree': [35, 3, 15, 2]})
print(df)

df = pd.get_dummies(df, columns=['Temp'],dtype=int)
print(df)

print(df.filter(like='Temp', axis=1))


   Temp  Degree
0   Hot      35
1  Cold       3
2  Warm      15
3  Cold       2
   Degree  Temp_Cold  Temp_Hot  Temp_Warm
0      35          0         1          0
1       3          1         0          0
2      15          0         0          1
3       2          1         0          0
   Temp_Cold  Temp_Hot  Temp_Warm
0          0         1          0
1          1         0          0
2          0         0          1
3          1         0          0


## Multiple columns assign using assing

In [22]:
time_sentences = ["Saturday: Weekend (Not working day)",
                  "Sunday: Weekend (Not working day)",
                  "Monday: Doctor appointment at 2:45pm.",
                  "Tuesday: Dentist appointment at 11:30 am.",
                  "Wednesday: basketball game At 7:00pm",
                  "Thursday: Back home by 11:15 pm.",
                  "Friday: Take the train at 08:10 am."]

df = pd.DataFrame(time_sentences, columns=['text'])

# Use Assign instead of using direct assignment 
# df['text'] = df.text.str.lower()
# df['text_len'] = df.text.str.len()
# df['word_count'] = df.text.str.count(" ") + 1
# df['weekend'] = df.text.str.contains("saturday|sunday", case=False)
print((
    df
    .assign(text=df.text.str.lower(),
            text_len=df.text.str.len(),
            word_count=df.text.str.count(" ") + 1,
            weekend=df.text.str.contains("saturday|friday", case=False),
           )
))

                                        text  text_len  word_count  weekend
0        saturday: weekend (not working day)        35           5     True
1          sunday: weekend (not working day)        33           5    False
2      monday: doctor appointment at 2:45pm.        37           5    False
3  tuesday: dentist appointment at 11:30 am.        41           6    False
4       wednesday: basketball game at 7:00pm        36           5    False
5           thursday: back home by 11:15 pm.        32           6    False
6        friday: take the train at 08:10 am.        35           7     True


## Reading html tables

In [25]:
# Without a marcher we will get a list of all tables in the 
# page. To make a table selection, pass table title to the 
# match parameter
table = pd.read_html(
    "https://en.wikipedia.org/wiki/Minnesota", 
    match="Average daily"
)

print(table[0].head())

      Location July (°F) July (°C) January (°F) January (°C)
0  Minneapolis     83/64     28/18         23/7       −4/−13
1   Saint Paul     83/63     28/17         23/6       −5/−14
2    Rochester     82/63     28/17         23/3       −5/−16
3       Duluth     76/55     24/13         19/1       −7/−17
4    St. Cloud     81/58     27/14        18/−1       −7/−18


## Ranking 

In [27]:
df = pd.DataFrame({'Students': ['John', 'Smith', 'Patrick', 'Bob', 'Jose'],
                   'Marks': [80, 56, 95, 75, 45]})
print(df)

df["Rank"] = df["Marks"].rank(ascending=False).astype(int)
print(df)

  Students  Marks
0     John     80
1    Smith     56
2  Patrick     95
3      Bob     75
4     Jose     45
  Students  Marks  Rank
0     John     80     2
1    Smith     56     4
2  Patrick     95     1
3      Bob     75     3
4     Jose     45     5


## Color Values in DataFrame

In [28]:
df = pd.DataFrame({'Students': ['John', 'Smith', 'Patrick', 'Bob', 'Jose'],
                   'Physics': [80, 56, 95, 75, 45], 
                   'Mathematics': [90, 85, 55, 65, 75]})
df.set_index('Students', inplace=True)




In [30]:
def pass_condition(val):
    color = 'blue' if val > 70 else 'red'
    return f"background-color: {color}"

df.style.map(pass_condition)

Unnamed: 0_level_0,Physics,Mathematics
Students,Unnamed: 1_level_1,Unnamed: 2_level_1
John,80,90
Smith,56,85
Patrick,95,55
Bob,75,65
Jose,45,75


## Query 

In [2]:
df = pd.read_csv("data/Dummy_Sales_Data_v1.csv")

In [3]:
df.rename(columns={'UnitPrice(USD)':'UnitPrice', 
                   'Shipping_Cost(USD)':'Shipping_Cost',
                   'Delivery_Time(Days)':'Delivery_Time'},
          inplace=True)
df.head()

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
0,2951110000999929511,92,238,Not Delivered,2021-08-08,Healthcare,Pablo,21,25.0,Singapore,HC-188,444116
1,2181910000999921819,61,136,Not Delivered,2021-10-03,Office,Pablo,34,14.0,UK,O-555,444772
2,3239110000999932391,67,235,Not Delivered,2021-09-27,Office,Kristen,25,11.0,Kenya,O-188,444666
3,1112610000999911126,33,133,Not Shipped,2021-07-30,Fashion,Abdul,34,24.0,USA,F-555,444007
4,1548310000999915483,13,189,Not Delivered,2021-08-15,Fashion,Stella,24,19.0,Kenya,F-555,444223


In [4]:
df.query("sqrt(UnitPrice) < Shipping_Cost/2")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
1,2181910000999921819,61,136,Not Delivered,2021-10-03,Office,Pablo,34,14.0,UK,O-555,444772
3,1112610000999911126,33,133,Not Shipped,2021-07-30,Fashion,Abdul,34,24.0,USA,F-555,444007
5,4934810000999949348,51,204,Not Delivered,2021-11-13,Entertainment,Abdul,32,18.0,UK,ENT-188,445113
6,2750410000999927504,73,242,Not Delivered,2021-07-08,Healthcare,Emma,34,10.0,UK,HC-555,443885
8,2804110000999928041,31,163,Not Shipped,2021-12-23,Fashion,Abdul,34,16.0,Kenya,F-901,445553
...,...,...,...,...,...,...,...,...,...,...,...,...
9988,4264110000999942641,4,187,Not Delivered,2021-11-20,Healthcare,Maria,34,15.0,UK,HC-203,445220
9993,2363310000999923633,82,116,Shipped,2021-09-21,Entertainment,Emma,33,12.0,Singapore,ENT-188,444660
9995,1847410000999918474,37,135,Shipped,2021-10-03,Healthcare,Maria,30,23.0,China,HC-901,444772
9996,2385710000999923857,81,207,Delivered,2021-11-13,Office,Emma,29,18.0,USA,O-203,445113


In [5]:
#Filtering using one condition and Maths calculation in it with built-in function
df.query("sqrt(UnitPrice) > 15")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
0,2951110000999929511,92,238,Not Delivered,2021-08-08,Healthcare,Pablo,21,25.0,Singapore,HC-188,444116
2,3239110000999932391,67,235,Not Delivered,2021-09-27,Office,Kristen,25,11.0,Kenya,O-188,444666
6,2750410000999927504,73,242,Not Delivered,2021-07-08,Healthcare,Emma,34,10.0,UK,HC-555,443885
7,4797510000999947975,48,240,Delivered,2021-10-04,Fashion,Abdul,22,21.0,Kenya,F-203,444773
10,4337210000999943372,57,226,Not Shipped,2021-09-27,Home,John,24,14.0,UK,H-555,444666
...,...,...,...,...,...,...,...,...,...,...,...,...
9967,3167110000999931671,99,233,Delivered,2021-11-02,Home,Emma,30,19.0,Kenya,H-555,445002
9976,4106710000999941067,47,237,Not Shipped,2021-09-10,Office,Stella,24,15.0,USA,O-188,444449
9985,1541410000999915414,92,230,Shipped,2021-10-26,Fashion,Emma,26,24.0,China,F-901,444995
9989,1928010000999919280,17,235,Shipped,2021-09-14,Fashion,Emma,27,18.0,Germany,F-901,444553


In [6]:
df.query("Quantity**2 + Shipping_Cost**2 < 500")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
30,1408910000999914089,7,130,Delivered,2021-07-04,Healthcare,Sofia,20,,UK,HC-630,443881
177,1923610000999919236,3,147,Delivered,2021-12-24,Office,Anthony,20,21.0,Italy,O-203,445554
178,2930810000999929308,3,190,Not Shipped,2021-10-11,Office,Pablo,21,20.0,China,O-555,444880
323,1321310000999913213,6,202,Not Shipped,2021-07-11,Fashion,Emma,21,15.0,USA,F-630,443888
336,2362110000999923621,3,178,Not Shipped,2021-07-10,Fashion,Stella,22,15.0,Italy,F-188,443887
...,...,...,...,...,...,...,...,...,...,...,...,...
9685,1382310000999913823,9,148,Shipped,2021-10-14,Entertainment,Jacob,20,18.0,Kenya,ENT-203,444883
9737,1370410000999913704,2,147,Not Delivered,2021-12-07,Home,Maria,21,15.0,USA,H-101,445337
9769,2248110000999922481,7,205,Delivered,2021-12-23,Office,Pablo,20,18.0,Kenya,O-101,445553
9849,3684910000999936849,2,149,Delivered,2021-12-24,Healthcare,Emma,20,14.0,Kenya,HC-630,445554


In [7]:
# Filtering on Text column
df.query("Status == 'Not Shipped'")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
3,1112610000999911126,33,133,Not Shipped,2021-07-30,Fashion,Abdul,34,24.0,USA,F-555,444007
8,2804110000999928041,31,163,Not Shipped,2021-12-23,Fashion,Abdul,34,16.0,Kenya,F-901,445553
10,4337210000999943372,57,226,Not Shipped,2021-09-27,Home,John,24,14.0,UK,H-555,444666
13,4583610000999945836,46,208,Not Shipped,2021-07-28,Home,Jacob,29,19.0,UK,H-188,444005
15,1444810000999914448,5,119,Not Shipped,2021-10-13,Office,Stella,33,20.0,India,O-555,444882
...,...,...,...,...,...,...,...,...,...,...,...,...
9965,2561710000999925617,55,102,Not Shipped,2021-07-14,Entertainment,Kristen,31,14.0,USA,ENT-101,443991
9971,4839110000999948391,74,106,Not Shipped,2021-09-10,Office,Jacob,25,22.0,Kenya,O-901,444449
9976,4106710000999941067,47,237,Not Shipped,2021-09-10,Office,Stella,24,15.0,USA,O-188,444449
9978,4626810000999946268,10,103,Not Shipped,2021-10-03,Fashion,Sofia,32,19.0,Singapore,F-203,444772


In [8]:
df.query("Quantity != 95")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
0,2951110000999929511,92,238,Not Delivered,2021-08-08,Healthcare,Pablo,21,25.0,Singapore,HC-188,444116
1,2181910000999921819,61,136,Not Delivered,2021-10-03,Office,Pablo,34,14.0,UK,O-555,444772
2,3239110000999932391,67,235,Not Delivered,2021-09-27,Office,Kristen,25,11.0,Kenya,O-188,444666
3,1112610000999911126,33,133,Not Shipped,2021-07-30,Fashion,Abdul,34,24.0,USA,F-555,444007
4,1548310000999915483,13,189,Not Delivered,2021-08-15,Fashion,Stella,24,19.0,Kenya,F-555,444223
...,...,...,...,...,...,...,...,...,...,...,...,...
9994,2301610000999923016,75,201,Not Delivered,2021-10-16,Home,Sofia,20,14.0,Kenya,H-555,444885
9995,1847410000999918474,37,135,Shipped,2021-10-03,Healthcare,Maria,30,23.0,China,HC-901,444772
9996,2385710000999923857,81,207,Delivered,2021-11-13,Office,Emma,29,18.0,USA,O-203,445113
9997,2281610000999922816,18,117,Shipped,2021-12-23,Fashion,Stella,22,24.0,Italy,F-101,445553


In [9]:
df.query("Quantity == 95 | UnitPrice == 182")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
78,3457310000999934573,95,182,Not Shipped,2021-11-16,Healthcare,John,28,25.0,Singapore,HC-203,445116
102,1464410000999914644,95,227,Shipped,2021-10-26,Healthcare,Sofia,30,18.0,UK,HC-901,444995
235,4983710000999949837,62,182,Not Shipped,2021-11-25,Fashion,Sofia,24,21.0,Kenya,F-101,445225
240,3044810000999930448,39,182,Not Shipped,2021-12-18,Home,Maria,28,21.0,Kenya,H-101,445448
256,4575110000999945751,74,182,Not Delivered,2021-08-26,Fashion,Emma,23,19.0,USA,F-630,444334
...,...,...,...,...,...,...,...,...,...,...,...,...
9650,3903910000999939039,95,223,Not Delivered,2021-07-12,Home,Anthony,21,14.0,USA,H-630,443889
9695,2859810000999928598,51,182,Not Delivered,2021-11-01,Office,Stella,24,18.0,USA,O-101,445001
9745,2690010000999926900,72,182,Not Shipped,2021-07-03,Entertainment,Emma,35,22.0,Kenya,ENT-188,443880
9805,1484210000999914842,95,152,Not Delivered,2021-12-30,Fashion,Jacob,30,21.0,Kenya,F-101,445660


In [10]:
df["OrderDate"] = pd.to_datetime(df["OrderDate"], format="%Y-%m-%d")
#Filtering using Datetime columns
df.query("OrderDate.dt.month == 8")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
0,2951110000999929511,92,238,Not Delivered,2021-08-08,Healthcare,Pablo,21,25.0,Singapore,HC-188,444116
4,1548310000999915483,13,189,Not Delivered,2021-08-15,Fashion,Stella,24,19.0,Kenya,F-555,444223
9,1735910000999917359,62,214,Not Delivered,2021-08-14,Fashion,John,22,17.0,Germany,F-901,444222
17,1932510000999919325,62,237,Delivered,2021-08-24,Fashion,Stella,32,13.0,Italy,F-203,444332
32,1859110000999918591,22,155,Shipped,2021-08-14,Home,Jacob,35,13.0,Kenya,H-203,444222
...,...,...,...,...,...,...,...,...,...,...,...,...
9974,4241810000999942418,79,220,Not Delivered,2021-08-18,Home,Pablo,32,17.0,Singapore,H-555,444226
9975,3001610000999930016,80,174,Delivered,2021-08-30,Home,Maria,33,25.0,China,H-188,444338
9977,3374310000999933743,37,156,Delivered,2021-08-28,Home,Maria,23,18.0,Singapore,H-188,444336
9981,4904910000999949049,73,209,Shipped,2021-08-20,Healthcare,Sofia,33,12.0,UK,HC-203,444228


In [11]:
df.query("OrderDate.dt.month == 8 and OrderDate.dt.year == 2021 and OrderDate.dt.day >=15")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
4,1548310000999915483,13,189,Not Delivered,2021-08-15,Fashion,Stella,24,19.0,Kenya,F-555,444223
17,1932510000999919325,62,237,Delivered,2021-08-24,Fashion,Stella,32,13.0,Italy,F-203,444332
33,3255510000999932555,96,202,Delivered,2021-08-20,Office,Anthony,22,24.0,Italy,O-555,444228
57,2585010000999925850,23,143,Delivered,2021-08-24,Office,Jacob,27,17.0,Germany,O-188,444332
64,1999610000999919996,50,197,Delivered,2021-08-27,Home,Sofia,30,15.0,Germany,H-630,444335
...,...,...,...,...,...,...,...,...,...,...,...,...
9974,4241810000999942418,79,220,Not Delivered,2021-08-18,Home,Pablo,32,17.0,Singapore,H-555,444226
9975,3001610000999930016,80,174,Delivered,2021-08-30,Home,Maria,33,25.0,China,H-188,444338
9977,3374310000999933743,37,156,Delivered,2021-08-28,Home,Maria,23,18.0,Singapore,H-188,444336
9981,4904910000999949049,73,209,Shipped,2021-08-20,Healthcare,Sofia,33,12.0,UK,HC-203,444228


In [12]:
df.query("OrderDate > '2021-08-15'")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
1,2181910000999921819,61,136,Not Delivered,2021-10-03,Office,Pablo,34,14.0,UK,O-555,444772
2,3239110000999932391,67,235,Not Delivered,2021-09-27,Office,Kristen,25,11.0,Kenya,O-188,444666
5,4934810000999949348,51,204,Not Delivered,2021-11-13,Entertainment,Abdul,32,18.0,UK,ENT-188,445113
7,4797510000999947975,48,240,Delivered,2021-10-04,Fashion,Abdul,22,21.0,Kenya,F-203,444773
8,2804110000999928041,31,163,Not Shipped,2021-12-23,Fashion,Abdul,34,16.0,Kenya,F-901,445553
...,...,...,...,...,...,...,...,...,...,...,...,...
9993,2363310000999923633,82,116,Shipped,2021-09-21,Entertainment,Emma,33,12.0,Singapore,ENT-188,444660
9994,2301610000999923016,75,201,Not Delivered,2021-10-16,Home,Sofia,20,14.0,Kenya,H-555,444885
9995,1847410000999918474,37,135,Shipped,2021-10-03,Healthcare,Maria,30,23.0,China,HC-901,444772
9996,2385710000999923857,81,207,Delivered,2021-11-13,Office,Emma,29,18.0,USA,O-203,445113


In [13]:
df.query("OrderDate >= '2021-08-15' and OrderDate <= '2021-08-31'")

Unnamed: 0,OrderID,Quantity,UnitPrice,Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost,Delivery_Time,Shipping_Address,Product_Code,OrderCode
4,1548310000999915483,13,189,Not Delivered,2021-08-15,Fashion,Stella,24,19.0,Kenya,F-555,444223
17,1932510000999919325,62,237,Delivered,2021-08-24,Fashion,Stella,32,13.0,Italy,F-203,444332
33,3255510000999932555,96,202,Delivered,2021-08-20,Office,Anthony,22,24.0,Italy,O-555,444228
57,2585010000999925850,23,143,Delivered,2021-08-24,Office,Jacob,27,17.0,Germany,O-188,444332
64,1999610000999919996,50,197,Delivered,2021-08-27,Home,Sofia,30,15.0,Germany,H-630,444335
...,...,...,...,...,...,...,...,...,...,...,...,...
9974,4241810000999942418,79,220,Not Delivered,2021-08-18,Home,Pablo,32,17.0,Singapore,H-555,444226
9975,3001610000999930016,80,174,Delivered,2021-08-30,Home,Maria,33,25.0,China,H-188,444338
9977,3374310000999933743,37,156,Delivered,2021-08-28,Home,Maria,23,18.0,Singapore,H-188,444336
9981,4904910000999949049,73,209,Shipped,2021-08-20,Healthcare,Sofia,33,12.0,UK,HC-203,444228


## Rounding Dates in Pandas , Ceil and Floor 

In [4]:
# Create a sample datetime
date = pd.to_datetime('2024-07-28 15:45:23')
print("Original:", date)
print("Rounded to minute:", date.round('min'))
print("Rounded to hour:", date.round('h'))
print("Rounded to day:", date.round('D'))

Original: 2024-07-28 15:45:23
Rounded to minute: 2024-07-28 15:45:00
Rounded to hour: 2024-07-28 16:00:00
Rounded to day: 2024-07-29 00:00:00


In [5]:
# Create a sample datetime
date = pd.to_datetime('2024-07-28 15:45:23')
print("Original:", date)

# Rounding down (floor)
print("Rounded down to hour:", date.floor('h'))
print("Rounded down to day:", date.floor('D'))



Original: 2024-07-28 15:45:23
Rounded down to hour: 2024-07-28 15:00:00
Rounded down to day: 2024-07-28 00:00:00


In [6]:
# Rounding up (ceil)
print("Rounded up to hour:", date.ceil('h'))
print("Rounded up to day:", date.ceil('D'))

Rounded up to hour: 2024-07-28 16:00:00
Rounded up to day: 2024-07-29 00:00:00


In [8]:
# with multiple dates (pd.series)

dates = pd.Series([
    '2024-07-28 15:45:23',
    '2024-07-28 16:30:45',
    '2024-07-28 17:15:10'
])

dates = pd.to_datetime(dates)

print("Rounded to nearest hour:")
print(dates.round('h'))

print("Rounded down to nearest hour:")
for date in dates:
    print(date.floor('h')) # we notice that we should apply it for each date


Rounded to nearest hour:
0   2024-07-28 15:45:23
1   2024-07-28 16:30:45
2   2024-07-28 17:15:10
dtype: datetime64[ns]
Rounded down to nearest hour:
2024-07-28 15:00:00
2024-07-28 16:00:00
2024-07-28 17:00:00


## Performing Time Arithmetic with pd.Timedelta

In [9]:
# Sample data
date_series = pd.Series(['2023-07-01', '2023-07-02', '2024-01-01'])

# Check the data type
print("Original data type:")
print(date_series.dtype)
print(date_series)

# Convert to datetime
date_series = pd.to_datetime(date_series)

# Add 1 week to each date
date_series_plus_week = date_series + pd.Timedelta(weeks=1)

# Subtract 2 days from each date
date_series_minus_days = date_series - pd.Timedelta(days=2)

print("Original dates:")
print(date_series)
print("\nDates + 1 week:")
print(date_series_plus_week)
print("\nDates - 2 days:")
print(date_series_minus_days)

Original data type:
object
0    2023-07-01
1    2023-07-02
2    2024-01-01
dtype: object
Original dates:
0   2023-07-01
1   2023-07-02
2   2024-01-01
dtype: datetime64[ns]

Dates + 1 week:
0   2023-07-08
1   2023-07-09
2   2024-01-08
dtype: datetime64[ns]

Dates - 2 days:
0   2023-06-29
1   2023-06-30
2   2023-12-30
dtype: datetime64[ns]


While pd.Timedelta is great for adding or subtracting fixed periods like days or weeks, it falls short when dealing with months or years because their length can vary. For these cases, Pandas provides pd.DateOffset, which is specifically designed to handle such irregularities

In [10]:
# Add 1 month to each date
date_series_plus_month = date_series + pd.DateOffset(months=1)

# Subtract 1 year from each date
date_series_minus_year = date_series - pd.DateOffset(years=1)

print("Original dates:")
print(date_series)
print("\nDates + 1 month:")
print(date_series_plus_month)
print("\nDates - 1 year:")
print(date_series_minus_year)

Original dates:
0   2023-07-01
1   2023-07-02
2   2024-01-01
dtype: datetime64[ns]

Dates + 1 month:
0   2023-08-01
1   2023-08-02
2   2024-02-01
dtype: datetime64[ns]

Dates - 1 year:
0   2022-07-01
1   2022-07-02
2   2023-01-01
dtype: datetime64[ns]


## Finding the Difference Between Dates Using pd.Timestamp

In [11]:
# Create two Timestamp objects
date1 = pd.Timestamp('2023-01-01')
date2 = pd.Timestamp('2023-12-31')

# Calculate the difference
difference = date2 - date1

print(f"The difference between {date2.date()} and {date1.date()} is {difference.days} days")

# we can use datetime and dateutils 

The difference between 2023-12-31 and 2023-01-01 is 364 days
