# Filtering DataFrames

In [1]:
import pandas as pd


# Sample roster used by every exercise below: one tuple per person, with the
# fields listed in the same order as the `columns` argument.
df = pd.DataFrame(
    [
        ("John", 92, 87, "Electrical engineer", "1998-11-01", "A"),
        ("Jane", 94, 90, "Mechanical engineer", "2002-08-14", "B"),
        ("Emily", 87, 87, "Data scientist", "1996-01-12", "B"),
        ("Lisa", 82, 89, "Accountant", "2002-10-24", "A"),
        ("Matt", 90, 94, "Athlete", "2004-04-05", "C"),
        ("Andy", 94, 87, "Dentist", "2005-06-10", "D"),
    ],
    columns=["name", "note_a", "note_b", "profession", "date_of_birth", "group"],
)

# date_of_birth is still plain text at this point; it is cast to a datetime
# type later, in the dt accessor exercises.
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 1 - select columns

In [2]:
df[["name", "note_a"]]

Unnamed: 0,name,note_a
0,John,92
1,Jane,94
2,Emily,87
3,Lisa,82
4,Matt,90
5,Andy,94


## Exercise 2 - select rows and columns with iloc

In [3]:
# iloc selects by integer position: all rows, columns at positions 0 and 1 (name, note_a).
df.iloc[:, [0,1]]

Unnamed: 0,name,note_a
0,John,92
1,Jane,94
2,Emily,87
3,Lisa,82
4,Matt,90
5,Andy,94


## Exercise 3 - select rows and columns with iloc

In [4]:
# Rows at positions 0-2 (the stop position 3 is excluded), columns at positions 0 and 1.
df.iloc[:3, [0,1]]

Unnamed: 0,name,note_a
0,John,92
1,Jane,94
2,Emily,87


## Exercise 4 - select rows and columns with iloc

In [5]:
df.iloc[:3, :3]

Unnamed: 0,name,note_a,note_b
0,John,92,87
1,Jane,94,90
2,Emily,87,87


## Exercise 5 - select rows and columns with loc

In [6]:
# loc selects by label: all rows, only the name and profession columns.
df.loc[:, ["name", "profession"]]

Unnamed: 0,name,profession
0,John,Electrical engineer
1,Jane,Mechanical engineer
2,Emily,Data scientist
3,Lisa,Accountant
4,Matt,Athlete
5,Andy,Dentist


## Exercise 6 - logical operators

In [7]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [8]:
df[df["note_a"] > 90]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 7 - logical operators

In [9]:
df[df.group == "A"]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
3,Lisa,82,89,Accountant,2002-10-24,A


In [10]:
df[df.group != "A"]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 8 - logical operators

In [11]:
df[df["note_a"] > df["note_b"]]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
5,Andy,94,87,Dentist,2005-06-10,D


In [12]:
df[df["note_a"] >= df["note_b"]]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 9 - logical operators

In [13]:
df[(df["note_a"] > 80) & (df["group"] == "B")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B


## Exercise 10 - logical operators

In [14]:
df[(df["note_a"] > 90) | (df["group"] == "B")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 11 - between

In [15]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [16]:
# between() includes both endpoints by default, so the row with note_a == 90 is kept.
df[df["note_a"].between(85, 90)]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
2,Emily,87,87,Data scientist,1996-01-12,B
4,Matt,90,94,Athlete,2004-04-05,C


## Exercise 12 - between

* The `inclusive` parameter accepts "both", "neither", "left", or "right" (default: "both").

In [17]:
df[df["note_a"].between(85, 90, inclusive="neither")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
2,Emily,87,87,Data scientist,1996-01-12,B


In [18]:
# Only the lower bound (85) is inclusive here, so the row with note_a == 90 is dropped.
df[df["note_a"].between(85, 90, inclusive="left")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
2,Emily,87,87,Data scientist,1996-01-12,B


In [19]:
df[df["note_a"].between(85, 90, inclusive="right")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
2,Emily,87,87,Data scientist,1996-01-12,B
4,Matt,90,94,Athlete,2004-04-05,C


## Exercise 13 - isin method

In [20]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [21]:
df[
    (df["group"] == "A") |
    (df["group"] == "B") |
    (df["group"] == "C")
]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C


In [22]:
df[df["group"].isin(["A", "B", "C"])]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C


## Exercise 14 - isin method

In [23]:
df[
    (df["group"].isin(["A", "B", "C"])) &
    (df["note_a"] > 90)
]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B


## Exercise 15 - not (tilde) operator

In [24]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [25]:
df[~df["group"].isin(["A", "B", "C"])]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
5,Andy,94,87,Dentist,2005-06-10,D


In [26]:
df[~df["group"].isin(["A", "B"])]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 16 - str accessor

In [27]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [28]:
df[df["name"].str.startswith("J")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B


## Exercise 17 - str accessor

* `str.startswith` is case-sensitive

In [29]:
# A lowercase "j" matches no name because the prefix comparison is case-sensitive.
df[df["name"].str.startswith("j")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group


In [30]:
df[df["name"].str.lower().str.startswith("j")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B


## Exercise 18 - str accessor

In [31]:
df[df["name"].str.endswith("e")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B


## Exercise 19 - str accessor

In [32]:
df[df["profession"].str.contains("engineer")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B


## Exercise 20 - str accessor

In [33]:
df[
    (df["profession"].str.contains("engineer")) &
    (df["group"] == "A")
]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A


## Exercise 21 - str accessor

In [34]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [35]:
df[df["profession"].str.len() < 10]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 22 - str accessor

* strings with multiple words

In [36]:
df[df["profession"].str.contains(" ")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B


## Exercise 23 - dt accessor

In [37]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [38]:
df.dtypes

name             object
note_a            int64
note_b            int64
profession       object
date_of_birth    object
group            object
dtype: object

In [39]:
# Cast the ISO-formatted date strings to datetime64[ns]; the .dt accessor used in
# the following exercises needs a datetime column. This overwrites the column in df.
df["date_of_birth"] = df["date_of_birth"].astype("datetime64[ns]")

df.dtypes

name                     object
note_a                    int64
note_b                    int64
profession               object
date_of_birth    datetime64[ns]
group                    object
dtype: object

## Exercise 24 - dt accessor

In [40]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [41]:
df[df["date_of_birth"].dt.year < 2000]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
2,Emily,87,87,Data scientist,1996-01-12,B


## Exercise 25 - dt accessor

In [42]:
df[df["date_of_birth"].dt.month == 10]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
3,Lisa,82,89,Accountant,2002-10-24,A


## Exercise 26 - dt accessor

In [43]:
df[df["date_of_birth"].dt.month.isin([8, 9, 10])]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
3,Lisa,82,89,Accountant,2002-10-24,A


## Exercise 27 - dt accessor

In [44]:
df[
    (df["date_of_birth"].dt.month.isin([8, 9, 10])) &
    (df["group"] == "A")
]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
3,Lisa,82,89,Accountant,2002-10-24,A


## Exercise 28 - dt accessor

In [45]:
df["date_of_birth"].dt.quarter

0    4
1    3
2    1
3    4
4    2
5    2
Name: date_of_birth, dtype: int32

In [46]:
df[df["date_of_birth"].dt.quarter == 2]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 29 - dt accessor

In [47]:
df[df["date_of_birth"].dt.day <= 15]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 30 - dt accessor

In [48]:
df["date_of_birth"].dt.to_period("M")

0    1998-11
1    2002-08
2    1996-01
3    2002-10
4    2004-04
5    2005-06
Name: date_of_birth, dtype: period[M]

In [49]:
# Reduce each date to year-month precision, then compare with a "YYYY-MM" string.
df[df["date_of_birth"].dt.to_period("M") == "2002-08"]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B


## Exercise 31 - dt accessor

In [50]:
pd.to_datetime("today")

Timestamp('2023-06-22 19:37:42.136726')

In [51]:
pd.to_datetime("today") - df["date_of_birth"]

0    8999 days 19:37:42.139942
1    7617 days 19:37:42.139942
2   10023 days 19:37:42.139942
3    7546 days 19:37:42.139942
4    7017 days 19:37:42.139942
5    6586 days 19:37:42.139942
Name: date_of_birth, dtype: timedelta64[ns]

In [52]:
# Find the number of days in a timedelta object
(pd.to_datetime("today") - df["date_of_birth"]).dt.days

0     8999
1     7617
2    10023
3     7546
4     7017
5     6586
Name: date_of_birth, dtype: int64

In [53]:
365 * 20

7300

In [54]:
# Keep the people older than 20: anyone born before the calendar date exactly
# 20 years ago. A fixed 365 * 20 = 7300 day threshold ignores leap days, so it
# places the cutoff several days away from the real 20th birthday.
df[df["date_of_birth"] < pd.to_datetime("today") - pd.DateOffset(years=20)]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A


## Exercise 32 - nlargest

In [55]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,90,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [56]:
df.nlargest(3, "note_a")

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
5,Andy,94,87,Dentist,2005-06-10,D
0,John,92,87,Electrical engineer,1998-11-01,A


In [57]:
df.sort_values(by="note_a", ascending=False)

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
5,Andy,94,87,Dentist,2005-06-10,D
0,John,92,87,Electrical engineer,1998-11-01,A
4,Matt,90,94,Athlete,2004-04-05,C
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A


## Exercise 33 - nsmallest

In [58]:
df.nsmallest(3, "note_a")

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
3,Lisa,82,89,Accountant,2002-10-24,A
2,Emily,87,87,Data scientist,1996-01-12,B
4,Matt,90,94,Athlete,2004-04-05,C


## Exercise 34 - nlargest and nsmallest in case of equality

In [59]:
df.sort_values(by="note_a", ascending=False)

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
5,Andy,94,87,Dentist,2005-06-10,D
0,John,92,87,Electrical engineer,1998-11-01,A
4,Matt,90,94,Athlete,2004-04-05,C
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A


In [60]:
# Position (4, 1) is Matt's note_a; raise it from 90 to 92 so that he ties with John.
df.iloc[4, 1] = 92

df.sort_values(by="note_a", ascending=False)

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
5,Andy,94,87,Dentist,2005-06-10,D
0,John,92,87,Electrical engineer,1998-11-01,A
4,Matt,92,94,Athlete,2004-04-05,C
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A


In [61]:
df.nlargest(3, "note_a")

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
5,Andy,94,87,Dentist,2005-06-10,D
0,John,92,87,Electrical engineer,1998-11-01,A


In [62]:
# keep="all" retains every row tied for the last place, so four rows come back instead of three.
df.nlargest(3, "note_a", keep="all")

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
5,Andy,94,87,Dentist,2005-06-10,D
0,John,92,87,Electrical engineer,1998-11-01,A
4,Matt,92,94,Athlete,2004-04-05,C


## Exercise 35 - nlargest and nsmallest with multiple columns

In [63]:
# Rank by note_a first; note_b breaks ties, so Matt (92, 94) is ranked ahead of John (92, 87).
df.nlargest(3, ["note_a", "note_b"], keep="all")

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
5,Andy,94,87,Dentist,2005-06-10,D
4,Matt,92,94,Athlete,2004-04-05,C


## Exercise 36 - query function

* The query function accepts strings as filters.

In [64]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [65]:
df.query("note_a > 90")

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 37 - query function

In [66]:
df.query("group == 'A'")

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
3,Lisa,82,89,Accountant,2002-10-24,A


## Exercise 38 - query function

In [67]:
df.query("note_a > 90 and group == 'A'")

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A


## Exercise 39 - query function

In [68]:
df.query("note_a > 90 or group == 'A'")

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 40 - apply function

In [69]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [70]:
df[df["profession"].apply(lambda x: len(x) > 15)]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B


## Exercise 41 - apply function

In [71]:
# Store the notes as text so that a malformed entry can be simulated.
df["note_a"] = df["note_a"].astype("string")

# Position (0, 1) is John's note_a; corrupt it with a trailing "?".
df.iloc[0, 1] = "92?"

df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92?,87,Electrical engineer,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [72]:
df.dtypes

name                     object
note_a           string[python]
note_b                    int64
profession               object
date_of_birth    datetime64[ns]
group                    object
dtype: object

In [73]:
df[df["note_a"].apply(lambda x: x.isnumeric())]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [74]:
# Keep only the rows whose note_a text is not made up purely of numeric characters.
df[df["note_a"].apply(lambda value: not value.isnumeric())]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92?,87,Electrical engineer,1998-11-01,A


## Exercise 42 - isna

In [75]:
# Position (0, 3) is John's profession; blank it out to create a missing value.
df.iloc[0, 3] = None

df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92?,87,,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [76]:
df[df["profession"].isna()]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92?,87,,1998-11-01,A


In [77]:
# isna() already yields a boolean mask, so comparing it with True selects the
# same rows as the plain isna() filter in the previous cell.
df[df["profession"].isna()==True]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92?,87,,1998-11-01,A


## Exercise 43 - notna

In [78]:
# Invert the missing-value mask with ~ to keep the rows that do have a profession.
df[~df["profession"].isna()]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [79]:
df[df["profession"].notna()]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 44 - sample function

In [80]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92?,87,,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [81]:
# Draw 3 rows at random. No random_state is passed, so the rows picked are
# expected to change from run to run.
df.sample(n=3)

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C


## Exercise 45 - sample function

In [82]:
df.sample(frac=0.5)

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
3,Lisa,82,89,Accountant,2002-10-24,A
0,John,92?,87,,1998-11-01,A
4,Matt,92,94,Athlete,2004-04-05,C


In [83]:
df.sample(frac=0.8)

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D
1,Jane,94,90,Mechanical engineer,2002-08-14,B
0,John,92?,87,,1998-11-01,A
3,Lisa,82,89,Accountant,2002-10-24,A


## Exercise 46 - sample function

* Use the replace parameter to allow or disallow sampling of the same row more than once.
* Default value is False.

In [84]:
# replace=True lets the same row be drawn more than once; whether duplicates
# actually show up depends on the random draw.
df.sample(n=5, replace=True)

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
4,Matt,92,94,Athlete,2004-04-05,C
2,Emily,87,87,Data scientist,1996-01-12,B
1,Jane,94,90,Mechanical engineer,2002-08-14,B
0,John,92?,87,,1998-11-01,A
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 47 - problem

* Find the people who are in group B and were born before 2000

In [85]:
# Both conditions must hold: member of group B and birth year earlier than 2000.
df[df["group"].eq("B") & df["date_of_birth"].dt.year.lt(2000)]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
2,Emily,87,87,Data scientist,1996-01-12,B


## Exercise 48 - problem

* Find the people whose names start with a letter that comes after J in alphabetical order

In [86]:
# Pitfall: this compares whole strings, and a longer string sorts after its own
# prefix ("John" > "J" is True), so the names starting with J are still included.
df[df["name"] > "J"]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92?,87,,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C


In [87]:
# Compare only the first letter so the filter states the task directly: keep the
# names whose initial sorts after "J". Comparing the whole name against "K" would
# miss a name that is exactly "K".
df[df["name"].str[0] > "J"]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C


## Exercise 49 - problem

* Find the people whose total note is greater than 180

In [88]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92?,87,,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [89]:
# The data type of this column is string so use "92" instead of 92
df.iloc[0, 1] = "92" 

# Change the data type
df["note_a"] = df["note_a"].astype("int") 

df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [90]:
# Add the two notes row by row and keep the rows whose total exceeds 180.
df[df["note_a"].add(df["note_b"]).gt(180)]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


## Exercise 50 - problem

* Find the people whose average note is greater than 91

In [91]:
df[(df["note_a"] + df["note_b"])/2 > 91]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
1,Jane,94,90,Mechanical engineer,2002-08-14,B
4,Matt,92,94,Athlete,2004-04-05,C


## Exercise 51 - problem

* Find the people who were born in January, February, or March

In [92]:
# dt.month is 1-based (January == 1, December == 12), so January, February and
# March are months 1, 2 and 3. Month 0 never occurs.
df[df["date_of_birth"].dt.month.isin([1, 2, 3])]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
2,Emily,87,87,Data scientist,1996-01-12,B


In [93]:
# Same task via the quarter number: quarter 1 covers January through March.
df[df["date_of_birth"].dt.quarter == 1]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
2,Emily,87,87,Data scientist,1996-01-12,B


## Exercise 52 - problem

* Find the people whose total note is higher than 180 and are not in group B

In [94]:
df

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
0,John,92,87,,1998-11-01,A
1,Jane,94,90,Mechanical engineer,2002-08-14,B
2,Emily,87,87,Data scientist,1996-01-12,B
3,Lisa,82,89,Accountant,2002-10-24,A
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D


In [95]:
# Total of both notes above 180, restricted to people outside group B.
df[df["note_a"].add(df["note_b"]).gt(180) & df["group"].ne("B")]

Unnamed: 0,name,note_a,note_b,profession,date_of_birth,group
4,Matt,92,94,Athlete,2004-04-05,C
5,Andy,94,87,Dentist,2005-06-10,D
