## Groups of Functions in pandas for Data Analysius

### A. Creating Series and DataFrames

In [64]:
# Import padndas Package
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Set up views
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


# Step 2: Define a list
data = [1,2,3,4,5,6,7,8,9,10]

# Step 3: Create the series
series = pd.Series(data)

# Lets view the series that we have created
series.head(10)
series.tail(2)

8     9
9    10
dtype: int64

In [65]:
# lets confirm to be sure we had created a pandas series
type(series)

pandas.core.series.Series

In [66]:
# Lets create a series using the same list but now we will be adding our own serial number
series2 = pd.Series(data, index = ["a","b","c","d","e","f","g","h","i","j"])
series2.head(10)

a     1
b     2
c     3
d     4
e     5
f     6
g     7
h     8
i     9
j    10
dtype: int64

In [67]:
# Lets create a series using python dictionary

# Lets create a python dictionary
data2 = {'a': 10, 'b': 20, 'c': 30}

# lets create the series
series3 = pd.Series(data2)
series3.head()

a    10
b    20
c    30
dtype: int64

#### Hands on practice

In [68]:
# Creating bucket list
list = ["Business","suit","Sneakers","Money","Car","Travel"]

to_pandas = pd.Series(list, index=['a','b','c','d','e','f'])

to_pandas.head(2)

a    Business
b        suit
dtype: object

In [69]:
# Creating simple python disctionary
bio_data = {'Name': "Oluwadamilare", 'Age': 30, 'State': "Ogun", 'Work': "Engr", 'Want': "Financial freedom"}

data_pandas = pd.Series(bio_data)

data_pandas.head()

Name         Oluwadamilare
Age                     30
State                 Ogun
Work                  Engr
Want     Financial freedom
dtype: object

#### Creating Dataframe

In [70]:
# Defining the data using dictionary that is having its values as a list

data = {
    'Name': ["Shola", "Ayo", "Chisom"],
    'Age': [26, 24, 30],
    'Home_Town': ["Benin", "Ibadan", "Lagos"]
}

# Lets create the dataframe using df as short for dataframe
df = pd.DataFrame(data, index= ['i','ii','iii'])
df.head()

Unnamed: 0,Name,Age,Home_Town
i,Shola,26,Benin
ii,Ayo,24,Ibadan
iii,Chisom,30,Lagos


In [71]:
# Lets do something by using the list of dictionaries
data2 = [
    {'Name': "Tola", 'Age': 20, 'Home_Town': "Oyo"},
    {'Name': "Kudi", 'Age': 29, 'Home_Town': "Ekiti"},
    {'Name': "Ola", 'Age': 25, 'Home_Town': ""}
]

# Lets define the dataframe
df2 = pd.DataFrame(data2)
df2.head()

Unnamed: 0,Name,Age,Home_Town
0,Tola,20,Oyo
1,Kudi,29,Ekiti
2,Ola,25,


In [72]:
#  Lets do something again using list of list

data3 = [
    ["Chris", 22, 'Benin'],
    ["Ayo", 25, 'Osun'],
    ["Tope", 227, ''],
]

df3 = pd.DataFrame(data3, columns=["Name", "Age", "Home_Town"])
df3.head()


Unnamed: 0,Name,Age,Home_Town
0,Chris,22,Benin
1,Ayo,25,Osun
2,Tope,227,


In [73]:
# Lets print the types to be sure we have defined dataframes
print(type(df))
print(type(df2))
print(type(df3))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


### B. Data input and Output

In [74]:
# Lets get to work ...
save_data = df3.to_csv('Test_Bio_Data.csv', index= False)
df3.head()

# To read from csv
read_data = pd.read_csv('Test_Bio_Data.csv')

read_data.head()

Unnamed: 0,Name,Age,Home_Town
0,Chris,22,Benin
1,Ayo,25,Osun
2,Tope,227,


### Data Inspection and Exploration

In [75]:
# Using data inspection on my data

# Check head
inspect_data =  pd.read_csv('Test_Bio_Data.csv')
inspect_data.head()

# Check Tail
inspect_data.tail(2)

# Check info
inspect_data.info()

# Describe
inspect_data.describe()

# Shape
inspect_data.shape

# Columns
inspect_data.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Name       3 non-null      object
 1   Age        3 non-null      int64 
 2   Home_Town  2 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes


Index(['Name', 'Age', 'Home_Town'], dtype='object')

### D. Data Cleaning

#### Handling Missing Values

In [76]:
test_data = pd.read_csv('Test_Bio_Data.csv')

# Check for missing values
# test_data.isnull()
test_data.isna()

# Check total number of missing values
test_data.isna().sum()

# fill up missing values
test_data.fillna('Canada')

# Drop missing values
test_data.dropna()


# Finding and Handling Duplicates


Unnamed: 0,Name,Age,Home_Town
0,Chris,22,Benin
1,Ayo,25,Osun


#### Finding and Handling Duplicates

In [77]:
# Checks duplicate
test_data.duplicated()

# Drop duplicate
test_data.drop_duplicates()

Unnamed: 0,Name,Age,Home_Town
0,Chris,22,Benin
1,Ayo,25,Osun
2,Tope,227,


#### Correcting Data Types

In [78]:
test_data = pd.read_csv('Test_Bio_Data.csv')

test_data.dtype()

df.astype()

pd.to_datetime()

AttributeError: 'DataFrame' object has no attribute 'dtype'

### E. Data Selection and Filtering

#### Viewing Data Column

In [None]:
bio_data = pd.read_csv('prac_bio_data.csv')

bio_data.columns

Index(['Timestamp', 'First Name', 'Last Name', 'Course Track', 'City',
       'Gender', 'Seat Number', 'PC-Make', 'PC - OS', 'Feedback'],
      dtype='object')

#### Column Selection

In [None]:
# Lets look through a single column
bio_data['First Name']

# Alternatively using dot
# bio_data.First Name


0             Peter
1          Toyeebat
2         Perpetual
3            Mahfuz
4            Divine
5        Abdulmalik
6          Naheemot
7         Kanyisola
8          Blessing
9            Hannah
10          Deborah
11           Esther
12          Opeyemi
13      Olasunkanmi
14           Saheed
15         Kehinde 
16          Oluwole
17           Samuel
18          Ademola
19           Victor
20           Sherif
21            Ayuba
22            Hamid
23          Olajide
24          Solomon
25    Oluwadamilare
26        Oluwaseyi
27           Adeoye
28        Babatunde
29           Samuel
30          Gabriel
31      Ridwanullah
32      Oluwapelumi
33          Michael
Name: First Name, dtype: object

In [None]:
# Lets select multiple columns
bio_data[['First Name', 'Last Name']].head(15)

Unnamed: 0,First Name,Last Name
0,Peter,Okonmah
1,Toyeebat,Nababa
2,Perpetual,Meninwa
3,Mahfuz,Abdulhameed
4,Divine,Gbadamosi
5,Abdulmalik,Adedotun
6,Naheemot,Adebiyi
7,Kanyisola,Fagbayi
8,Blessing,James
9,Hannah,Tanimola


In [None]:
# Lets Select more columns
bio_data[['First Name', 'Last Name', 'Seat Number']].head(10)

Unnamed: 0,First Name,Last Name,Seat Number
0,Peter,Okonmah,28
1,Toyeebat,Nababa,24
2,Perpetual,Meninwa,22
3,Mahfuz,Abdulhameed,44
4,Divine,Gbadamosi,35
5,Abdulmalik,Adedotun,200
6,Naheemot,Adebiyi,32
7,Kanyisola,Fagbayi,82
8,Blessing,James,45678
9,Hannah,Tanimola,30


#### Cell Selection

In [None]:
# Lets select a single cell
bio_data['First Name'][2]

# Another method
bio_data.at[0, "Last Name"]

# Another method
bio_data.iat[3, 2]

'Abdulhameed'

#### Row Selection

In [None]:
# iloc - to select rows using index slicing (For rows and columns without label)
bio_data.loc[0:5]


# # Combination of row and columns selection
bio_data.iloc[0:5, 0:3]

# # loc - to select rows and columns with their names
bio_data.loc[0:5]
bio_data.loc[0:5, ['First Name']]


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,Mac OS,non
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking
5,2025/09/11 12:58:55 PM GMT+1,Abdulmalik,Adedotun,AI,Abeokuta,Male,200,HP,Windows,Enjoying the course so far


#### Conditional Filtering

In [None]:
# Filters rows where course track is 'AI'. This is going to retuen dataframe
filtered_track = bio_data[bio_data['Course Track'] == 'AI']
print("Rows where Course_Track is 'AI':")
filtered_track

Rows where Course_Track is 'AI':


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,Mac OS,non
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking
5,2025/09/11 12:58:55 PM GMT+1,Abdulmalik,Adedotun,AI,Abeokuta,Male,200,HP,Windows,Enjoying the course so far
6,2025/09/11 12:58:55 PM GMT+1,Naheemot,Adebiyi,AI,Abeokuta,Female,32,DELL,Windows,Grateful for the opportunity to be here.
9,2025/09/11 12:59:28 PM GMT+1,Hannah,Tanimola,AI,Abeokuta,Male,30,HP,Windows,On God
11,2025/09/11 12:59:43 PM GMT+1,Esther,Kudoro,AI,Abeokuta,Female,1,HP,Windows,Chill
13,2025/09/11 1:00:13 PM GMT+1,Olasunkanmi,Rasak,AI,Kobape,Male,3,HP,Windows,My gratitude to the sponsor of this program an...


In [None]:
# Import padndas Package
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Set up views
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


bio_data = pd.read_csv('prac_bio_data.csv')
# Filter rows where City is 'Lagos' and Course Track is Data Science
filtered_city = bio_data[(bio_data['City'] == 'Lagos') & (bio_data['Course Track'] == 'AI')]
print("Rows where city is 'Lagos' and Course Track is 'AI':")
filtered_city

Rows where city is 'Lagos' and Course Track is 'AI':


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.


In [None]:
# Filter rows where city is either 'Lagos'or 'Abuja'
cities = ['Lagos', 'Sango']
city_filtered = bio_data[bio_data['City'].isin(cities)]

print("Rows where city is either Lagos or Sango:")
city_filtered

Rows where city is either Lagos or Sango:


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
7,2025/09/11 12:59:00 PM GMT+1,Kanyisola,Fagbayi,AI;Data Science,Lagos,Female,82,HP,Windows,One chin chin for you for this form
17,2025/09/11 1:00:49 PM GMT+1,Samuel,Oyewusi,Web Dev,Lagos,Male,15,HP,Windows,Satisfactory
25,2025/09/11 1:02:56 PM GMT+1,Oluwadamilare,Bello,AI,Sango,Male,373,DELL;HP;LENOVO;MACBOOK;ASUS;,Mac OS,Coding is interesting when you understand


#### Using the .query() method

In [None]:
# Use query() to filter rows where course Track is 'AI' andFeedback is 'Excellent'
query_filtered = bio_data.query("City == 'Abeokuta' and Feedback == 'Excellent'")
print("Rows Filtered using query() method:")
query_filtered

Rows Filtered using query() method:


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent


In [None]:
# Filter rows where Gender is 'Male'
data_science = bio_data.query("Gender == 'Female'")
print("Students in that are female:")
data_science

Students in the Data Science Track:


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
6,2025/09/11 12:58:55 PM GMT+1,Naheemot,Adebiyi,AI,Abeokuta,Female,32,DELL,Windows,Grateful for the opportunity to be here.
7,2025/09/11 12:59:00 PM GMT+1,Kanyisola,Fagbayi,AI;Data Science,Lagos,Female,82,HP,Windows,One chin chin for you for this form
8,2025/09/11 12:59:16 PM GMT+1,Blessing,James,Cyber Security,Nairobi,Female,45678,HP,Windows,Thanks for creating the form.
10,2025/09/11 12:59:41 PM GMT+1,Deborah,Adelegan,AI;Data Science,Abeokuta,Female,1,HP,Windows,None for now
11,2025/09/11 12:59:43 PM GMT+1,Esther,Kudoro,AI,Abeokuta,Female,1,HP,Windows,Chill
27,2025/09/11 1:03:12 PM GMT+1,Adeoye,Mary,AI,abeokuta,Female,15,LENOVO,Windows,Still processing


In [None]:
# Filter rows using multiple conditions with logical operators
webdev_high_seat_No = bio_data.query("Seat Number > 30 and Course_Track == 'Web Dev'")

webdev_high_seat_No

In [None]:
# Filter rows where PC-Make is either 'Hp' or 'Dell'
hp_dell = bio_data.query("PC_make in ['HP', 'DELL']")
print("Rows where PC_make is either HP or Dell:")
hp_dell


In [None]:
# Using python variable inside query @
# Define a variable for the course track
desired_track = 'Cloud Computing'

# Use the variable in the query expression
cloud_computing_students = bio_data.query("Course_Track == @desired_track")
print("Students in the Cloud Computing track:")
cloud_computing_students

In [None]:
# Filter rows where Feedback != Poor and City is 'Lagos'
good_feedback_lagos = bio_data.query("Feedback != 'Poor' and City == 'Lagos'")
print("Students in Lagos with other feedback than poor:")
good_feedback_lagos

Students in Lagos with other feedback than poor:


Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
7,2025/09/11 12:59:00 PM GMT+1,Kanyisola,Fagbayi,AI;Data Science,Lagos,Female,82,HP,Windows,One chin chin for you for this form
17,2025/09/11 1:00:49 PM GMT+1,Samuel,Oyewusi,Web Dev,Lagos,Male,15,HP,Windows,Satisfactory


In [None]:
#Lets create a more complex query filter for Course_Track,Feedback and Seat_No
complex_query = bio_data.query("Course_Track == 'Data Science' or (Feedback == 'Excellent' and Seat_No < 115)")
print("Complex query result:")
complex_query


### F. Data Transformation

#### Renaming Column Name

In [None]:
# Lets modify the columnn names by fixing the old names as keys and the new ones as values
bio_data.rename(columns={'First Name': 'FirstName', 'Last Name': 'LastName', 'Course Track': 'CourseTrack', 'Seat Number': 'SeatNo', 'PC-Make': 'PcMake', 'PC - OS': 'PcOS'}).head()

Unnamed: 0,Timestamp,FirstName,LastName,CourseTrack,City,Gender,SeatNo,PcMake,PcOS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,Mac OS,non
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking


#### Applying String Methods

In [None]:
bio_data['Feedback'] = bio_data['Feedback'].str.lower()

# Converting to upper case
bio_data['PC - OS'] = bio_data['PC - OS'].str.upper()

# COnverting first name using title()
bio_data['First Name'] = bio_data['First Name'].str.title()

# Defining a lambda function
lambda x: x.str.title()

# .apply() method to help apply the function to the selected column
# bio_data['Last Name'] = bio_data['Last Name'].apply(lambda x: x.str.title())


bio_data.head()

Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,MAC OS,non
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,WINDOWS,excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,WINDOWS,thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,WINDOWS,amazing shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,WINDOWS,brain racking


In [None]:
# We can decide to apply the lambda funtion to every element in the dataset
bio_data.applymap(lambda x: x.str.title())
bio_data.head()

#### Sorting Values

In [6]:
import pandas as pd
bio_data = pd.read_csv('prac_bio_data.csv')
# Sorting columns
bio_data.sort_values(by= 'City', ascending=True)
bio_data.head()

Unnamed: 0,Timestamp,First Name,Last Name,Course Track,City,Gender,Seat Number,PC-Make,PC - OS,Feedback
0,2025/09/11 12:55:34 PM GMT+1,Peter,Okonmah,AI,Ogun,Male,28,MACBOOK,Mac OS,non
1,2025/09/11 12:56:11 PM GMT+1,Toyeebat,Nababa,AI,Abeokuta,Female,24,HP,Windows,Excellent
2,2025/09/11 12:57:08 PM GMT+1,Perpetual,Meninwa,AI,Lagos,Female,22,HP,Windows,Thank you so much for the opportunity.
3,2025/09/11 12:57:56 PM GMT+1,Mahfuz,Abdulhameed,AI,Abeokuta,Male,44,HP,Windows,Amazing Shit
4,2025/09/11 12:58:41 PM GMT+1,Divine,Gbadamosi,AI,Abeokuta,Male,35,DELL,Windows,Brain Racking


In [None]:
# Sorting Columns