### DataFrames in Pandas

In [30]:
# import libraries
import pandas as pd
import numpy as np

In [31]:
# Column-oriented sample records: every keyword becomes one DataFrame column,
# and all five lists have the same length (one entry per person).
data = dict(
    Name=["Alice", "Bob", "Charlie", "David", "Eve"],
    Gender=["Female", "Male", "Male", "Male", "Female"],
    Age=[23, 34, 29, 45, 31],
    Height=[165, 180, 175, 190, 168],
    Weight=[55, 80, 72, 95, 60],
)

In [32]:
# Build the frame from the column dictionary; keys turn into column labels
# and the rows receive a default integer index.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [33]:
# Peek at the top of the frame; n=5 is what head() uses when called bare
df.head(n=5)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [34]:
# Ask for an explicit number of leading rows instead of the default
df.head(n=2)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80


In [35]:
# Peek at the bottom of the frame; n=5 is what tail() uses when called bare
df.tail(n=5)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [36]:
# Ask for an explicit number of trailing rows instead of the default
df.tail(n=2)

Unnamed: 0,Name,Gender,Age,Height,Weight
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [37]:
# info(): structural summary of the frame — index range, per-column
# non-null counts, column dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Gender  5 non-null      object
 2   Age     5 non-null      int64 
 3   Height  5 non-null      int64 
 4   Weight  5 non-null      int64 
dtypes: int64(3), object(2)
memory usage: 332.0+ bytes


In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the quartiles listed here are the ones describe() reports by default.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Age,Height,Weight
count,5.0,5.0,5.0
mean,32.4,175.6,72.4
std,8.11172,9.964939,16.009372
min,23.0,165.0,55.0
25%,29.0,168.0,60.0
50%,31.0,175.0,72.0
75%,34.0,180.0,80.0
max,45.0,190.0,95.0


In [39]:
# Select a single column with bracket notation; the result is a Series
# named after the column.
df["Name"]

0      Alice
1        Bob
2    Charlie
3      David
4        Eve
Name: Name, dtype: object

In [40]:
# Attribute (dot) access to the same column; yields the same Series as df["Name"].
df.Name

0      Alice
1        Bob
2    Charlie
3      David
4        Eve
Name: Name, dtype: object

In [41]:
# Several columns at once: pass a list of labels, get a DataFrame back
df.loc[:, ["Name", "Age"]]

Unnamed: 0,Name,Age
0,Alice,23
1,Bob,34
2,Charlie,29
3,David,45
4,Eve,31


In [42]:
# Row access, label-based: .loc looks rows up by index label.
# Wrapping the label in a list keeps the result a one-row DataFrame.
df.loc[[0], :]

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55


In [43]:
# Row access, position-based: .iloc counts rows from 0 regardless of labels.
# The list form again returns a one-row DataFrame.
df.iloc[[1], :]

Unnamed: 0,Name,Gender,Age,Height,Weight
1,Bob,Male,34,180,80


In [44]:
# Positional slice: rows 0, 1 and 2 (the stop position is excluded)
df.iloc[:3]

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72


In [45]:
# Filtering rows with a boolean mask.
# Bracket column access: keep only the rows whose Gender equals "Male".
males = df.loc[df["Gender"].eq("Male")]
males

Unnamed: 0,Name,Gender,Age,Height,Weight
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95


In [46]:
# Same kind of filter, with the column reached through dot notation
females = df.loc[df.Gender.eq("Female")]
females

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
4,Eve,Female,31,168,60


In [47]:
# Filter an already-filtered frame: start from `females`, then keep Weight < 90
light_females = females.loc[females.Weight.lt(90)]
light_females

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
4,Eve,Female,31,168,60


In [48]:
# Combining conditions with logical operators.
# AND (&): a row must satisfy both masks to be kept.
df.loc[df.Gender.eq("Female") & df.Weight.lt(90)]

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
4,Eve,Female,31,168,60


In [49]:
# AND + OR: the OR group is parenthesised so it is evaluated as one mask
# before being AND-ed with the other two.
# NOTE(review): Age >= 40 is already implied by Age >= 30, so the OR branch
# never changes the result — confirm the intended thresholds.
df.loc[
    df.Gender.eq("Female")
    & df.Weight.lt(90)
    & (df.Age.ge(30) | df.Age.ge(40))
]

Unnamed: 0,Name,Gender,Age,Height,Weight
4,Eve,Female,31,168,60


In [50]:
# Range check built from two comparisons: males with 180 <= Height <= 190
df.loc[
    df.Gender.eq("Male")
    & df.Height.ge(180)
    & df.Height.le(190)
]

Unnamed: 0,Name,Gender,Age,Height,Weight
1,Bob,Male,34,180,80
3,David,Male,45,190,95


In [51]:
# between(): shorthand for a range check; both bounds are inclusive by default
df.loc[df.Height.between(left=170, right=180)]

Unnamed: 0,Name,Gender,Age,Height,Weight
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72


In [52]:
# isin(): keep rows whose Name matches any entry of the given list
df.loc[df.Name.isin(values=["Alice", "Eve"])]

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
4,Eve,Female,31,168,60


In [53]:
# Split the rows into groups keyed by the distinct values of Gender
grouped = df.groupby(by="Gender")

In [54]:
# Pull out the rows that belong to one group key
grouped.get_group(name="Male")

Unnamed: 0,Name,Gender,Age,Height,Weight
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95


In [55]:
# Boolean frame of the same shape: True wherever a value is missing
df.isna()

Unnamed: 0,Name,Gender,Age,Height,Weight
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False


In [56]:
# Missing values per column: sum the True flags down each column
df.isna().sum(axis=0)

Name      0
Gender    0
Age       0
Height    0
Weight    0
dtype: int64

In [57]:
# Replace sentinel codes with NaN so the missing-value tools treat them as gaps.
# The result is reassigned instead of using inplace=True, which keeps the cell
# idempotent on re-run and does not mutate the object behind other references.
# NOTE(review): 33333 and 11111 do not occur in the sample data defined in this
# notebook, so this is a no-op here — confirm they are the intended sentinels.
df = df.replace([33333, 11111], np.nan)

In [59]:
# Return a copy of the frame without the Age column; df itself is not modified
df.drop(columns="Age")

Unnamed: 0,Name,Gender,Height,Weight
0,Alice,Female,165,55
1,Bob,Male,180,80
2,Charlie,Male,175,72
3,David,Male,190,95
4,Eve,Female,168,60


In [60]:
# Discard every row that contains at least one missing value
df.dropna(how="any")

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [61]:
# Discard only the rows in which every value is missing
df.dropna(axis=0, how="all")

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [62]:
# thresh: a row survives only if it holds at least 3 non-null values
df.dropna(axis=0, thresh=3)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [63]:
# Drop incomplete rows and renumber the surviving rows 0..n-1 in one call
df.dropna(how="any", ignore_index=True)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [64]:
# Only Age and Weight are inspected; rows missing either one are dropped
df.dropna(subset=["Age", "Weight"], how="any")

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [65]:
# Substitute one constant for every missing value in the frame
df.fillna(value=0)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [76]:
# Forward fill: each gap takes the last valid value above it in the same column
df.ffill(axis=0)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [77]:
# Backward fill: each gap takes the next valid value below it in the same column
df.bfill(axis=0)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [78]:
# Forward fill, but fill at most 2 consecutive missing values per gap
df.ffill(axis=0, limit=2)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [79]:
# Per-column replacements: the mapping pairs a column label with its fill value;
# columns not listed in the mapping are left untouched
df.fillna(value={"Age": 30, "Gender": "Unknown"})

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
1,Bob,Male,34,180,80
2,Charlie,Male,29,175,72
3,David,Male,45,190,95
4,Eve,Female,31,168,60


In [80]:
# Mean imputation: overwrite Age with a copy whose missing entries are set to
# the column mean. Only entries that are still missing are affected.
df["Age"] = df["Age"].fillna(value=df["Age"].mean())

In [81]:
# Median imputation: overwrite Age with a copy whose missing entries are set to
# the column median. Only entries that are still missing are affected.
df["Age"] = df["Age"].fillna(value=df["Age"].median())

In [82]:
# Mode imputation: mode() returns a Series of the most frequent value(s);
# the first one is used to fill the entries that are still missing.
df["Age"] = df["Age"].fillna(value=df["Age"].mode().iloc[0])

In [85]:
# Estimate missing Age entries by linear interpolation between their neighbours
df["Age"] = df["Age"].interpolate(method="linear")

In [90]:
# Downcast Age to a 32-bit integer to reduce memory usage.
# Remaining gaps are filled with 0 first, because a NaN cannot be stored in a
# plain integer column. astype("int32") is used instead of astype(int):
# astype(int) keeps the default integer width and would not shrink the column.
df["Age"] = df["Age"].fillna(0).astype("int32")

In [91]:
# First small frame for the concatenation example, written row by row
df1 = pd.DataFrame(data=[[1, 3], [2, 4]], columns=["A", "B"])
df1

Unnamed: 0,A,B
0,1,3
1,2,4


In [92]:
# Second small frame with the same columns, written row by row
df2 = pd.DataFrame(data=[[5, 7], [6, 8]], columns=["A", "B"])
df2

Unnamed: 0,A,B
0,5,7
1,6,8


In [94]:
# Stack df2 underneath df1 (row-wise); ignore_index renumbers the rows 0..n-1
# instead of keeping each frame's own labels
df_concat = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)
df_concat

Unnamed: 0,A,B
0,1,3
1,2,4
2,5,7
3,6,8


In [95]:
# Left-hand table for the merge example: one Name per ID
left = pd.DataFrame(dict(ID=[1, 2, 3], Name=["A", "B", "C"]))
left

Unnamed: 0,ID,Name
0,1,A
1,2,B
2,3,C


In [96]:
# Right-hand table for the merge example: one Age per ID
# (ID 4 has no counterpart in `left`, and ID 3 is absent here)
right = pd.DataFrame(dict(ID=[1, 2, 4], Age=[20, 25, 30]))
right

Unnamed: 0,ID,Age
0,1,20
1,2,25
2,4,30


In [98]:
# Inner join on ID: only the IDs present in both tables survive
df_merge = left.merge(right, on="ID", how="inner")
df_merge

Unnamed: 0,ID,Name,Age
0,1,A,20
1,2,B,25


In [99]:
# Draw 3 distinct rows at random; no seed is given, so the pick changes per run
df.sample(n=3, replace=False)

Unnamed: 0,Name,Gender,Age,Height,Weight
1,Bob,Male,34,180,80
3,David,Male,45,190,95
0,Alice,Female,23,165,55


In [100]:
# Draw a share of the rows (here 30%) instead of a fixed count; unseeded,
# so the selected rows change from run to run
df.sample(frac=0.3, replace=False)

Unnamed: 0,Name,Gender,Age,Height,Weight
0,Alice,Female,23,165,55
4,Eve,Female,31,168,60


In [102]:
# Fixing random_state makes the draw repeatable: the same 3 rows on every run
df.sample(n=3, replace=False, random_state=42)

Unnamed: 0,Name,Gender,Age,Height,Weight
1,Bob,Male,34,180,80
4,Eve,Female,31,168,60
2,Charlie,Male,29,175,72
