In [2]:
import numpy as np
import pandas as pd

# Load the contents of 'test_data.csv' (path relative to the notebook's
# working directory) into the DataFrame `df`
df = pd.read_csv('test_data.csv')

# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of df. The per-column count can differ because missing
# values are not counted.
df.describe()

Unnamed: 0,ID,Age,Score
count,8.0,6.0,7.0
mean,4.5,25.0,66.285714
std,2.44949,6.78233,32.57665
min,1.0,18.0,-1.0
25%,2.75,19.75,60.0
50%,4.5,23.5,78.0
75%,6.25,29.5,87.5
max,8.0,35.0,92.0


# Filtering a DataFrame

In [3]:
# Keep only the rows whose Age is strictly above 18.
# A missing Age compares as False, so those rows are left out as well.
df_filtered_age = df.loc[df['Age'].gt(18)]

# Print a caption line followed by the filtered rows
print("DataFrame filtered by Age > 18:", df_filtered_age, sep="\n")

DataFrame filtered by Age > 18:
   ID     Name   Age     City  Score
0   1   Alicia  25.0       NY   85.0
2   3  Charlie  22.0  Chicago   -1.0
3   4  Dominic  19.0  Houston   65.0
4   5      Eve  31.0  Phoenix   78.0
5   6    Frank  35.0   Boston    NaN


In [4]:
# Keep only the rows whose Score lies strictly between 50 and 100
# (both bounds excluded). A missing Score fails both comparisons,
# so those rows are not selected.
df_filtered_score = df.loc[df['Score'].gt(50) & df['Score'].lt(100)]

# Print a caption (preceded by a blank line) followed by the filtered rows
print("\nDataFrame filtered by 50 < Score < 100:", df_filtered_score, sep="\n")


DataFrame filtered by 50 < Score < 100:
   ID     Name   Age       City  Score
0   1   Alicia  25.0         NY   85.0
1   2      Bob   NaN         LA   92.0
3   4  Dominic  19.0    Houston   65.0
4   5      Eve  31.0    Phoenix   78.0
6   7    Grace   NaN     Austin   55.0
7   8    Heidi  18.0  San Diego   90.0


# Modifying a DataFrame

In [5]:
# Drop every row whose 'Score' is negative, identified by its index label.
# Rows with a missing 'Score' do not match the comparison and are kept.
# `df` is not modified; the result is stored as a new DataFrame.
df_cleaned = df.drop(index=df.loc[df['Score'].lt(0)].index)

# Show the DataFrame without the negative-score rows
df_cleaned

Unnamed: 0,ID,Name,Age,City,Score
0,1,Alicia,25.0,NY,85.0
1,2,Bob,,LA,92.0
3,4,Dominic,19.0,Houston,65.0
4,5,Eve,31.0,Phoenix,78.0
5,6,Frank,35.0,Boston,
6,7,Grace,,Austin,55.0
7,8,Heidi,18.0,San Diego,90.0


In [6]:
# Drop the 'City' column and rebind df_cleaned to the result.
# NOTE: this cell is not re-runnable on its own -- once 'City' is gone,
# a second run raises KeyError. Re-run from the cell that creates df_cleaned.
df_cleaned = df_cleaned.drop(columns='City')

# Show the DataFrame without the 'City' column
df_cleaned

Unnamed: 0,ID,Name,Age,Score
0,1,Alicia,25.0,85.0
1,2,Bob,,92.0
3,4,Dominic,19.0,65.0
4,5,Eve,31.0,78.0
5,6,Frank,35.0,
6,7,Grace,,55.0
7,8,Heidi,18.0,90.0


In [7]:
# Average of the known ages; mean() skips missing values by default
mean_age = df_cleaned['Age'].mean()

# Fill the gaps in 'Age' with that average
df_cleaned['Age'] = df_cleaned['Age'].fillna(value=mean_age)

# Show the DataFrame with 'Age' imputed
df_cleaned

Unnamed: 0,ID,Name,Age,Score
0,1,Alicia,25.0,85.0
1,2,Bob,25.6,92.0
3,4,Dominic,19.0,65.0
4,5,Eve,31.0,78.0
5,6,Frank,35.0,
6,7,Grace,25.6,55.0
7,8,Heidi,18.0,90.0


In [8]:
# Average of the known scores; mean() skips missing values by default
mean_score = df_cleaned['Score'].mean()

# Substitute that average for every 'Score' equal to 0.
# Only exact zeros are replaced; missing scores remain missing.
df_cleaned['Score'] = df_cleaned['Score'].replace(to_replace=0, value=mean_score)

# Show the DataFrame after the replacement
df_cleaned

Unnamed: 0,ID,Name,Age,Score
0,1,Alicia,25.0,85.0
1,2,Bob,25.6,92.0
3,4,Dominic,19.0,65.0
4,5,Eve,31.0,78.0
5,6,Frank,35.0,
6,7,Grace,25.6,55.0
7,8,Heidi,18.0,90.0


In [9]:
# Discard every row that still contains at least one missing value
df_cleaned = df_cleaned.dropna(axis=0, how='any')

# Show the fully cleaned DataFrame
df_cleaned

Unnamed: 0,ID,Name,Age,Score
0,1,Alicia,25.0,85.0
1,2,Bob,25.6,92.0
3,4,Dominic,19.0,65.0
4,5,Eve,31.0,78.0
6,7,Grace,25.6,55.0
7,8,Heidi,18.0,90.0


In [10]:
# Promote the 'ID' column to the row index (it stops being a regular column).
# NOTE: this cell is not re-runnable on its own -- after the first run 'ID'
# is no longer a column, so a second run raises KeyError.
df_cleaned = df_cleaned.set_index(keys='ID')

# Show the DataFrame indexed by 'ID'
df_cleaned

Unnamed: 0_level_0,Name,Age,Score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Alicia,25.0,85.0
2,Bob,25.6,92.0
4,Dominic,19.0,65.0
5,Eve,31.0,78.0
7,Grace,25.6,55.0
8,Heidi,18.0,90.0
