In [1]:
import pandas as pd
import numpy as np

# Part 1: Reading and Exploring Data


In [2]:
# Stand-in for reading a CSV file: the table is written out row by row and
# then pivoted into the column-oriented dict that pd.DataFrame consumes.
# Charlie has no Age and David has no Score (both recorded as np.nan).
columns = ['Name', 'Age', 'Gender', 'Score', 'City']
records = [
    ('Alice', 25, 'F', 85, 'Kathmandu'),
    ('Bob', 30, 'M', 90, 'Pokhara'),
    ('Charlie', np.nan, 'M', 88, 'Lalitpur'),
    ('David', 35, 'M', np.nan, 'Biratnagar'),
    ('Eva', 40, 'F', 95, 'Butwal'),
]

# zip(*records) transposes the rows into per-column tuples; each is turned
# back into a list so `data` maps column name -> list of values.
data = {name: list(values) for name, values in zip(columns, zip(*records))}

df = pd.DataFrame(data)
print("Initial Data:\n", df)

Initial Data:
       Name   Age Gender  Score        City
0    Alice  25.0      F   85.0   Kathmandu
1      Bob  30.0      M   90.0     Pokhara
2  Charlie   NaN      M   88.0    Lalitpur
3    David  35.0      M    NaN  Biratnagar
4      Eva  40.0      F   95.0      Butwal


# Part 2: Cleaning Data


In [3]:

# Clean the raw frame with one chained, non-mutating expression and rebind
# `df` to the result, so this cell can be executed more than once.
#
# The rename is deliberately the first step: DataFrame.rename ignores labels
# that are not present, so on a re-run (when 'Score' has already become
# 'TestScore') it is a no-op and the following steps still find their column.
# Renaming in place *after* reading df['Score'] raised KeyError: 'Score' the
# second time the cell was run.
df = (
    df
    # Rename columns
    .rename(columns={'Score': 'TestScore'})
    .assign(
        # Fill missing Age with the column mean, then round before casting to
        # int: a bare astype(int) truncates toward zero and would bias the
        # imputed value downward (for the data above, 32.5 becomes 32 either
        # way, so the displayed result is unchanged).
        Age=lambda frame: (
            frame['Age'].fillna(frame['Age'].mean()).round().astype(int)
        ),
        # Fill missing TestScore with the column median.
        TestScore=lambda frame: (
            frame['TestScore'].fillna(frame['TestScore'].median())
        ),
    )
)

print("\nCleaned Data:\n", df)


Cleaned Data:
       Name  Age Gender  TestScore        City
0    Alice   25      F       85.0   Kathmandu
1      Bob   30      M       90.0     Pokhara
2  Charlie   32      M       88.0    Lalitpur
3    David   35      M       89.0  Biratnagar
4      Eva   40      F       95.0      Butwal


# Part 3: Filtering and Sorting


In [4]:
# Keep only the rows whose Age is strictly greater than 30
# (boolean mask applied through .loc).
filtered_df = df.loc[df['Age'].gt(30)]
print("\nAge > 30:\n", filtered_df)

# Order every row from the highest TestScore to the lowest.
sorted_df = df.sort_values('TestScore', ascending=False)
print("\nSorted by TestScore:\n", sorted_df)


Age > 30:
       Name  Age Gender  TestScore        City
2  Charlie   32      M       88.0    Lalitpur
3    David   35      M       89.0  Biratnagar
4      Eva   40      F       95.0      Butwal

Sorted by TestScore:
       Name  Age Gender  TestScore        City
4      Eva   40      F       95.0      Butwal
1      Bob   30      M       90.0     Pokhara
3    David   35      M       89.0  Biratnagar
2  Charlie   32      M       88.0    Lalitpur
0    Alice   25      F       85.0   Kathmandu


# Part 4: Grouping and Aggregation


In [5]:
# Average Age and TestScore within each Gender group; the result is indexed
# by Gender with one column per averaged field.
grouped = df.groupby('Gender')[['Age', 'TestScore']].mean()
print("\nGroup by Gender:\n", grouped)


Group by Gender:
               Age  TestScore
Gender                      
F       32.500000       90.0
M       32.333333       89.0


# Part 5: ML Prep – One-hot Encoding & Normalization


In [6]:

# Replace the categorical Gender column with one indicator column per
# category (Gender_F, Gender_M); the remaining columns are carried over.
encoded_df = pd.get_dummies(df, columns=['Gender'])
print("\nOne-hot Encoded:\n", encoded_df)

# Min-max scaling of TestScore: (x - min) / (max - min), so the lowest score
# maps to 0 and the highest to 1.
test_scores = encoded_df['TestScore']
score_min = test_scores.min()
score_max = test_scores.max()
encoded_df['TestScore_Norm'] = (test_scores - score_min) / (score_max - score_min)
print("\nWith Normalized Score:\n", encoded_df)


One-hot Encoded:
       Name  Age  TestScore        City  Gender_F  Gender_M
0    Alice   25       85.0   Kathmandu      True     False
1      Bob   30       90.0     Pokhara     False      True
2  Charlie   32       88.0    Lalitpur     False      True
3    David   35       89.0  Biratnagar     False      True
4      Eva   40       95.0      Butwal      True     False

With Normalized Score:
       Name  Age  TestScore        City  Gender_F  Gender_M  TestScore_Norm
0    Alice   25       85.0   Kathmandu      True     False             0.0
1      Bob   30       90.0     Pokhara     False      True             0.5
2  Charlie   32       88.0    Lalitpur     False      True             0.3
3    David   35       89.0  Biratnagar     False      True             0.4
4      Eva   40       95.0      Butwal      True     False             1.0
