# Pandas Practice Questions

This notebook covers essential pandas operations including data loading, filtering, grouping, handling missing values, and data manipulation.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Build the demonstration dataset: ten employee records, written row by row.
# One Salary value and one Performance_Score value are NaN.
np.random.seed(42)

sample_columns = ['Name', 'Age', 'Salary', 'Department', 'Experience', 'Performance_Score']
sample_rows = [
    ('Alice',   25, 50000,  'IT',      2, 85),
    ('Bob',     30, 60000,  'HR',      5, 92),
    ('Charlie', 35, 70000,  'IT',      8, 88),
    ('Diana',   28, 55000,  'Finance', 3, 90),
    ('Eve',     32, 65000,  'IT',      6, 87),
    ('Frank',   29, np.nan, 'HR',      4, 89),
    ('Grace',   27, 58000,  'Finance', 2, np.nan),
    ('Henry',   31, 72000,  'IT',      7, 94),
    ('Ivy',     26, 52000,  'HR',      1, 86),
    ('Jack',    33, 68000,  'Finance', 9, 91),
]

# Pivot the row-wise records into a column-name -> list-of-values mapping,
# which is the shape the DataFrame is constructed from.
data = {
    column: list(values)
    for column, values in zip(sample_columns, zip(*sample_rows))
}

df = pd.DataFrame(data)
print("Sample dataset created successfully!")

## 1. Load Dataset and Display First Rows

Inspect the sample dataset created above: display the first few rows along with basic information about the dataset.

In [None]:
# Overview of the dataset: dimensions, schema, a preview, and summary statistics.
print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())
print("\nData Types:")
print(df.dtypes)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()
print("\nBasic Statistics:")
# describe() summarises only the numeric columns by default.
print(df.describe())

## 2. Filter Rows and Show Specific Columns

Filter the dataset based on certain conditions and display specific columns.

In [None]:
# Each filter is expressed as a named boolean mask, applied to df, and then
# narrowed to the columns relevant to that question for display.

# Employees older than 30
is_over_30 = df['Age'] > 30
filtered_age = df[is_over_30]
print("Employees with age > 30:")
print(filtered_age.loc[:, ['Name', 'Age', 'Department']])
print()

# Employees in the IT department
is_it = df['Department'] == 'IT'
it_employees = df[is_it]
print("IT Department employees:")
print(it_employees.loc[:, ['Name', 'Salary', 'Experience']])
print()

# Combined condition: IT department AND more than 5 years of experience
has_over_5_years = df['Experience'] > 5
experienced_it = df[is_it & has_over_5_years]
print("IT employees with experience > 5 years:")
print(experienced_it.loc[:, ['Name', 'Age', 'Experience', 'Salary']])
print()

# Salary within [55000, 70000], both bounds inclusive. A NaN salary is never
# "between" anything, so rows with a missing salary drop out automatically.
in_salary_band = df['Salary'].between(55000, 70000)
salary_range = df[in_salary_band]
print("Employees with salary between 55000 and 70000:")
print(salary_range.loc[:, ['Name', 'Salary', 'Department']])

## 3. Group by Department and Calculate Means

Group the data by department and calculate mean values for numerical columns.

In [None]:
# One groupby object serves both department-level summaries below.
by_department = df.groupby('Department')

# Average of every numeric column within each department
dept_means = by_department.mean(numeric_only=True)
print("Mean values by Department:")
print(dept_means)
print()

# Several aggregates per column; the result has two-level column labels
# of the form (column, statistic).
department_aggregations = {
    'Age': ['mean', 'min', 'max'],
    'Salary': ['mean', 'median', 'count'],
    'Experience': ['mean', 'std'],
    'Performance_Score': ['mean', 'count'],
}
dept_stats = by_department.agg(department_aggregations)
print("Detailed statistics by Department:")
print(dept_stats)
print()

# Two-key grouping. Note that this adds an 'Experience_Level' column to df
# itself: up to and including 3 years is 'Junior', anything else is 'Senior'.
print("Average salary by Department and Experience level:")
is_junior = df['Experience'].le(3)
df['Experience_Level'] = is_junior.map({True: 'Junior', False: 'Senior'})
multi_group = df.groupby(['Department', 'Experience_Level'])['Salary'].mean()
print(multi_group)

## 4. Handle Missing Values

Identify and handle missing values in the dataset using various strategies.

In [None]:
# Count the missing values per column
print("Missing values in each column:")
print(df.isnull().sum())
print()

# Same counts expressed as a percentage of the number of rows
print("Percentage of missing values:")
missing_percent = (df.isnull().sum() / len(df)) * 100
print(missing_percent)
print()

# Show every row that has at least one missing value
print("Rows with missing values:")
rows_with_missing = df[df.isnull().any(axis=1)]
print(rows_with_missing)
print()

# Fill strategies (NaNs are skipped when computing the statistics themselves):
#   Strategy 1: missing Salary            -> mean of the observed salaries
#   Strategy 2: missing Performance_Score -> median of the observed scores
mean_salary = df['Salary'].mean()
median_performance = df['Performance_Score'].median()

# Build the cleaned frame with one non-inplace fillna call. The previous
# `df_cleaned['Salary'].fillna(..., inplace=True)` form is chained assignment on
# a column selection: pandas 2.x warns about it, and under Copy-on-Write it only
# modifies a temporary object, so the NaNs would silently stay in df_cleaned.
# fillna returns a new DataFrame, so df itself is left untouched.
df_cleaned = df.fillna({
    'Salary': mean_salary,
    'Performance_Score': median_performance,
})

# Guard against the fill silently not taking effect.
assert not df_cleaned[['Salary', 'Performance_Score']].isnull().any().any()

print("After handling missing values:")
print(df_cleaned.isnull().sum())
print("\nCleaned dataset:")
print(df_cleaned)

## 5. Create New Column and Find Top 3 Values

Create new calculated columns and identify top performers in the dataset.

In [None]:
def categorize_performance(score):
    """Return the performance label for a numeric score.

    A score of 90 or more is 'Excellent', a score of at least 85 (but
    below 90) is 'Good', and anything else is 'Average'.
    """
    if score >= 90:
        return 'Excellent'
    if score >= 85:
        return 'Good'
    return 'Average'


# Derive three new columns on a fresh copy of the cleaned data:
#   Salary_per_Experience - salary divided by years of experience
#   Performance_Category  - label derived from Performance_Score
#   Total_Score           - weighted sum: 0.6 * score + 5 * experience + 0.5 * age
df_final = df_cleaned.assign(
    Salary_per_Experience=df_cleaned['Salary'] / df_cleaned['Experience'],
    Performance_Category=df_cleaned['Performance_Score'].apply(categorize_performance),
    Total_Score=(df_cleaned['Performance_Score'] * 0.6)
    + (df_cleaned['Experience'] * 5)
    + (df_cleaned['Age'] * 0.5),
)

print("Dataset with new columns:")
new_column_preview = ['Name', 'Salary_per_Experience', 'Performance_Category', 'Total_Score']
print(df_final[new_column_preview].head())
print()

# Top-3 rankings: narrow to the columns of interest first, then keep the three
# rows with the largest value in the ranking column.
top_salary = df_final[['Name', 'Salary', 'Department']].nlargest(3, 'Salary')
print("Top 3 employees by Salary:")
print(top_salary)
print()

top_performance = df_final[['Name', 'Performance_Score', 'Department']].nlargest(3, 'Performance_Score')
print("Top 3 employees by Performance Score:")
print(top_performance)
print()

top_total = df_final[['Name', 'Total_Score', 'Performance_Category']].nlargest(3, 'Total_Score')
print("Top 3 employees by Total Score:")
print(top_total)
print()

top_efficiency = df_final[['Name', 'Salary_per_Experience', 'Experience']].nlargest(3, 'Salary_per_Experience')
print("Top 3 employees by Salary per Experience:")
print(top_efficiency)