In [1]:
import numpy as np
import pandas as pd

In [2]:
# Example-1:Create a DataFrame of students' names, marks, and grades

data = {
    'Name': ['Aarav', 'Bhavna', 'Chirag', 'Dhruvi', 'Eshan', 'Farhan', 'Garima'],
    'Marks': [92, 76, 85, 67, 88, 81, 73],
    'Grade': ['A', 'C', 'A', 'D', 'A', 'B', 'C']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Marks,Grade
0,Aarav,92,A
1,Bhavna,76,C
2,Chirag,85,A
3,Dhruvi,67,D
4,Eshan,88,A
5,Farhan,81,B
6,Garima,73,C


In [3]:
# Example-2: Filter students with marks > 80

df[df["Marks"] > 80]

Unnamed: 0,Name,Marks,Grade
0,Aarav,92,A
2,Chirag,85,A
4,Eshan,88,A
5,Farhan,81,B


In [4]:
# Example-3: Employees joined before 2020

df = pd.read_csv("data/employee.csv")
df[pd.to_datetime(df["JoinDate"]) < "2020-01-01"]

Unnamed: 0,ID,Name,Age,Gender,Department,Salary,JoinDate,Rating
1,2,Bob,28.0,Male,Marketing,52000.0,2019-03-22,4.0
2,3,Charlie,30.0,Male,HR,,2018-07-30,3.8
7,8,Hank,31.0,Male,HR,51000.0,2017-08-16,3.6
9,10,John,28.0,Male,IT,57000.0,2019-06-14,4.3


In [5]:
# Example-4: Employees with rating > 4.0

df[df["Rating"] > 4.0]

Unnamed: 0,ID,Name,Age,Gender,Department,Salary,JoinDate,Rating
0,1,Alice,25.0,Female,Sales,50000.0,2020-01-15,4.5
3,4,David,24.0,Male,Sales,48000.0,2021-05-19,4.2
4,5,Eva,27.0,Female,IT,,2022-02-11,4.7
6,7,Grace,26.0,Female,IT,56000.0,2021-12-01,4.9
8,9,Ivy,29.0,Female,Sales,50000.0,2020-09-25,4.1
9,10,John,28.0,Male,IT,57000.0,2019-06-14,4.3


In [6]:
# Example-5: Employees in IT with salary > 55000

df[(df["Department"].isin(["IT"])) & (df["Salary"] > 55000)]

Unnamed: 0,ID,Name,Age,Gender,Department,Salary,JoinDate,Rating
6,7,Grace,26.0,Female,IT,56000.0,2021-12-01,4.9
9,10,John,28.0,Male,IT,57000.0,2019-06-14,4.3


In [7]:
company_excel = pd.ExcelFile("data/company.xlsx")
company_excel.sheet_names # ['Employees', 'Departments']

employees_df = pd.read_excel("data/company.xlsx", sheet_name="Employees")
departments_df = pd.read_excel("data/company.xlsx", sheet_name="Departments")


In [8]:
# Example-6 : Count how many missing values are in each column.

employees_df.isnull().sum()

employee_id     0
name            2
age             2
gender          0
department      2
salary          0
experience      0
joining_date    0
city            1
email           0
dtype: int64

In [9]:
# Example-7 : Drop rows with any missing values.
employees_df.dropna()

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
0,101,Alice,25.0,F,HR,50000,2,2020-01-15,New York,alice@example.com
1,102,Bob,30.0,M,IT,60000,5,2019-03-10,Chicago,bob@example.com
3,104,David,45.0,M,Sales,52000,10,2015-09-01,Boston,david@example.com
6,107,Grace,33.0,F,Sales,51000,7,2017-02-14,Boston,grace@example.com
9,108,Grace,40.0,F,Sales,51000,7,2023-01-01,Chicago,grace@example.com


In [10]:
# Example-8 : Fill missing values in the age column with the average age.
employees_df.fillna({"age":employees_df["age"].mean()})

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
0,101,Alice,25.0,F,HR,50000,2,2020-01-15,New York,alice@example.com
1,102,Bob,30.0,M,IT,60000,5,2019-03-10,Chicago,bob@example.com
2,103,Charlie,35.125,M,IT,55000,4,2021-06-25,Chicago,charlie@example.com
3,104,David,45.0,M,Sales,52000,10,2015-09-01,Boston,david@example.com
4,105,Eva,28.0,F,HR,58000,-1,2022-11-30,,eva@example.com
5,106,Frank,35.125,M,Finance,62000,6,2018-07-22,New York,frank@example.com
6,107,Grace,33.0,F,Sales,51000,7,2017-02-14,Boston,grace@example.com
7,108,,40.0,F,,53000,8,2023-01-01,Chicago,grace@example.com
8,108,,40.0,F,,53000,8,2023-01-01,Chicago,grace@example.com
9,108,Grace,40.0,F,Sales,51000,7,2023-01-01,Chicago,grace@example.com


In [11]:
# Example-9 : Fill missing values in the city column using forward fill.
employees_df["city"].ffill()

0    New York
1     Chicago
2     Chicago
3      Boston
4      Boston
5    New York
6      Boston
7     Chicago
8     Chicago
9     Chicago
Name: city, dtype: object

In [12]:
# Example-10 : Drop rows where name or department is missing.
employees_df.dropna(subset=["name", "department"])

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
0,101,Alice,25.0,F,HR,50000,2,2020-01-15,New York,alice@example.com
1,102,Bob,30.0,M,IT,60000,5,2019-03-10,Chicago,bob@example.com
2,103,Charlie,,M,IT,55000,4,2021-06-25,Chicago,charlie@example.com
3,104,David,45.0,M,Sales,52000,10,2015-09-01,Boston,david@example.com
4,105,Eva,28.0,F,HR,58000,-1,2022-11-30,,eva@example.com
5,106,Frank,,M,Finance,62000,6,2018-07-22,New York,frank@example.com
6,107,Grace,33.0,F,Sales,51000,7,2017-02-14,Boston,grace@example.com
9,108,Grace,40.0,F,Sales,51000,7,2023-01-01,Chicago,grace@example.com


In [13]:
# Example-11 : Check if there are any duplicate rows in the dataset.
employees_df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
9    False
dtype: bool

In [14]:
# Example-12: Remove all duplicate rows and reset the index.
employees_df.drop_duplicates().reset_index(drop=True)

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
0,101,Alice,25.0,F,HR,50000,2,2020-01-15,New York,alice@example.com
1,102,Bob,30.0,M,IT,60000,5,2019-03-10,Chicago,bob@example.com
2,103,Charlie,,M,IT,55000,4,2021-06-25,Chicago,charlie@example.com
3,104,David,45.0,M,Sales,52000,10,2015-09-01,Boston,david@example.com
4,105,Eva,28.0,F,HR,58000,-1,2022-11-30,,eva@example.com
5,106,Frank,,M,Finance,62000,6,2018-07-22,New York,frank@example.com
6,107,Grace,33.0,F,Sales,51000,7,2017-02-14,Boston,grace@example.com
7,108,,40.0,F,,53000,8,2023-01-01,Chicago,grace@example.com
8,108,Grace,40.0,F,Sales,51000,7,2023-01-01,Chicago,grace@example.com


In [15]:
# Example:13: Identify duplicate rows based on employee_id and name.
employees_df[employees_df.duplicated(subset=["employee_id", "name"])]

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
8,108,,40.0,F,,53000,8,2023-01-01,Chicago,grace@example.com


In [16]:
# Example-14 : Keep only the first occurrence of each email in the email column.
employees_df.drop_duplicates(subset="email")

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
0,101,Alice,25.0,F,HR,50000,2,2020-01-15,New York,alice@example.com
1,102,Bob,30.0,M,IT,60000,5,2019-03-10,Chicago,bob@example.com
2,103,Charlie,,M,IT,55000,4,2021-06-25,Chicago,charlie@example.com
3,104,David,45.0,M,Sales,52000,10,2015-09-01,Boston,david@example.com
4,105,Eva,28.0,F,HR,58000,-1,2022-11-30,,eva@example.com
5,106,Frank,,M,Finance,62000,6,2018-07-22,New York,frank@example.com
6,107,Grace,33.0,F,Sales,51000,7,2017-02-14,Boston,grace@example.com


In [17]:
# Example-15 : Rename the column name to employee_name.
employees_df.rename(columns={"name":"employee_name"})

Unnamed: 0,employee_id,employee_name,age,gender,department,salary,experience,joining_date,city,email
0,101,Alice,25.0,F,HR,50000,2,2020-01-15,New York,alice@example.com
1,102,Bob,30.0,M,IT,60000,5,2019-03-10,Chicago,bob@example.com
2,103,Charlie,,M,IT,55000,4,2021-06-25,Chicago,charlie@example.com
3,104,David,45.0,M,Sales,52000,10,2015-09-01,Boston,david@example.com
4,105,Eva,28.0,F,HR,58000,-1,2022-11-30,,eva@example.com
5,106,Frank,,M,Finance,62000,6,2018-07-22,New York,frank@example.com
6,107,Grace,33.0,F,Sales,51000,7,2017-02-14,Boston,grace@example.com
7,108,,40.0,F,,53000,8,2023-01-01,Chicago,grace@example.com
8,108,,40.0,F,,53000,8,2023-01-01,Chicago,grace@example.com
9,108,Grace,40.0,F,Sales,51000,7,2023-01-01,Chicago,grace@example.com


In [18]:
# Example-16 : Replace 'M' with 'Male' and 'F' with 'Female' in the gender column.
employees_df["gender"] = employees_df["gender"].replace({"M":"Male", "F":"Female"})
employees_df

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
0,101,Alice,25.0,Female,HR,50000,2,2020-01-15,New York,alice@example.com
1,102,Bob,30.0,Male,IT,60000,5,2019-03-10,Chicago,bob@example.com
2,103,Charlie,,Male,IT,55000,4,2021-06-25,Chicago,charlie@example.com
3,104,David,45.0,Male,Sales,52000,10,2015-09-01,Boston,david@example.com
4,105,Eva,28.0,Female,HR,58000,-1,2022-11-30,,eva@example.com
5,106,Frank,,Male,Finance,62000,6,2018-07-22,New York,frank@example.com
6,107,Grace,33.0,Female,Sales,51000,7,2017-02-14,Boston,grace@example.com
7,108,,40.0,Female,,53000,8,2023-01-01,Chicago,grace@example.com
8,108,,40.0,Female,,53000,8,2023-01-01,Chicago,grace@example.com
9,108,Grace,40.0,Female,Sales,51000,7,2023-01-01,Chicago,grace@example.com


In [19]:
# Example-17 : Convert the joining_date column to datetime format.
employees_df["joining_date"] = pd.to_datetime(employees_df["joining_date"])
employees_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   employee_id   10 non-null     int64         
 1   name          8 non-null      object        
 2   age           8 non-null      float64       
 3   gender        10 non-null     object        
 4   department    8 non-null      object        
 5   salary        10 non-null     int64         
 6   experience    10 non-null     int64         
 7   joining_date  10 non-null     datetime64[ns]
 8   city          9 non-null      object        
 9   email         10 non-null     object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 932.0+ bytes


In [20]:
# Example-18 : Change the salary column's data type to float.
# employees_df.dtypes
employees_df["salary"] = employees_df["salary"].astype("float")
employees_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   employee_id   10 non-null     int64         
 1   name          8 non-null      object        
 2   age           8 non-null      float64       
 3   gender        10 non-null     object        
 4   department    8 non-null      object        
 5   salary        10 non-null     float64       
 6   experience    10 non-null     int64         
 7   joining_date  10 non-null     datetime64[ns]
 8   city          9 non-null      object        
 9   email         10 non-null     object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(5)
memory usage: 932.0+ bytes


In [21]:
# Example-19 : Replace negative values in the experience column with 0.
employees_df.loc[(employees_df["experience"] < 0), "experience"] = 0
employees_df

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
0,101,Alice,25.0,Female,HR,50000.0,2,2020-01-15,New York,alice@example.com
1,102,Bob,30.0,Male,IT,60000.0,5,2019-03-10,Chicago,bob@example.com
2,103,Charlie,,Male,IT,55000.0,4,2021-06-25,Chicago,charlie@example.com
3,104,David,45.0,Male,Sales,52000.0,10,2015-09-01,Boston,david@example.com
4,105,Eva,28.0,Female,HR,58000.0,0,2022-11-30,,eva@example.com
5,106,Frank,,Male,Finance,62000.0,6,2018-07-22,New York,frank@example.com
6,107,Grace,33.0,Female,Sales,51000.0,7,2017-02-14,Boston,grace@example.com
7,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com
8,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com
9,108,Grace,40.0,Female,Sales,51000.0,7,2023-01-01,Chicago,grace@example.com


In [22]:
# Example-20 : Sort the dataset by salary in descending order.
employees_df.sort_values("salary", ascending=False)

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
5,106,Frank,,Male,Finance,62000.0,6,2018-07-22,New York,frank@example.com
1,102,Bob,30.0,Male,IT,60000.0,5,2019-03-10,Chicago,bob@example.com
4,105,Eva,28.0,Female,HR,58000.0,0,2022-11-30,,eva@example.com
2,103,Charlie,,Male,IT,55000.0,4,2021-06-25,Chicago,charlie@example.com
8,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com
7,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com
3,104,David,45.0,Male,Sales,52000.0,10,2015-09-01,Boston,david@example.com
6,107,Grace,33.0,Female,Sales,51000.0,7,2017-02-14,Boston,grace@example.com
9,108,Grace,40.0,Female,Sales,51000.0,7,2023-01-01,Chicago,grace@example.com
0,101,Alice,25.0,Female,HR,50000.0,2,2020-01-15,New York,alice@example.com


In [23]:
# Example-21 : Filter employees whose age is above 30 and department is Sales.
employees_df.query("age > 30 and department == 'Sales'")

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
3,104,David,45.0,Male,Sales,52000.0,10,2015-09-01,Boston,david@example.com
6,107,Grace,33.0,Female,Sales,51000.0,7,2017-02-14,Boston,grace@example.com
9,108,Grace,40.0,Female,Sales,51000.0,7,2023-01-01,Chicago,grace@example.com


In [24]:
# Example-22 : Sort the data first by joining_date, then by salary.
employees_df["joining_date"] = pd.to_datetime(employees_df["joining_date"])
employees_df.sort_values(["joining_date", "salary"], ascending=[True, True], inplace=True)
employees_df

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
3,104,David,45.0,Male,Sales,52000.0,10,2015-09-01,Boston,david@example.com
6,107,Grace,33.0,Female,Sales,51000.0,7,2017-02-14,Boston,grace@example.com
5,106,Frank,,Male,Finance,62000.0,6,2018-07-22,New York,frank@example.com
1,102,Bob,30.0,Male,IT,60000.0,5,2019-03-10,Chicago,bob@example.com
0,101,Alice,25.0,Female,HR,50000.0,2,2020-01-15,New York,alice@example.com
2,103,Charlie,,Male,IT,55000.0,4,2021-06-25,Chicago,charlie@example.com
4,105,Eva,28.0,Female,HR,58000.0,0,2022-11-30,,eva@example.com
9,108,Grace,40.0,Female,Sales,51000.0,7,2023-01-01,Chicago,grace@example.com
7,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com
8,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com


In [25]:
# Example-23 : Show the top 5 employees with the highest experience.
employees_df.sort_values(["experience"], ascending=False).head(5)

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
3,104,David,45.0,Male,Sales,52000.0,10,2015-09-01,Boston,david@example.com
7,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com
8,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com
6,107,Grace,33.0,Female,Sales,51000.0,7,2017-02-14,Boston,grace@example.com
9,108,Grace,40.0,Female,Sales,51000.0,7,2023-01-01,Chicago,grace@example.com


In [26]:
# Example-24 : Use a query to find employees aged between 25 and 40.
employees_df.query("age >= 25 and age <= 40")
# employees_df[(employees_df["age"] >= 25) & (employees_df["age"] <= 40)]


Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
6,107,Grace,33.0,Female,Sales,51000.0,7,2017-02-14,Boston,grace@example.com
1,102,Bob,30.0,Male,IT,60000.0,5,2019-03-10,Chicago,bob@example.com
0,101,Alice,25.0,Female,HR,50000.0,2,2020-01-15,New York,alice@example.com
4,105,Eva,28.0,Female,HR,58000.0,0,2022-11-30,,eva@example.com
9,108,Grace,40.0,Female,Sales,51000.0,7,2023-01-01,Chicago,grace@example.com
7,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com
8,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com


In [27]:
# Example-25: Merge the Employees and Departments sheets on department ID.
# employees_df, departments_df
# employees_df.columns, departments_df.columns
pd.merge(employees_df, departments_df, left_on="department", right_on="dept_id", how="inner")

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email,dept_id,manager,location
0,104,David,45.0,Male,Sales,52000.0,10,2015-09-01,Boston,david@example.com,Sales,Monica,Boston
1,107,Grace,33.0,Female,Sales,51000.0,7,2017-02-14,Boston,grace@example.com,Sales,Monica,Boston
2,106,Frank,,Male,Finance,62000.0,6,2018-07-22,New York,frank@example.com,Finance,Chandler,New York
3,102,Bob,30.0,Male,IT,60000.0,5,2019-03-10,Chicago,bob@example.com,IT,Ross,Chicago
4,101,Alice,25.0,Female,HR,50000.0,2,2020-01-15,New York,alice@example.com,HR,Rachel,New York
5,103,Charlie,,Male,IT,55000.0,4,2021-06-25,Chicago,charlie@example.com,IT,Ross,Chicago
6,105,Eva,28.0,Female,HR,58000.0,0,2022-11-30,,eva@example.com,HR,Rachel,New York
7,108,Grace,40.0,Female,Sales,51000.0,7,2023-01-01,Chicago,grace@example.com,Sales,Monica,Boston


In [28]:
# Example-26 : Concatenate the employee dataset with itself row-wise.
pd.concat([employees_df, employees_df], ignore_index=True)


Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email
0,104,David,45.0,Male,Sales,52000.0,10,2015-09-01,Boston,david@example.com
1,107,Grace,33.0,Female,Sales,51000.0,7,2017-02-14,Boston,grace@example.com
2,106,Frank,,Male,Finance,62000.0,6,2018-07-22,New York,frank@example.com
3,102,Bob,30.0,Male,IT,60000.0,5,2019-03-10,Chicago,bob@example.com
4,101,Alice,25.0,Female,HR,50000.0,2,2020-01-15,New York,alice@example.com
5,103,Charlie,,Male,IT,55000.0,4,2021-06-25,Chicago,charlie@example.com
6,105,Eva,28.0,Female,HR,58000.0,0,2022-11-30,,eva@example.com
7,108,Grace,40.0,Female,Sales,51000.0,7,2023-01-01,Chicago,grace@example.com
8,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com
9,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com


In [29]:
# Example-27 : Perform a left join between two datasets using employee_id.
performance_df = pd.DataFrame({
    'employee_id': [101, 102, 105, 106],
    'performance_rating': ['A', 'B', 'A', 'C']
})
pd.merge(employees_df, performance_df, on="employee_id", how="left")

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,joining_date,city,email,performance_rating
0,104,David,45.0,Male,Sales,52000.0,10,2015-09-01,Boston,david@example.com,
1,107,Grace,33.0,Female,Sales,51000.0,7,2017-02-14,Boston,grace@example.com,
2,106,Frank,,Male,Finance,62000.0,6,2018-07-22,New York,frank@example.com,C
3,102,Bob,30.0,Male,IT,60000.0,5,2019-03-10,Chicago,bob@example.com,B
4,101,Alice,25.0,Female,HR,50000.0,2,2020-01-15,New York,alice@example.com,A
5,103,Charlie,,Male,IT,55000.0,4,2021-06-25,Chicago,charlie@example.com,
6,105,Eva,28.0,Female,HR,58000.0,0,2022-11-30,,eva@example.com,A
7,108,Grace,40.0,Female,Sales,51000.0,7,2023-01-01,Chicago,grace@example.com,
8,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com,
9,108,,40.0,Female,,53000.0,8,2023-01-01,Chicago,grace@example.com,


In [30]:
# Example-28 : Add the department manager to each employee using a merge.
employees_df = pd.DataFrame({
    'employee_id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'department': ['HR', 'Sales', 'IT']
})
departments_df = pd.DataFrame({
    'dept_id': ['HR', 'Sales', 'IT'],
    'manager': ['Karen', 'Steve', 'Raj']
})

merged_df = pd.merge(employees_df, departments_df, left_on="department", right_on="dept_id", how="left")
merged_df.drop(columns="dept_id", inplace=True)
merged_df

Unnamed: 0,employee_id,name,department,manager
0,1,Alice,HR,Karen
1,2,Bob,Sales,Steve
2,3,Charlie,IT,Raj


In [31]:
# Example-29 : Merge employee, department, and location data into one DataFrame.
employees_df = pd.DataFrame({
    'employee_id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'department': ['HR', 'Sales', 'IT', 'HR']
})
departments_df = pd.DataFrame({
    'dept_id': ['HR', 'Sales', 'IT'],
    'manager': ['Karen', 'Steve', 'Raj'],
    'location_id': [101, 102, 103]
})
locations_df = pd.DataFrame({
    'location_id': [101, 102, 103],
    'location_name': ['New York', 'Los Angeles', 'Bangalore']
})

emp_dept_df = pd.merge(employees_df, departments_df, left_on="department", right_on="dept_id", how="left")
emp_dept_df.drop(columns="dept_id", inplace=True)
emp_dept_loc_df = pd.merge(emp_dept_df, locations_df, left_on="location_id", right_on="location_id", how="left")
emp_dept_loc_df.drop(columns="location_id", inplace=True)
emp_dept_loc_df

Unnamed: 0,employee_id,name,department,manager,location_name
0,1,Alice,HR,Karen,New York
1,2,Bob,Sales,Steve,Los Angeles
2,3,Charlie,IT,Raj,Bangalore
3,4,Diana,HR,Karen,New York


In [32]:
# Example-30 : Find the average salary per department.
employees_df = pd.DataFrame({
    'employee_id': [1, 2, 3, 4, 5, 6],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Ethan', 'Fiona'],
    'age': [28, 34, 41, 29, 36, 30],
    'gender': ['F', 'M', 'M', 'F', 'M', 'F'],
    'department': ['HR', 'Sales', 'IT', 'HR', 'Sales', 'IT'],
    'salary': [50000, 60000, 75000, 52000, 62000, 77000],
    'experience': [3, 5, 10, 4, 6, 9],
    'city': ['New York', 'Los Angeles', 'Bangalore', 'New York', 'Los Angeles', 'Bangalore']
})
employees_df
# employees_df.groupby("department")["salary"].mean()

Unnamed: 0,employee_id,name,age,gender,department,salary,experience,city
0,1,Alice,28,F,HR,50000,3,New York
1,2,Bob,34,M,Sales,60000,5,Los Angeles
2,3,Charlie,41,M,IT,75000,10,Bangalore
3,4,Diana,29,F,HR,52000,4,New York
4,5,Ethan,36,M,Sales,62000,6,Los Angeles
5,6,Fiona,30,F,IT,77000,9,Bangalore


In [33]:
# Example-31 : Calculate total experience grouped by city.
employees_df.groupby("city")["experience"].sum()

city
Bangalore      19
Los Angeles    11
New York        7
Name: experience, dtype: int64

In [34]:
# Example-32 : Count how many employees are in each city.
employees_df.groupby("city")["employee_id"].count()

city
Bangalore      2
Los Angeles    2
New York       2
Name: employee_id, dtype: int64

In [35]:
# Example-33 : Get the maximum age for each department and gender combination.
employees_df.groupby(["department","gender"])["age"].max()

department  gender
HR          F         29
IT          F         30
            M         41
Sales       M         36
Name: age, dtype: int64

In [36]:
# Example-34 : Create a summary showing total salary and average experience by department.
employees_df.groupby("department").agg({"salary":"sum", "experience":"mean"})

Unnamed: 0_level_0,salary,experience
department,Unnamed: 1_level_1,Unnamed: 2_level_1
HR,102000,3.5
IT,152000,9.5
Sales,122000,5.5
