In [1]:
import pandas as pd
import numpy as np

In [2]:
# Create the sample employee DataFrame, written one row per person.
employee_rows = [
    ("Alice", 25, "New York", 70000),
    ("Bob", 30, "Los Angeles", 80000),
    ("Charlie", 35, "Chicago", 90000),
    ("Diana", 28, "Houston", 65000),
]
df = pd.DataFrame(employee_rows, columns=["Name", "Age", "City", "Salary"])

df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,70000
1,Bob,30,Los Angeles,80000
2,Charlie,35,Chicago,90000
3,Diana,28,Houston,65000


In [3]:
# Show the first two rows of the DataFrame.
df.head(2)

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,70000
1,Bob,30,Los Angeles,80000


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; here that is Age and Salary.
df.describe()

Unnamed: 0,Age,Salary
count,4.0,4.0
mean,29.5,76250.0
std,4.203173,11086.778913
min,25.0,65000.0
25%,27.25,68750.0
50%,29.0,75000.0
75%,31.25,82500.0
max,35.0,90000.0


In [5]:
# Structural overview: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
 3   Salary  4 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 260.0+ bytes


In [6]:
# Build a separate DataFrame holding only the Name and Salary columns.
new_df = df.loc[:, ["Name", "Salary"]]
new_df

Unnamed: 0,Name,Salary
0,Alice,70000
1,Bob,80000
2,Charlie,90000
3,Diana,65000


In [7]:
# Keep only the rows whose Salary is strictly greater than 70,000.
df.loc[df["Salary"] > 70000]

Unnamed: 0,Name,Age,City,Salary
1,Bob,30,Los Angeles,80000
2,Charlie,35,Chicago,90000


In [8]:
# Keep only the rows whose Age is strictly greater than 30.
df.loc[df["Age"] > 30]

Unnamed: 0,Name,Age,City,Salary
2,Charlie,35,Chicago,90000


In [9]:
# Handling missing data
#
# A second sample table with one np.nan gap in each of Age, City and Salary.
# Note: this rebinds `df`, so every cell below works on this five-row frame.
data = {
    "Name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
    "Age": [25, np.nan, 35, 28, 45],
    "City": ["New York", "Los Angeles", np.nan, "Houston", "Phoenix"],
    "Salary": [70000, 80000, 90000, np.nan, 75000],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,New York,70000.0
1,Bob,,Los Angeles,80000.0
2,Charlie,35.0,,90000.0
3,Diana,28.0,Houston,
4,Eve,45.0,Phoenix,75000.0


In [10]:
# Boolean mask of the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,Name,Age,City,Salary
0,False,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,False,False,True
4,False,False,False,False


In [11]:
# Count the missing values in each column.
df.isna().sum()

Name      0
Age       1
City      1
Salary    1
dtype: int64

In [12]:
# Fill the missing Salary entries with the mean of the Salary column.
mean_salary = df["Salary"].mean()
df["Salary"] = df["Salary"].fillna(value=mean_salary)
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,New York,70000.0
1,Bob,,Los Angeles,80000.0
2,Charlie,35.0,,90000.0
3,Diana,28.0,Houston,78750.0
4,Eve,45.0,Phoenix,75000.0


In [13]:
# Show the rows that have no missing value in any column.
# The result is only displayed, not assigned, so `df` itself is left unchanged.
df.dropna(axis=0, how="any")

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,New York,70000.0
3,Diana,28.0,Houston,78750.0
4,Eve,45.0,Phoenix,75000.0


In [14]:
# Add a Bonus column: each employee's bonus is 10% of their salary.
BONUS_RATE = 0.10

df["Bonus"] = df["Salary"] * BONUS_RATE
df

Unnamed: 0,Name,Age,City,Salary,Bonus
0,Alice,25.0,New York,70000.0,7000.0
1,Bob,,Los Angeles,80000.0,8000.0
2,Charlie,35.0,,90000.0,9000.0
3,Diana,28.0,Houston,78750.0,7875.0
4,Eve,45.0,Phoenix,75000.0,7500.0


In [15]:
# Add a Total Compensation column: Salary plus Bonus, row by row.

df["Total Compensation"] = df["Salary"] + df["Bonus"]
df

Unnamed: 0,Name,Age,City,Salary,Bonus,Total Compensation
0,Alice,25.0,New York,70000.0,7000.0,77000.0
1,Bob,,Los Angeles,80000.0,8000.0,88000.0
2,Charlie,35.0,,90000.0,9000.0,99000.0
3,Diana,28.0,Houston,78750.0,7875.0,86625.0
4,Eve,45.0,Phoenix,75000.0,7500.0,82500.0


In [16]:
# Rename every "New York" entry in the City column to "NYC".
df["City"] = df["City"].replace(to_replace="New York", value="NYC")
df

Unnamed: 0,Name,Age,City,Salary,Bonus,Total Compensation
0,Alice,25.0,NYC,70000.0,7000.0,77000.0
1,Bob,,Los Angeles,80000.0,8000.0,88000.0
2,Charlie,35.0,,90000.0,9000.0,99000.0
3,Diana,28.0,Houston,78750.0,7875.0,86625.0
4,Eve,45.0,Phoenix,75000.0,7500.0,82500.0


In [17]:
# Add a new column Department to the DataFrame, where:
#   - Alice, Bob, and Eve are in the "HR" department.
#   - Charlie and Diana are in the "IT" department.

def department(Name):
    """Return the department label for an employee name.

    Parameters
    ----------
    Name
        Value to look up; compared for exact (case-sensitive) equality
        against the known employee names.

    Returns
    -------
    str or None
        'HR' for Alice, Bob or Eve; 'IT' for Charlie or Diana;
        None for any other value.
    """
    if Name in ("Alice", "Bob", "Eve"):
        return 'HR'
    if Name in ("Charlie", "Diana"):
        return 'IT'
    # Unknown names fall through without a department.
    return None

# Look up each employee's department from their name, element by element.
df["Department"] = df["Name"].map(department)
df

Unnamed: 0,Name,Age,City,Salary,Bonus,Total Compensation,Department
0,Alice,25.0,NYC,70000.0,7000.0,77000.0,HR
1,Bob,,Los Angeles,80000.0,8000.0,88000.0,HR
2,Charlie,35.0,,90000.0,9000.0,99000.0,IT
3,Diana,28.0,Houston,78750.0,7875.0,86625.0,IT
4,Eve,45.0,Phoenix,75000.0,7500.0,82500.0,HR


In [18]:
# Average salary within each department.
salary_by_department = df.groupby("Department")["Salary"]
salary_by_department.mean()

Department
HR    75000.0
IT    84375.0
Name: Salary, dtype: float64

In [19]:
# Total Salary and total Bonus paid out per department.
totals_by_department = df.groupby("Department")[["Salary", "Bonus"]].sum()
totals_by_department

Unnamed: 0_level_0,Salary,Bonus
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
HR,225000.0,22500.0
IT,168750.0,16875.0


In [20]:
# Number of employees in each department (one count per distinct Department value).
df["Department"].value_counts()

Department
HR    3
IT    2
Name: count, dtype: int64