In [1]:
import pandas as pd

# 1. Create a DataFrame

In [2]:
# Column-oriented employee records: one list per column, all of equal length.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    Age=[24, 27, 22, 32, 29, 40],
    Department=['HR', 'IT', 'Finance', 'IT', 'HR', 'Finance'],
    Salary=[40000, 50000, 45000, 60000, 52000, 70000],
    Joining_Year=[2021, 2020, 2022, 2019, 2021, 2018],
)

# Each dict key becomes a column; rows get the default 0..n-1 index.
df = pd.DataFrame(data)

print("Original DataFrame:\n", df)

Original DataFrame:
       Name  Age Department  Salary  Joining_Year
0    Alice   24         HR   40000          2021
1      Bob   27         IT   50000          2020
2  Charlie   22    Finance   45000          2022
3    David   32         IT   60000          2019
4      Eva   29         HR   52000          2021
5    Frank   40    Finance   70000          2018


# 2. Exploring Data

In [3]:
# Preview the top of the frame (positional slice of the first five rows).
first_rows = df.iloc[:5]
print("\nFirst 5 rows:\n", first_rows)


First 5 rows:
       Name  Age Department  Salary  Joining_Year
0    Alice   24         HR   40000          2021
1      Bob   27         IT   50000          2020
2  Charlie   22    Finance   45000          2022
3    David   32         IT   60000          2019
4      Eva   29         HR   52000          2021


In [4]:
# Preview the bottom of the frame (positional slice of the last three rows).
last_rows = df.iloc[-3:]
print("\nLast 3 rows:\n", last_rows)


Last 3 rows:
     Name  Age Department  Salary  Joining_Year
3  David   32         IT   60000          2019
4    Eva   29         HR   52000          2021
5  Frank   40    Finance   70000          2018


In [5]:
# Shape is reported as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print("\nShape of DataFrame:", (n_rows, n_cols))


Shape of DataFrame: (6, 5)


In [6]:
# The column labels are exposed as a pandas Index object.
column_index = df.columns
print("\nColumn Names:", column_index)


Column Names: Index(['Name', 'Age', 'Department', 'Salary', 'Joining_Year'], dtype='object')


In [7]:
# One dtype per column, returned as a Series indexed by column name.
column_dtypes = df.dtypes
print("\nData Types:\n", column_dtypes)


Data Types:
 Name            object
Age              int64
Department      object
Salary           int64
Joining_Year     int64
dtype: object


In [8]:
# Print a label first; DataFrame.info() then writes its own report to stdout
# (it returns None, so it is called as a statement rather than printed).
info_label = "\nInfo:\n"
print(info_label)
df.info()


Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          6 non-null      object
 1   Age           6 non-null      int64 
 2   Department    6 non-null      object
 3   Salary        6 non-null      int64 
 4   Joining_Year  6 non-null      int64 
dtypes: int64(3), object(2)
memory usage: 372.0+ bytes


In [9]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns.
summary_stats = df.describe()
print("\nSummary Statistics:\n", summary_stats)


Summary Statistics:
              Age        Salary  Joining_Year
count   6.000000      6.000000      6.000000
mean   29.000000  52833.333333   2020.166667
std     6.449806  10778.064143      1.471960
min    22.000000  40000.000000   2018.000000
25%    24.750000  46250.000000   2019.250000
50%    28.000000  51000.000000   2020.500000
75%    31.250000  58000.000000   2021.000000
max    40.000000  70000.000000   2022.000000


# 3. Selecting Data

In [10]:
# Select a single column by label; the result is a Series.
age_column = df.loc[:, 'Age']
print("\nAge column:\n", age_column)


Age column:
 0    24
1    27
2    22
3    32
4    29
5    40
Name: Age, dtype: int64


In [11]:
# Select several columns by label; a list of labels yields a DataFrame.
name_and_salary = df.loc[:, ['Name', 'Salary']]
print("\nName and Salary:\n", name_and_salary)


Name and Salary:
       Name  Salary
0    Alice   40000
1      Bob   50000
2  Charlie   45000
3    David   60000
4      Eva   52000
5    Frank   70000


In [12]:
# Positional row access: row 0 across all columns, returned as a Series.
first_row = df.iloc[0, :]
print("\nFirst row:\n", first_row)


First row:
 Name            Alice
Age                24
Department         HR
Salary          40000
Joining_Year     2021
Name: 0, dtype: object


In [13]:
# Boolean mask on the Name column picks out the matching row(s).
is_david = df['Name'].eq('David')
print("\nEmployee David:\n", df[is_david])


Employee David:
     Name  Age Department  Salary  Joining_Year
3  David   32         IT   60000          2019


# 4. Filtering Rows

In [14]:
# Keep only the rows whose Department is exactly 'IT'.
in_it = df['Department'].eq('IT')
print("\nIT Department:\n", df.loc[in_it])


IT Department:
     Name  Age Department  Salary  Joining_Year
1    Bob   27         IT   50000          2020
3  David   32         IT   60000          2019


In [15]:
# Strictly greater than 50000, so an employee earning exactly 50000 is excluded.
high_salary = df['Salary'].gt(50000)
print("\nHigh Salary Employees:\n", df.loc[high_salary])


High Salary Employees:
     Name  Age Department  Salary  Joining_Year
3  David   32         IT   60000          2019
4    Eva   29         HR   52000          2021
5  Frank   40    Finance   70000          2018


In [16]:
# Combine two row masks element-wise with & (both conditions must hold).
in_it_dept = df['Department'].eq('IT')
under_30 = df['Age'].lt(30)
print("\nIT with Age < 30:\n", df[in_it_dept & under_30])


IT with Age < 30:
   Name  Age Department  Salary  Joining_Year
1  Bob   27         IT   50000          2020


# 5. Adding / Modifying Columns

In [17]:
# Bonus is 10% of Salary, stored as a new column on df.
bonus_rate = 0.10
df['Bonus'] = df['Salary'].mul(bonus_rate)

# Total pay is the base salary plus the bonus computed above.
df['Total_Salary'] = df['Salary'].add(df['Bonus'])

print("\nDataFrame with Bonus:\n", df)



DataFrame with Bonus:
       Name  Age Department  Salary  Joining_Year   Bonus  Total_Salary
0    Alice   24         HR   40000          2021  4000.0       44000.0
1      Bob   27         IT   50000          2020  5000.0       55000.0
2  Charlie   22    Finance   45000          2022  4500.0       49500.0
3    David   32         IT   60000          2019  6000.0       66000.0
4      Eva   29         HR   52000          2021  5200.0       57200.0
5    Frank   40    Finance   70000          2018  7000.0       77000.0


# 6. Grouping & Aggregation

In [18]:
# Group the Salary values by each row's Department and average within each group.
avg_salary_by_dept = df['Salary'].groupby(df['Department']).mean()
print("\nAverage Salary by Department:\n", avg_salary_by_dept)


Average Salary by Department:
 Department
Finance    57500.0
HR         46000.0
IT         55000.0
Name: Salary, dtype: float64


In [19]:
# Several aggregations at once: each name in the list becomes a column of the result.
salary_by_dept = df.groupby('Department')['Salary']
salary_stats = salary_by_dept.agg(['mean', 'min', 'max', 'count'])
print("\nSalary Stats by Department:\n", salary_stats)


Salary Stats by Department:
                mean    min    max  count
Department                              
Finance     57500.0  45000  70000      2
HR          46000.0  40000  52000      2
IT          55000.0  50000  60000      2


# 7. Sorting

In [20]:
# Highest salary first; sort_values returns a new frame and leaves df's order untouched.
by_salary_desc = df.sort_values('Salary', ascending=False)
print("\nSorted by Salary:\n", by_salary_desc)


Sorted by Salary:
       Name  Age Department  Salary  Joining_Year   Bonus  Total_Salary
5    Frank   40    Finance   70000          2018  7000.0       77000.0
3    David   32         IT   60000          2019  6000.0       66000.0
4      Eva   29         HR   52000          2021  5200.0       57200.0
1      Bob   27         IT   50000          2020  5000.0       55000.0
2  Charlie   22    Finance   45000          2022  4500.0       49500.0
0    Alice   24         HR   40000          2021  4000.0       44000.0


In [21]:
# Multi-key sort: Department is the primary key, Age breaks ties within a department.
sort_keys = ['Department', 'Age']
print("\nSort by Department then Age:\n", df.sort_values(sort_keys))


Sort by Department then Age:
       Name  Age Department  Salary  Joining_Year   Bonus  Total_Salary
2  Charlie   22    Finance   45000          2022  4500.0       49500.0
5    Frank   40    Finance   70000          2018  7000.0       77000.0
0    Alice   24         HR   40000          2021  4000.0       44000.0
4      Eva   29         HR   52000          2021  5200.0       57200.0
1      Bob   27         IT   50000          2020  5000.0       55000.0
3    David   32         IT   60000          2019  6000.0       66000.0


# 8. Handling Missing Data

In [22]:
# Introduce a missing value for the demo.
# Salary was created as an int64 column, which cannot represent NaN. Cast it to
# float64 explicitly before assigning, instead of relying on pandas to upcast
# the column silently: that implicit upcast is deprecated in recent pandas
# releases (FutureWarning) and is slated to become an error.
df['Salary'] = df['Salary'].astype('float64')
df.loc[2, 'Salary'] = float('nan')

In [23]:
# Show the frame after the NaN was inserted at row 2. Salary is now rendered
# with a decimal point because a column holding NaN is stored as float.
print("\nWith Missing Value:\n", df)


With Missing Value:
       Name  Age Department   Salary  Joining_Year   Bonus  Total_Salary
0    Alice   24         HR  40000.0          2021  4000.0       44000.0
1      Bob   27         IT  50000.0          2020  5000.0       55000.0
2  Charlie   22    Finance      NaN          2022  4500.0       49500.0
3    David   32         IT  60000.0          2019  6000.0       66000.0
4      Eva   29         HR  52000.0          2021  5200.0       57200.0
5    Frank   40    Finance  70000.0          2018  7000.0       77000.0


In [24]:
# Drop every row that has at least one missing value.
# dropna returns a new frame; df itself still contains the NaN row afterwards.
complete_rows = df.dropna(axis=0, how='any')
print("\nDrop rows with NaN:\n", complete_rows)


Drop rows with NaN:
     Name  Age Department   Salary  Joining_Year   Bonus  Total_Salary
0  Alice   24         HR  40000.0          2021  4000.0       44000.0
1    Bob   27         IT  50000.0          2020  5000.0       55000.0
3  David   32         IT  60000.0          2019  6000.0       66000.0
4    Eva   29         HR  52000.0          2021  5200.0       57200.0
5  Frank   40    Finance  70000.0          2018  7000.0       77000.0


In [25]:
# Impute missing salaries with the mean of the observed (non-NaN) salaries.
# NOTE: 'Bonus' and 'Total_Salary' were derived from Salary earlier and are not
# recomputed here, so they still reflect the pre-imputation value for that row.
mean_salary = df['Salary'].mean()
df['Salary'] = df['Salary'].fillna(mean_salary)
print("\nFill NaN with mean Salary:\n", df)


Fill NaN with mean Salary:
       Name  Age Department   Salary  Joining_Year   Bonus  Total_Salary
0    Alice   24         HR  40000.0          2021  4000.0       44000.0
1      Bob   27         IT  50000.0          2020  5000.0       55000.0
2  Charlie   22    Finance  54400.0          2022  4500.0       49500.0
3    David   32         IT  60000.0          2019  6000.0       66000.0
4      Eva   29         HR  52000.0          2021  5200.0       57200.0
5    Frank   40    Finance  70000.0          2018  7000.0       77000.0


# 9. Applying Functions

In [26]:
def age_group(age):
    """Return 'Young' for ages below 30, otherwise 'Senior'."""
    if age < 30:
        return 'Young'
    return 'Senior'


# Apply the custom function element-wise to the Age column.
df['Age_Group'] = df['Age'].apply(age_group)
print("\nWith Age Group:\n", df)


With Age Group:
       Name  Age Department   Salary  Joining_Year   Bonus  Total_Salary  \
0    Alice   24         HR  40000.0          2021  4000.0       44000.0   
1      Bob   27         IT  50000.0          2020  5000.0       55000.0   
2  Charlie   22    Finance  54400.0          2022  4500.0       49500.0   
3    David   32         IT  60000.0          2019  6000.0       66000.0   
4      Eva   29         HR  52000.0          2021  5200.0       57200.0   
5    Frank   40    Finance  70000.0          2018  7000.0       77000.0   

  Age_Group  
0     Young  
1     Young  
2     Young  
3    Senior  
4     Young  
5    Senior  


# 10. Merging & Joining DataFrames

In [27]:
# Lookup table with one row per department and the manager responsible for it.
dept_info = pd.DataFrame(
    [
        ['HR', 'John'],
        ['IT', 'Sara'],
        ['Finance', 'Mike'],
    ],
    columns=['Department', 'Manager'],
)

In [28]:
# A bare expression on the last line of a cell is displayed by the notebook
# using the object's rich representation (no print() needed).
dept_info

Unnamed: 0,Department,Manager
0,HR,John
1,IT,Sara
2,Finance,Mike


In [29]:
# Attach each employee's manager by looking up their Department in dept_info.
# how='left' keeps every employee row, even for a department missing from
# dept_info (Manager would be NaN for such rows).
# validate='many_to_one' makes the merge raise if dept_info ever contains a
# duplicated Department, instead of silently duplicating employee rows.
merged_df = pd.merge(
    df,
    dept_info,
    on='Department',
    how='left',
    validate='many_to_one',
)
print("\nMerged DataFrame:\n", merged_df)


Merged DataFrame:
       Name  Age Department   Salary  Joining_Year   Bonus  Total_Salary  \
0    Alice   24         HR  40000.0          2021  4000.0       44000.0   
1      Bob   27         IT  50000.0          2020  5000.0       55000.0   
2  Charlie   22    Finance  54400.0          2022  4500.0       49500.0   
3    David   32         IT  60000.0          2019  6000.0       66000.0   
4      Eva   29         HR  52000.0          2021  5200.0       57200.0   
5    Frank   40    Finance  70000.0          2018  7000.0       77000.0   

  Age_Group Manager  
0     Young    John  
1     Young    Sara  
2     Young    Mike  
3    Senior    Sara  
4     Young    John  
5    Senior    Mike  


# 11. Pivot Tables

In [30]:
# Mean Salary for every (Department, Age_Group) pair; combinations with no
# employees show up as NaN in the resulting table.
pivot = df.pivot_table(
    index='Department',
    columns='Age_Group',
    values='Salary',
    aggfunc='mean',
)
print("\nPivot Table:\n", pivot)


Pivot Table:
 Age_Group    Senior    Young
Department                  
Finance     70000.0  54400.0
HR              NaN  46000.0
IT          60000.0  50000.0
