In [1]:
import pandas as pd
import numpy as np

### DataFrame Creation

In [4]:
# Create a DataFrame using a dictionary with columns: Name, Age, and City.

data = {
    "Name" : ["Shiv","Krishna","Krutarth","Ram"],
    "Age" : [19,21,15,25],
    "City" : ['Ahmedabad',"Surat","Vadodara","Mumbai"]
}

df = pd.DataFrame(data)
print("DataFrame using a dictionary :-> \n",df)
print()


# Create a DataFrame from a list of lists with custom column names.

lst = [
    ["Rahul",24,"Jaipur"],
    ["Kumar",18,"Delhi"],
    ["Sourav",23,"Utrakhand"],
    ["Kartik", 30,"Nasik"]
]

colums = ["Name","Age","City"]

lst_data = pd.DataFrame(data,columns=colums)
print("DataFrame Using List :-> \n",lst_data)
print()

# Convert a NumPy array into a DataFrame.

arr = np.array([[10,20,30],[25,35,45],[67,78,89]])

np_data = pd.DataFrame(arr, columns=['Column1','Column2','Column3'])
print("DataFrame using Array :-> \n",np_data)
print()

DataFrame using a dictionary :-> 
        Name  Age       City
0      Shiv   19  Ahmedabad
1   Krishna   21      Surat
2  Krutarth   15   Vadodara
3       Ram   25     Mumbai

DataFrame Using List :-> 
        Name  Age       City
0      Shiv   19  Ahmedabad
1   Krishna   21      Surat
2  Krutarth   15   Vadodara
3       Ram   25     Mumbai

DataFrame using Array :-> 
    Column1  Column2  Column3
0       10       20       30
1       25       35       45
2       67       78       89



### Data Loading

In [18]:
# Load a CSV file called Employee_data.csv into a DataFrame.
df = pd.read_csv('employee_data.csv')
print(df)
print()

# Load only the first 10 rows of a CSV file.

rows_display = df.head(10)
print("First 10 Rows :-> \n",rows_display)
print()


    ID     Name  Age Department  Salary
0    1    Alice   24         HR   45000
1    2      Bob   29         IT   60000
2    3  Charlie   31    Finance   72000
3    4    David   26  Marketing   55000
4    5      Eva   23      Sales   48000
5    6    Frank   35         IT   80000
6    7    Grace   27         HR   50000
7    8    Henry   32    Finance   70000
8    9    Irene   28  Marketing   58000
9   10     Jack   25      Sales   47500
10  11    Karen   30         IT   62000
11  12     Liam   34         HR   52000
12  13     Maya   22      Sales   44000
13  14   Nathan   33    Finance   75000
14  15   Olivia   29  Marketing   59000

First 10 Rows :-> 
    ID     Name  Age Department  Salary
0   1    Alice   24         HR   45000
1   2      Bob   29         IT   60000
2   3  Charlie   31    Finance   72000
3   4    David   26  Marketing   55000
4   5      Eva   23      Sales   48000
5   6    Frank   35         IT   80000
6   7    Grace   27         HR   50000
7   8    Henry   32    Fina

### Data Inspection

In [24]:
# Display the first 5 and last 5 rows of a DataFrame.
print("First 5 Rows :-> \n",df.head(5))
print("Last 5 Rows :-> \n",df.tail(5))
print()

# Get the number of rows and columns in a DataFrame.
rows,columns = df.shape
print("Number of Rows :-> ",rows)
print("Number of Columns :-> ",columns)
print()

# Get the data types of each column.
print("Data Type of each columns :-> \n",df.dtypes)
print()

# Get a summary of statistics for numeric columns.
print("Descriptive Statistic :-> \n",df.describe())
print()

First 5 Rows :-> 
    ID     Name  Age Department  Salary
0   1    Alice   24         HR   45000
1   2      Bob   29         IT   60000
2   3  Charlie   31    Finance   72000
3   4    David   26  Marketing   55000
4   5      Eva   23      Sales   48000
Last 5 Rows :-> 
     ID    Name  Age Department  Salary
10  11   Karen   30         IT   62000
11  12    Liam   34         HR   52000
12  13    Maya   22      Sales   44000
13  14  Nathan   33    Finance   75000
14  15  Olivia   29  Marketing   59000

Number of Rows :->  15
Number of Columns :->  5

Data Type of each columns :-> 
 ID             int64
Name          object
Age            int64
Department    object
Salary         int64
dtype: object

Descriptive Statistic :-> 
               ID        Age       Salary
count  15.000000  15.000000     15.00000
mean    8.000000  28.533333  58500.00000
std     4.472136   4.033196  11384.51329
min     1.000000  22.000000  44000.00000
25%     4.500000  25.500000  49000.00000
50%     8.000000  2

### Selecting Data

In [32]:
# Select a single column from a DataFrame.
name_column = df['Name']
print("Single Column Data :-> \n",name_column)
print()

# Select multiple columns from a DataFrame.
multi_column = df[["Name","Department"]]
print("Multiple Column Data :-> \n",multi_column)
print()



Single Column Data :-> 
 0       Alice
1         Bob
2     Charlie
3       David
4         Eva
5       Frank
6       Grace
7       Henry
8       Irene
9        Jack
10      Karen
11       Liam
12       Maya
13     Nathan
14     Olivia
Name: Name, dtype: object

Multiple Column Data :-> 
        Name Department
0     Alice         HR
1       Bob         IT
2   Charlie    Finance
3     David  Marketing
4       Eva      Sales
5     Frank         IT
6     Grace         HR
7     Henry    Finance
8     Irene  Marketing
9      Jack      Sales
10    Karen         IT
11     Liam         HR
12     Maya      Sales
13   Nathan    Finance
14   Olivia  Marketing



In [None]:
# Select rows where a column value meets a condition (e.g., Age > 30).
single_condition = df[df["Salary"] < 50000]
print("Name of Employee whose salary < 50K :->\n", single_condition)
print()

multi_condition = df[(df["Department"] == "IT") & (df["Salary"] > 50000)]
print("Employee Department is IT and Salary > 50000 ->\n",multi_condition)
print()

# Select rows using .iloc[]S.

#  .iloc[] = Index-based selection
rows_data = df.iloc[0]
print("Display First row data :-> \n",rows_data)
print()

multi_row_data = df.iloc[:4]
print("Display first 4 Rows Data :-> \n",multi_row_data)
print()



Name of Employee whose salary < 50K :->
        ID  Age Department  Salary
Name                             
Alice   1   24         HR   45000
Eva     5   23      Sales   48000
Jack   10   25      Sales   47500
Maya   13   22      Sales   44000

Employee Department is IT and Salary > 50000 ->
        ID  Age Department  Salary
Name                             
Bob     2   29         IT   60000
Frank   6   35         IT   80000
Karen  11   30         IT   62000

Display First row data :-> 
 ID                1
Age              24
Department       HR
Salary        45000
Name: Alice, dtype: object

Display first 4 Rows Data :-> 
          ID  Age Department  Salary
Name                               
Alice     1   24         HR   45000
Bob       2   29         IT   60000
Charlie   3   31    Finance   72000
David     4   26  Marketing   55000



## Data Cleaning

In [68]:
df = pd.DataFrame({
    "Name" : ['Shiv','Ram',None,'Ghanshyam','Krishna'],
    "Age" : [28,34,None,30,29],
    "Salary" : [50000,60000,52000,None,49000],
    "Performance Score" : [85,None,90,92,88]
})


# Find and count missing values in each column.
find_value = df.isnull()
print("Finding Missing Value :-> \n",find_value)
print("Count of Missing Value :-> \n",find_value.sum())
print()

# Drop rows with missing values.
drop_row = df.dropna(axis=0 , inplace=False)
print("Missing Value Rows Drop :-> \n",drop_row)
print()


Finding Missing Value :-> 
     Name    Age  Salary  Performance Score
0  False  False   False              False
1  False  False   False               True
2   True   True   False              False
3  False  False    True              False
4  False  False   False              False
Count of Missing Value :-> 
 Name                 1
Age                  1
Salary               1
Performance Score    1
dtype: int64

Missing Value Rows Drop :-> 
       Name   Age   Salary  Performance Score
0     Shiv  28.0  50000.0               85.0
4  Krishna  29.0  49000.0               88.0



In [80]:

# Fill missing values with a specific value or method (like forward fill).
fill_name = df["Name"].fillna("XYZ")
print("Fill Name :-> \n",fill_name)

# fill_data = df[["Age","Salary","Performance Score"]].fillna(df["Age"].mean())
fill_data = df[["Age","Salary","Performance Score"]].fillna({
    "Age" : df["Age"].mean(),
    "Salary" : df["Salary"].mean(),
    "Performance Score" : df["Performance Score"].mean()
})
print(fill_data)
print()

# Remove duplicate rows.
df = pd.DataFrame({
    "Name" : ['Shiv','Ram','Krishna','Ghanshyam','Krishna'],
    "Age" : [28,34,28,30,29],
    "Salary" : [50000,60000,52000,45000,49000],
    "Performance Score" : [85,82,90,92,88]
})


duplicate_remove = df.drop_duplicates(subset=["Name"])
print("DataFrame after removing duplicates:\n", duplicate_remove)
print()

duplicate_remove_multiple = df.drop_duplicates(subset=["Name","Age"]) # here it does not effect on output beacuse the compile understand hat both are unique value.
print("DataFrame after removing duplicates:\n", duplicate_remove)
print()

Fill Name :-> 
 0         Shiv
1          Ram
2      Krishna
3    Ghanshyam
4      Krishna
Name: Name, dtype: object
   Age  Salary  Performance Score
0   28   50000                 85
1   34   60000                 82
2   28   52000                 90
3   30   45000                 92
4   29   49000                 88

DataFrame after removing duplicates:
         Name  Age  Salary  Performance Score
0       Shiv   28   50000                 85
1        Ram   34   60000                 82
2    Krishna   28   52000                 90
3  Ghanshyam   30   45000                 92

DataFrame after removing duplicates:
         Name  Age  Salary  Performance Score
0       Shiv   28   50000                 85
1        Ram   34   60000                 82
2    Krishna   28   52000                 90
3  Ghanshyam   30   45000                 92



## Data Manipulation

In [91]:
# Add a new column to the DataFrame.
df = pd.DataFrame({
    "Name" : ['Shiv','Ram','Shayam','Ghanshyam','Krishna'],
    "Salary" : [50000,60000,52000,45000,49000],
})

df["EmployeID"] = ['E01','E02','E03','E04','E05'] 
print(df)
df.insert(1,"Age",[24,35,26,43,30])
print(df)
print()

# Rename columns.
df.rename(columns={
    "Name": "EmployeeName",
    "Salary": "MonthlySalary",
    "Age": "EmployeeAge"
}, inplace=True)

print(df)
print()


# Sort the DataFrame by a column (ascending and descending).
df.sort_values(by="MonthlySalary",ascending=True,inplace=True)
print(df)
print()

# Filter rows based on multiple conditions.
filtered = df[(df["EmployeeAge"] > 30) & (df["MonthlySalary"] > 45000 )]
print(filtered)
print()


        Name  Salary EmployeID
0       Shiv   50000       E01
1        Ram   60000       E02
2     Shayam   52000       E03
3  Ghanshyam   45000       E04
4    Krishna   49000       E05
        Name  Age  Salary EmployeID
0       Shiv   24   50000       E01
1        Ram   35   60000       E02
2     Shayam   26   52000       E03
3  Ghanshyam   43   45000       E04
4    Krishna   30   49000       E05

  EmployeeName  EmployeeAge  MonthlySalary EmployeID
0         Shiv           24          50000       E01
1          Ram           35          60000       E02
2       Shayam           26          52000       E03
3    Ghanshyam           43          45000       E04
4      Krishna           30          49000       E05

  EmployeeName  EmployeeAge  MonthlySalary EmployeID
3    Ghanshyam           43          45000       E04
4      Krishna           30          49000       E05
0         Shiv           24          50000       E01
2       Shayam           26          52000       E03
1          Ra

## Grouping and Aggregation

In [100]:
# Group the DataFrame by a column and calculate the mean.

df = pd.DataFrame({
    "Name": ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Eva', 'Ian', 'Jane'],
    "Department": ['HR', 'IT', 'Finance', 'IT', 'Finance', 'HR', 'IT', 'Finance', 'HR', 'IT'],
    "Age": [25, 30, 29, 35, 40, 26, 28, 38, 32, 31],
    "Salary": [50000, 70000, 65000, 72000, 80000, 52000, 69000, 77000, 54000, 71000]
})

grouped_frame = df.groupby(["Name","Age"])["Salary"].mean()
print(grouped_frame)
print()

# Count the number of entries in each group.
grouped_count = df.groupby("Name").size()
print(grouped_count)
print()

# Apply multiple aggregation functions (mean, sum, max) on grouped data.
grouped_agg = df.groupby("Name")["Salary"].agg(['mean', 'sum', 'max'])

print(grouped_agg)
print()


Name     Age
Alice    25     50000.0
Bob      30     70000.0
Charlie  29     65000.0
David    35     72000.0
Eva      38     77000.0
         40     80000.0
Frank    26     52000.0
Grace    28     69000.0
Ian      32     54000.0
Jane     31     71000.0
Name: Salary, dtype: float64

Name
Alice      1
Bob        1
Charlie    1
David      1
Eva        2
Frank      1
Grace      1
Ian        1
Jane       1
dtype: int64

            mean     sum    max
Name                           
Alice    50000.0   50000  50000
Bob      70000.0   70000  70000
Charlie  65000.0   65000  65000
David    72000.0   72000  72000
Eva      78500.0  157000  80000
Frank    52000.0   52000  52000
Grace    69000.0   69000  69000
Ian      54000.0   54000  54000
Jane     71000.0   71000  71000



## Merging and Joining

In [104]:
# Merge two DataFrames using a common key.


employees = pd.DataFrame({
    "EmpID": [101, 102, 103, 104, 105],
    "Name": ['Shiv', 'Ram', 'Raj', 'Krishna', 'Sanskar'],
    "DepartmentID": [1, 2, 1, 3, 2]
})

departments = pd.DataFrame({
    "DepartmentID": [1, 2, 3, 4],
    "DepartmentName": ['HR', 'IT', 'Finance', 'Marketing']
})


marge_dataframes = pd.merge(employees,departments,on="DepartmentID")
print(marge_dataframes)
print()


# Perform an inner, left, right, and outer join.

inner_joins = pd.merge(employees,departments,on="DepartmentID",how="inner")
print(inner_joins)

left_joins = pd.merge(employees,departments,on="DepartmentID",how="left")
print(left_joins)

right_joins = pd.merge(employees,departments,on="DepartmentID",how="right")
print(right_joins)

outer_joins = pd.merge(employees,departments,on="DepartmentID",how="outer")
print(outer_joins)



   EmpID     Name  DepartmentID DepartmentName
0    101     Shiv             1             HR
1    103      Raj             1             HR
2    102      Ram             2             IT
3    105  Sanskar             2             IT
4    104  Krishna             3        Finance

   EmpID     Name  DepartmentID DepartmentName
0    101     Shiv             1             HR
1    103      Raj             1             HR
2    102      Ram             2             IT
3    105  Sanskar             2             IT
4    104  Krishna             3        Finance
   EmpID     Name  DepartmentID DepartmentName
0    101     Shiv             1             HR
1    102      Ram             2             IT
2    103      Raj             1             HR
3    104  Krishna             3        Finance
4    105  Sanskar             2             IT
   EmpID     Name  DepartmentID DepartmentName
0  101.0     Shiv             1             HR
1  103.0      Raj             1             HR
2  102.0    

## Exporting Data

In [None]:
print(df)

# Save a DataFrame to a CSV file.
df.to_csv('Employee_data.csv',index=False)

      Name Department  Age  Salary
0    Alice         HR   25   50000
1      Bob         IT   30   70000
2  Charlie    Finance   29   65000
3    David         IT   35   72000
4      Eva    Finance   40   80000
5    Frank         HR   26   52000
6    Grace         IT   28   69000
7      Eva    Finance   38   77000
8      Ian         HR   32   54000
9     Jane         IT   31   71000
