In [None]:
# pandas in python is a data manipulation library
# it is used to read and write data between in-memory data structures and files
# it is used to clean, transform, and analyze data
# it is used to visualize data
# it is used to create data structures

In [1]:
# install pandas using pip
%pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.4-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.5 MB 17.6 MB/s eta 0:00:01
   ------ --------------------------------- 1.8/11.5 MB 4.4 MB/s eta 0:00:03
   --------------- ------------------------ 4.5/11.5 MB 7.1 MB/s eta 0:00:01
   --------------------- ------------------ 6.3/11.5 MB 7.6 MB/s eta 0:00:01
   ----------------------------- ---------- 8.4/11.5 MB 8.0 MB/s eta 0:00:01
   ------------------------------------- -- 10.7/11.5 MB 8.6 MB/s eta 0:00:01
   ---------------------------------------- 11.5/11.5 MB 7.9 MB/s eta 0:00:00
Downloading numpy-2.2.4-cp313

In [3]:
# import pandas
import pandas as pd

# series 
# a series is a one-dimensional labeled array capable of holding any data type
# a series is similar to a list or a column in a table

# example

# step 1 - create a data

# create a list of data
data = [1, 2, 3, 4, 5]

# create a series

s = pd.Series(data)
print(s)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [4]:
# modify the index of the series
s = pd.Series(data, index=['a', 'b', 'c', 'd', 'e'])
print(s)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [5]:
# create a series from a dictionary
data = {'name': 'john', 'age': 25, 'city': 'new york'}
s = pd.Series(data)
print(s)

name        john
age           25
city    new york
dtype: object


In [6]:
# add column name to the series
s = pd.Series(data, name='person')
print(s)

name        john
age           25
city    new york
Name: person, dtype: object


In [9]:
# data frame
# a data frame is a two-dimensional labeled data structure with columns of potentially different types
# a data frame is similar to a table in a database


# example

# step 1 - create a data

data = {'Name':"Nishant", 'Age': 25, 'City': 'New York'}

df = pd.DataFrame(data , index=[0])
print(df)

      Name  Age      City
0  Nishant   25  New York


In [10]:
# create a data frame from a list of dictionaries

data = [{'Name': 'Nishant', 'Age': 25, 'City': 'New York'}, {'Name': 'John', 'Age': 30, 'City': 'Chicago'}]

df = pd.DataFrame(data)
print(df)

      Name  Age      City
0  Nishant   25  New York
1     John   30   Chicago


In [13]:
# read csv file 

d_file = pd.read_csv('dataset/employee_data.csv')

print(d_file.head())

             Name  Salary               City   Department  Age Date of Join
0  Kristina Brown  102161       Johnsonhaven        Sales   38   2023-11-07
1     Kelly Moran  107822  North Michaelside  Engineering   39   2018-07-18
2   Miranda Silva   90319           Deanbury           HR   32   2024-11-22
3    Paul Bennett   85464          Barryfort    Marketing   47   2018-11-28
4  Jessica Powers  115027         Meyerburgh      Finance   48   2018-10-30


In [14]:
# convert into data frame

df = pd.DataFrame(d_file)
print(df)

                  Name  Salary                 City   Department  Age  \
0       Kristina Brown  102161         Johnsonhaven        Sales   38   
1          Kelly Moran  107822    North Michaelside  Engineering   39   
2        Miranda Silva   90319             Deanbury           HR   32   
3         Paul Bennett   85464            Barryfort    Marketing   47   
4       Jessica Powers  115027           Meyerburgh      Finance   48   
5      William Leonard   57891          Lake Ashley           HR   22   
6     Jennifer Jenkins   47585         Hernandezton  Engineering   25   
7          Jason Lyons   73351            Jonesbury        Sales   39   
8       Nancy Gonzalez  114752        South Vincent        Sales   56   
9          George Hunt   32606           Williebury           HR   43   
10        Jenna Thomas  107020     Schneiderborough  Engineering   59   
11     Brandon Wheeler   70970      North Jamiefurt        Sales   57   
12  Dr. Robert Roberts   51320            Lopeztown

In [None]:
# get the shape of the data frame
print(df.shape)

(50, 6)


In [17]:
# get the data type of each column
print(df.dtypes)
print(df.columns)


Name            object
Salary           int64
City            object
Department      object
Age              int64
Date of Join    object
dtype: object
Index(['Name', 'Salary', 'City', 'Department', 'Age', 'Date of Join'], dtype='object')


In [18]:
# get data summary
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          50 non-null     object
 1   Salary        50 non-null     int64 
 2   City          50 non-null     object
 3   Department    50 non-null     object
 4   Age           50 non-null     int64 
 5   Date of Join  50 non-null     object
dtypes: int64(2), object(4)
memory usage: 2.5+ KB
None


In [19]:
# get the first n rows of the data frame
print(df.head(2))

             Name  Salary               City   Department  Age Date of Join
0  Kristina Brown  102161       Johnsonhaven        Sales   38   2023-11-07
1     Kelly Moran  107822  North Michaelside  Engineering   39   2018-07-18


In [20]:
# get the last n rows of the data frame
print(df.tail(2))

                 Name  Salary          City Department  Age Date of Join
48  Christian Roberts   42055    Reillystad         HR   23   2016-04-24
49   Jeremiah Garrett   30957  New Danielle  Marketing   54   2023-01-09


In [21]:
# get the specific column from the data frame
print(df['Name'])
print(df.Name)

0         Kristina Brown
1            Kelly Moran
2          Miranda Silva
3           Paul Bennett
4         Jessica Powers
5        William Leonard
6       Jennifer Jenkins
7            Jason Lyons
8         Nancy Gonzalez
9            George Hunt
10          Jenna Thomas
11       Brandon Wheeler
12    Dr. Robert Roberts
13      Jessica Franklin
14      Lawrence Marquez
15       Elizabeth Smith
16          Julia Church
17         Destiny Lewis
18             Pam Smith
19         Steven Barron
20        Tyler Morrison
21        Adriana Patton
22           Susan Lewis
23            Megan Meza
24         Donald Conley
25      Tony Fitzpatrick
26        William Krause
27          Nancy Taylor
28          Haley Little
29        Michael Carney
30        Crystal Zamora
31          Lauren Jones
32           Paul Molina
33         Sean Brown MD
34          Howard Miles
35          Jacob Farmer
36         Mathew Melton
37            Susan Nash
38           Dave Cooper
39    Elizabeth Mcdaniel


In [22]:
# find data age between 25 and 30
fliter_age = df[(df['Age'] > 25) & (df['Age'] < 30)]
print(fliter_age)


                  Name  Salary             City Department  Age Date of Join
12  Dr. Robert Roberts   51320        Lopeztown  Marketing   28   2020-08-22
20      Tyler Morrison   91565  East Barbaraton    Finance   27   2020-10-14
25    Tony Fitzpatrick   39705     Santiagofurt      Sales   27   2015-05-17
29      Michael Carney   71724     Lake Russell  Marketing   27   2015-04-10


In [25]:
# create a group by city
group_city = df.groupby('City')

# get the data of the group

for city, data in group_city:
    print(city)


Aguirremouth
Alvarezchester
Angelicamouth
Barryfort
Brewerview
Clarkberg
Coleside
Davidfurt
Deanbury
East Barbaraton
East Kim
East Philip
Fieldsfort
Fullerberg
Garciafurt
Hernandezton
Hintonmouth
Johnsonborough
Johnsonhaven
Jonesbury
Lake Ashley
Lake Brandon
Lake Russell
Lopeztown
Louisburgh
Meyerburgh
New Amanda
New Danielle
Nicolemouth
North Deannaton
North Jamiefurt
North Jenniferhaven
North Martinmouth
North Michaelside
North Waynechester
Port Christinaburgh
Port Dawnmouth
Port Nicolemouth
Reillystad
Romeroborough
Santiagofurt
Schneiderborough
Smithchester
South Erictown
South Josephchester
South Richard
South Vincent
Stephanieview
Thomasfurt
Williebury


In [26]:
# count the number of data in each group
print(group_city.size())

City
Aguirremouth           1
Alvarezchester         1
Angelicamouth          1
Barryfort              1
Brewerview             1
Clarkberg              1
Coleside               1
Davidfurt              1
Deanbury               1
East Barbaraton        1
East Kim               1
East Philip            1
Fieldsfort             1
Fullerberg             1
Garciafurt             1
Hernandezton           1
Hintonmouth            1
Johnsonborough         1
Johnsonhaven           1
Jonesbury              1
Lake Ashley            1
Lake Brandon           1
Lake Russell           1
Lopeztown              1
Louisburgh             1
Meyerburgh             1
New Amanda             1
New Danielle           1
Nicolemouth            1
North Deannaton        1
North Jamiefurt        1
North Jenniferhaven    1
North Martinmouth      1
North Michaelside      1
North Waynechester     1
Port Christinaburgh    1
Port Dawnmouth         1
Port Nicolemouth       1
Reillystad             1
Romeroborough       

In [28]:
# increase the salary with 10% for only age greater than 25

df['Salary'] = df['Salary'].apply(lambda x: x*1.1 if x > 25 else x)

print(df)

                  Name    Salary                 City   Department  Age  \
0       Kristina Brown  112377.1         Johnsonhaven        Sales   38   
1          Kelly Moran  118604.2    North Michaelside  Engineering   39   
2        Miranda Silva   99350.9             Deanbury           HR   32   
3         Paul Bennett   94010.4            Barryfort    Marketing   47   
4       Jessica Powers  126529.7           Meyerburgh      Finance   48   
5      William Leonard   63680.1          Lake Ashley           HR   22   
6     Jennifer Jenkins   52343.5         Hernandezton  Engineering   25   
7          Jason Lyons   80686.1            Jonesbury        Sales   39   
8       Nancy Gonzalez  126227.2        South Vincent        Sales   56   
9          George Hunt   35866.6           Williebury           HR   43   
10        Jenna Thomas  117722.0     Schneiderborough  Engineering   59   
11     Brandon Wheeler   78067.0      North Jamiefurt        Sales   57   
12  Dr. Robert Roberts   

In [29]:
# write the data frame to a csv file

df.to_csv('dataset/employee_data_updated.csv', index=False)