In [79]:
import pandas as pd

print("Pandas Version:", pd.__version__)

Pandas Version: 2.3.1


In [80]:
data=[10,20,30,40,50]
series= pd.Series(data, index=["A","B","C","D","E"])
print(series)

A    10
B    20
C    30
D    40
E    50
dtype: int64


In [81]:
data_dict={
    "name":["Rudraksha","Alice","Bob","Charlie"],
    "age":[21,22,23,24],
    "salary":[1000,2000,3000,4000]
}

df=pd.DataFrame(data_dict)
print("Data Frame example")
print(df)


Data Frame example
        name  age  salary
0  Rudraksha   21    1000
1      Alice   22    2000
2        Bob   23    3000
3    Charlie   24    4000


In [82]:
#Display top and bottom rows
print(df.head(3))
print(df.tail(2))

#basic information about the dataset
print(df.info())

#summary statistics of the dataset
print(df.describe())

#column names
print(df.columns)

#shape of the dataset
print(df.shape)

        name  age  salary
0  Rudraksha   21    1000
1      Alice   22    2000
2        Bob   23    3000
      name  age  salary
2      Bob   23    3000
3  Charlie   24    4000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   age     4 non-null      int64 
 2   salary  4 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 228.0+ bytes
None
             age       salary
count   4.000000     4.000000
mean   22.500000  2500.000000
std     1.290994  1290.994449
min    21.000000  1000.000000
25%    21.750000  1750.000000
50%    22.500000  2500.000000
75%    23.250000  3250.000000
max    24.000000  4000.000000
Index(['name', 'age', 'salary'], dtype='object')
(4, 3)


In [83]:
#Selecting one column
print(df['name'])

#Selecting multiple columns
print(df[['name','salary']])

#Selecting by index positions
print(df.iloc[0])
print(df.iloc[1:3])

#Selecting by index labels
print(df.loc[0:2,['name','salary']])


0    Rudraksha
1        Alice
2          Bob
3      Charlie
Name: name, dtype: object
        name  salary
0  Rudraksha    1000
1      Alice    2000
2        Bob    3000
3    Charlie    4000
name      Rudraksha
age              21
salary         1000
Name: 0, dtype: object
    name  age  salary
1  Alice   22    2000
2    Bob   23    3000
        name  salary
0  Rudraksha    1000
1      Alice    2000
2        Bob    3000


In [84]:
df['tax']=df['salary']*0.10
print(df)


        name  age  salary    tax
0  Rudraksha   21    1000  100.0
1      Alice   22    2000  200.0
2        Bob   23    3000  300.0
3    Charlie   24    4000  400.0


In [85]:
#Add a new column
df['tax'] = df['salary'] * 0.1

#Update existing column
df['salary'] = df['salary'] + 5000

#Delete a column
df = df.drop('tax', axis=1)


#arithmetic operations
df['bonus'] = df['salary']*0.1
print(df)


        name  age  salary  bonus
0  Rudraksha   21    6000  600.0
1      Alice   22    7000  700.0
2        Bob   23    8000  800.0
3    Charlie   24    9000  900.0


In [86]:
print(df)
#introducing missing values
df.loc[2,'age']=None
print("With missing values:\n",df)

#Check for null values
df.isnull()

#Filling missing values
df['age'].fillna(df['age'].mean(),inplace=True)
print(df)

#drop rows with missing values
#df.dropna(inplace=True)
#print("After handling missing values:\n",df)

        name  age  salary  bonus
0  Rudraksha   21    6000  600.0
1      Alice   22    7000  700.0
2        Bob   23    8000  800.0
3    Charlie   24    9000  900.0
With missing values:
         name   age  salary  bonus
0  Rudraksha  21.0    6000  600.0
1      Alice  22.0    7000  700.0
2        Bob   NaN    8000  800.0
3    Charlie  24.0    9000  900.0
        name        age  salary  bonus
0  Rudraksha  21.000000    6000  600.0
1      Alice  22.000000    7000  700.0
2        Bob  22.333333    8000  800.0
3    Charlie  24.000000    9000  900.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].mean(),inplace=True)


In [87]:
#filter data where Age > 30
print(df[df['age'] > 30])

#Multiple conditions
print(df[(df['age'] > 30) & (df['salary'] == '2000')])

Empty DataFrame
Columns: [name, age, salary, bonus]
Index: []
Empty DataFrame
Columns: [name, age, salary, bonus]
Index: []


In [88]:
#Sorting and Ranking
#Sort by Salary
print(df.sort_values("salary", ascending=False))

#sort by multiple columns
print(df.sort_values(["salary", "age"], ascending=[False]))

#Rank by Salary
print(df.rank("salary", ascending=False))

#Group by Salary
print(df.groupby("salary").sum())


        name        age  salary  bonus
3    Charlie  24.000000    9000  900.0
2        Bob  22.333333    8000  800.0
1      Alice  22.000000    7000  700.0
0  Rudraksha  21.000000    6000  600.0


ValueError: Length of ascending (1) != length of by (2)

In [None]:
#Group by exmaple 
group_data=df.groupby('age')['salary'].mean()
print(group_data)

#Multiple aggrigations
agg_data=df.groupby('age').agg({'salary':['mean','max','min']})
print(agg_data)


age
21.000000    6000.0
22.000000    7000.0
22.333333    8000.0
24.000000    9000.0
Name: salary, dtype: float64
           salary            
             mean   max   min
age                          
21.000000  6000.0  6000  6000
22.000000  7000.0  7000  7000
22.333333  8000.0  8000  8000
24.000000  9000.0  9000  9000


In [None]:
#Merging & Joining DataFrames
dept_data=pd.DataFrame({
    'name':['Rudraksha','Alice','Bob','Charlie'],
    'dept':['HR','IT','IT','HR']
})

merged_data=pd.merge(df,dept_data,on='name')
print(merged_data)

        name        age  salary  bonus dept
0  Rudraksha  21.000000    6000  600.0   HR
1      Alice  22.000000    7000  700.0   IT
2        Bob  22.333333    8000  800.0   IT
3    Charlie  24.000000    9000  900.0   HR


In [None]:
#concatination
df1=df.head(2)
df2=df.tail(2)

combined = pd.concat([df1, df2])
print(combined)

        name        age  salary  bonus
0  Rudraksha  21.000000    6000  600.0
1      Alice  22.000000    7000  700.0
2        Bob  22.333333    8000  800.0
3    Charlie  24.000000    9000  900.0


In [None]:
#statistical & Mathematical functions
print("Mean Salary",df['salary'].mean())
print("Max Age",df['age'].max())
print("Min Age",df['age'].min())
print("Standard Deviation",df['salary'].std())



Mean Salary 2500.0
Max Age 24
Min Age 21
Standard Deviation 1290.9944487358057


In [None]:
#Add a date column
df['joining_year']=pd.date_range(start='2020-01-01',periods=len(df),freq='Y')

#Extract year
df['joining_year']=df['joining_date'].dt.year

print(df[['name','joining_date','joining_year']])

  df['joining_year']=pd.date_range(start='2020-01-01',periods=len(df),freq='Y')


KeyError: 'joining_date'

In [None]:
#Exporting Final Data
#Save Final dataset
df.to_csv("final_dataset.csv",index=False)
print("Final dataset saved successfully")



Final dataset saved successfully
