# What are the basic data structures in pandas?

In [4]:
import pandas as pd

df = pd.Series(data=[1,2,3,4], index=['a','b', 'c', 'd'])
df

a    1
b    2
c    3
d    4
dtype: int64

In [11]:
df = pd.DataFrame({
    
    'City':['vizag','Delhi','Ankapalle'],
    'Temperature':[33,22,11],
    'Humidity':[80,60,55],
})
df.head()
df['Temperature']>22

0     True
1    False
2    False
Name: Temperature, dtype: bool

# How do you read a CSV,JSON,EXCEL file in pandas?


In [None]:
df1 = pd.read_csv('filename.csv')
df2 = pd.read_json('filename.json')
df3 = pd.read_excel('filename.xlsx')  

# How can you select specific columns from a DataFrame in pandas?

In [None]:
df = pd.read_csv('D:\DataSets\Pfizer_1.csv')
df.head()

In [None]:
df.index = [chr(i) for i in range(65,83)]
df['Date'].head()

In [None]:
df.iloc[0]

In [None]:
df.loc['A']

# How do you filter rows in a DataFrame based on a condition in pandas?


In [None]:
df['Date']=pd.to_datetime(df['Date'])
df.info()

In [None]:
df[df['Date']>='2020-10-17']

# How do you handle missing values in a DataFrame using pandas?


In [None]:
import numpy as np
df = pd.DataFrame({"name": [np.nan,'Alfred', 'Batman', 'Catwoman'],
                   "toy": [np.nan,2, 'Batmobile', 'Bullwhip'],
                   "born": [np.nan,3, pd.Timestamp("1940-04-25"),pd.NaT],
                    "check":[np.nan,pd.NaT, pd.NaT,pd.NaT],
                    "check1":[1,2,3,4]
                  })

df1=df.copy()
df

In [None]:
for i in df.columns:
    null_rate = df[i].isna().sum() / len(df) * 100
    if null_rate >= 0 :
        print("{} null rate: {}%".format(i,round(null_rate,2)))

In [None]:
df.dropna()

In [None]:
df.dropna(subset=['toy'])

In [None]:
# if a row contains 2 or more null values
df.dropna(thresh=2)

In [None]:
# if a row contains 2 or more null values
df.dropna(how='all')

In [None]:
df.fillna(0)

In [None]:
df.fillna(method='ffill')

In [None]:
df.fillna(method='bfill')

In [None]:
df1['check']=df1['check'].fillna(101)
df1

# How do you merge two DataFrames in pandas?


In [None]:
customer_table = pd.DataFrame({'Customerid': [1,2,3,4],'name':['shanmukh','sashank','sachin','Nikhil']})
order_table = pd.DataFrame({'Customerid': [1,2,5,8]})
output = pd.merge(customer_table,order_table,on='Customerid')
output

In [None]:
customer_table.merge(order_table, how='inner', on='Customerid')

In [None]:
customer_table = pd.DataFrame({'Customerid': [1,2,3,4],'name':['shanmukh','sashank','sachin','Nikhil']})
order_table = pd.DataFrame({'id': [1,2,5,8]})
output = pd.merge(customer_table,order_table,left_on='Customerid',right_on='id')
output

In [None]:
customer_table.merge(order_table, how='right', left_on='Customerid',right_on='id')

In [None]:
customer_table.merge(order_table, how='left', left_on='Customerid',right_on='id')

In [None]:
customer_table.merge(order_table, how='cross')

# How do you perform groupby operation in pandas?


In [None]:
df = pd.DataFrame({'Student': ['Sashank', 'Sashank',
                              'Manideep','Manideep', 'Sachin','Sampath','Nikhi','Shanmukh','Shanmukh'],
                   'Points': [9,10,10,10,10,8,9,3,2]})

In [None]:
# df.groupby(['Student']).mean()
# df.groupby(['Student']).median()
# df.groupby(['Student']).count()
df1 = df.groupby(['Student'])
df.groupby(['Student']).sum()

In [None]:
df1.get_group('Manideep')

In [None]:
df1['Student'].count()

In [None]:
df1.agg({'Points': ['max', 'min']})

# How do you rename columns in a DataFrame using pandas?


In [1]:
df = pd.DataFrame({'Student': ['Sashank', 'Sashank',
                              'Manideep','Manideep', 'Sachin','Sampath','Nikhi','Shanmukh','Shanmukh'],
                   'Points': [9,10,10,10,10,8,9,3,2]})
df.head()

NameError: name 'pd' is not defined

In [608]:
df = df.rename(columns={'Student': 'Scaler_Candidate'})
df.head()

Unnamed: 0,Scaler_Candidate,Points
0,Sashank,9
1,Sashank,10
2,Manideep,10
3,Manideep,10
4,Sachin,10


# How do you sort a DataFrame by a specific column in pandas?


In [609]:
df_sorted = df.sort_values(by=['Points', 'Scaler_Candidate'], ascending=[False,False])
df_sorted

Unnamed: 0,Scaler_Candidate,Points
1,Sashank,10
4,Sachin,10
2,Manideep,10
3,Manideep,10
0,Sashank,9
6,Nikhi,9
5,Sampath,8
7,Shanmukh,3
8,Shanmukh,2


# How do you aggregate data using pandas?


In [610]:
df['Points'].max()

10

In [611]:
df['Points'].min()

2

In [612]:
df['Points'].mean()

7.888888888888889

In [613]:
df['Points'].median()

9.0

In [614]:
df['Points'].mode()

0    10
dtype: int64

# How do you apply a function to each element in a DataFrame in pandas?


In [615]:
df.head()

Unnamed: 0,Scaler_Candidate,Points
0,Sashank,9
1,Sashank,10
2,Manideep,10
3,Manideep,10
4,Sachin,10


In [616]:
df['Points']=df['Points'].apply(lambda x:x-1)

In [617]:
df.head()

Unnamed: 0,Scaler_Candidate,Points
0,Sashank,8
1,Sashank,9
2,Manideep,9
3,Manideep,9
4,Sachin,9


# How do you handle duplicate data in a DataFrame using pandas?


In [618]:
df[df.duplicated()]

Unnamed: 0,Scaler_Candidate,Points
3,Manideep,9


In [619]:
df.drop_duplicates(keep="last")

Unnamed: 0,Scaler_Candidate,Points
0,Sashank,8
1,Sashank,9
3,Manideep,9
4,Sachin,9
5,Sampath,7
6,Nikhi,8
7,Shanmukh,2
8,Shanmukh,1


In [620]:
df.drop_duplicates(keep="first")

Unnamed: 0,Scaler_Candidate,Points
0,Sashank,8
1,Sashank,9
2,Manideep,9
4,Sachin,9
5,Sampath,7
6,Nikhi,8
7,Shanmukh,2
8,Shanmukh,1


# How do you calculate descriptive statistics for a DataFrame using pandas?


In [621]:
df.describe()

Unnamed: 0,Points
count,9.0
mean,6.888889
std,3.140241
min,1.0
25%,7.0
50%,8.0
75%,9.0
max,9.0


In [622]:
df.describe(include="all")

Unnamed: 0,Scaler_Candidate,Points
count,9,9.0
unique,6,
top,Sashank,
freq,2,
mean,,6.888889
std,,3.140241
min,,1.0
25%,,7.0
50%,,8.0
75%,,9.0


In [623]:
df.describe(include=object)

Unnamed: 0,Scaler_Candidate
count,9
unique,6
top,Sashank
freq,2


# How do you set and reset the index of a DataFrame using pandas?


In [624]:
df.head()

Unnamed: 0,Scaler_Candidate,Points
0,Sashank,8
1,Sashank,9
2,Manideep,9
3,Manideep,9
4,Sachin,9


In [625]:
df.set_index('Scaler_Candidate')

Unnamed: 0_level_0,Points
Scaler_Candidate,Unnamed: 1_level_1
Sashank,8
Sashank,9
Manideep,9
Manideep,9
Sachin,9
Sampath,7
Nikhi,8
Shanmukh,2
Shanmukh,1


In [626]:
df1=df.set_index('Scaler_Candidate')
df1

Unnamed: 0_level_0,Points
Scaler_Candidate,Unnamed: 1_level_1
Sashank,8
Sashank,9
Manideep,9
Manideep,9
Sachin,9
Sampath,7
Nikhi,8
Shanmukh,2
Shanmukh,1


In [627]:
df1.loc['Sashank']

Unnamed: 0_level_0,Points
Scaler_Candidate,Unnamed: 1_level_1
Sashank,8
Sashank,9


In [628]:
df1=df1.reset_index('Scaler_Candidate')
df1

Unnamed: 0,Scaler_Candidate,Points
0,Sashank,8
1,Sashank,9
2,Manideep,9
3,Manideep,9
4,Sachin,9
5,Sampath,7
6,Nikhi,8
7,Shanmukh,2
8,Shanmukh,1


# How do you concatenate multiple DataFrames in pandas?


In [629]:
df = pd.DataFrame({'Student': ['Sashank', 'Sashank',
                              'Manideep','Manideep', 'Sachin','Sampath','Nikhi','Shanmukh','Shanmukh'],
                   'Points': [9,10,10,10,10,8,9,3,2]})
df2 = pd.DataFrame({'Location': ['Hyd', 'Hyd',""
                              'Vizag','Vizag', 'Banglore','Banglore','Banglore','BZA','BZA']})

result = pd.concat([df, df2], axis=1)
print(result)

    Student  Points  Location
0   Sashank       9       Hyd
1   Sashank      10       Hyd
2  Manideep      10     Vizag
3  Manideep      10     Vizag
4    Sachin      10  Banglore
5   Sampath       8  Banglore
6     Nikhi       9  Banglore
7  Shanmukh       3       BZA
8  Shanmukh       2       BZA


In [630]:

df = pd.DataFrame({'Student': ['Sashank', 'Sashank',
                              'Manideep','Manideep', 'Sachin','Sampath','Nikhi','Shanmukh','Shanmukh'],
                   'Points': [9,10,10,10,10,8,9,3,2]})
df2 = pd.DataFrame({'Location': ['Hyd', 'Hyd',
                              'Vizag','Vizag', 'Banglore','Banglore','Banglore','BZA','BZA']})

result = pd.concat([df, df2], axis=0)
print(result)

    Student  Points  Location
0   Sashank     9.0       NaN
1   Sashank    10.0       NaN
2  Manideep    10.0       NaN
3  Manideep    10.0       NaN
4    Sachin    10.0       NaN
5   Sampath     8.0       NaN
6     Nikhi     9.0       NaN
7  Shanmukh     3.0       NaN
8  Shanmukh     2.0       NaN
0       NaN     NaN       Hyd
1       NaN     NaN       Hyd
2       NaN     NaN     Vizag
3       NaN     NaN     Vizag
4       NaN     NaN  Banglore
5       NaN     NaN  Banglore
6       NaN     NaN  Banglore
7       NaN     NaN       BZA
8       NaN     NaN       BZA


# How do you pivot a DataFrame in pandas?


In [631]:
df = pd.DataFrame({'Maths': [9,3,2,3],
                   'Physics': [0,0,0,1],
                   'Chemistry': [10, 2, 3, 6],
                   'Biology': [10,1,10,10]},index=['Sachin','Shanmukh','Sashank','Manideep'])

df


Unnamed: 0,Maths,Physics,Chemistry,Biology
Sachin,9,0,10,10
Shanmukh,3,0,2,1
Sashank,2,0,3,10
Manideep,3,1,6,10


In [632]:
df_transposed = df.T
df_transposed.index.name = 'subjects'
df_transposed

Unnamed: 0_level_0,Sachin,Shanmukh,Sashank,Manideep
subjects,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Maths,9,3,2,3
Physics,0,0,0,1
Chemistry,10,2,3,6
Biology,10,1,10,10


In [633]:
table = pd.pivot_table(df_transposed, columns='subjects',values=['Manideep'])
table

subjects,Biology,Chemistry,Maths,Physics
Manideep,10,6,3,1


# How do you melt a DataFrame in pandas?


In [634]:
df_transposed = df_transposed.reset_index().rename(columns={'index': 'subjects'})
df_transposed

Unnamed: 0,subjects,Sachin,Shanmukh,Sashank,Manideep
0,Maths,9,3,2,3
1,Physics,0,0,0,1
2,Chemistry,10,2,3,6
3,Biology,10,1,10,10


In [635]:
pd.melt(df_transposed, id_vars=['subjects'], var_name='Name', value_name='Score')

Unnamed: 0,subjects,Name,Score
0,Maths,Sachin,9
1,Physics,Sachin,0
2,Chemistry,Sachin,10
3,Biology,Sachin,10
4,Maths,Shanmukh,3
5,Physics,Shanmukh,0
6,Chemistry,Shanmukh,2
7,Biology,Shanmukh,1
8,Maths,Sashank,2
9,Physics,Sashank,0


In [636]:
#example 2
df

subjects,Maths,Physics,Chemistry,Biology
Sachin,9,0,10,10
Shanmukh,3,0,2,1
Sashank,2,0,3,10
Manideep,3,1,6,10


In [637]:
df.index
df = df.reset_index().rename(columns={'index': 'candidate'})


In [638]:
pd.melt(df, id_vars=['candidate'], var_name='Name', value_name='Score')

Unnamed: 0,candidate,Name,Score
0,Sachin,Maths,9
1,Shanmukh,Maths,3
2,Sashank,Maths,2
3,Manideep,Maths,3
4,Sachin,Physics,0
5,Shanmukh,Physics,0
6,Sashank,Physics,0
7,Manideep,Physics,1
8,Sachin,Chemistry,10
9,Shanmukh,Chemistry,2


In [639]:
df2 = pd.read_csv('D:\DataSets\Pfizer_1.csv')
print(df2.shape)
df2.head()


(18, 15)


Unnamed: 0,Date,Drug_Name,Parameter,1:30:00,2:30:00,3:30:00,4:30:00,5:30:00,6:30:00,7:30:00,8:30:00,9:30:00,10:30:00,11:30:00,12:30:00
0,15-10-2020,diltiazem hydrochloride,Temperature,23.0,22.0,,21.0,21.0,22,23.0,21.0,22.0,20,20.0,21
1,15-10-2020,diltiazem hydrochloride,Pressure,12.0,13.0,,11.0,13.0,14,16.0,16.0,24.0,18,19.0,20
2,15-10-2020,docetaxel injection,Temperature,,17.0,18.0,,17.0,18,,,23.0,23,25.0,25
3,15-10-2020,docetaxel injection,Pressure,,22.0,22.0,,22.0,23,,,27.0,26,29.0,28
4,15-10-2020,ketamine hydrochloride,Temperature,24.0,,,27.0,,26,25.0,24.0,23.0,22,21.0,20


In [640]:
pd.melt(df2, id_vars=['Date', 'Parameter', 'Drug_Name'])                

Unnamed: 0,Date,Parameter,Drug_Name,variable,value
0,15-10-2020,Temperature,diltiazem hydrochloride,1:30:00,23.0
1,15-10-2020,Pressure,diltiazem hydrochloride,1:30:00,12.0
2,15-10-2020,Temperature,docetaxel injection,1:30:00,
3,15-10-2020,Pressure,docetaxel injection,1:30:00,
4,15-10-2020,Temperature,ketamine hydrochloride,1:30:00,24.0
...,...,...,...,...,...
211,17-10-2020,Pressure,diltiazem hydrochloride,12:30:00,14.0
212,17-10-2020,Temperature,docetaxel injection,12:30:00,23.0
213,17-10-2020,Pressure,docetaxel injection,12:30:00,28.0
214,17-10-2020,Temperature,ketamine hydrochloride,12:30:00,24.0


In [641]:
#example4

In [642]:
import pandas as pd

data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Maths': [80, 70, 90],
    'Science': [90, 75, 85],
    'History': [85, 80, 95]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print(df.shape)

# melt the DataFrame to transform the subjects columns
melted_df = pd.melt(df, id_vars=['Name'], value_vars=['Maths', 'Science', 'History'], var_name='Subject', value_name='Score')
print("\nMelted DataFrame:")
print(melted_df)
print(melted_df.shape)

Original DataFrame:
      Name  Maths  Science  History
0    Alice     80       90       85
1      Bob     70       75       80
2  Charlie     90       85       95
(3, 4)

Melted DataFrame:
      Name  Subject  Score
0    Alice    Maths     80
1      Bob    Maths     70
2  Charlie    Maths     90
3    Alice  Science     90
4      Bob  Science     75
5  Charlie  Science     85
6    Alice  History     85
7      Bob  History     80
8  Charlie  History     95
(9, 3)


# MELT and EXPLODE

In [643]:
import pandas as pd

data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Colors': [['Red', 'Green'], ['Blue', 'Green'], ['Red', 'Blue', 'Green']]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print(df.shape)

# melt the DataFrame to transform the 'Colors' column
melted_df = pd.melt(df, id_vars=['Name'], var_name='Variable', value_name='Value')
print("\nMelted DataFrame:")
print(melted_df)
print(melted_df.shape)

# explode the 'Value' column to extract each color
exploded_df = melted_df.explode('Value')
print("\nExploded DataFrame:")
print(exploded_df)
print(exploded_df.shape)

Original DataFrame:
      Name              Colors
0    Alice        [Red, Green]
1      Bob       [Blue, Green]
2  Charlie  [Red, Blue, Green]
(3, 2)

Melted DataFrame:
      Name Variable               Value
0    Alice   Colors        [Red, Green]
1      Bob   Colors       [Blue, Green]
2  Charlie   Colors  [Red, Blue, Green]
(3, 3)

Exploded DataFrame:
      Name Variable  Value
0    Alice   Colors    Red
0    Alice   Colors  Green
1      Bob   Colors   Blue
1      Bob   Colors  Green
2  Charlie   Colors    Red
2  Charlie   Colors   Blue
2  Charlie   Colors  Green
(7, 3)


# How do you calculate the correlation between columns in a DataFrame using pandas?


In [644]:
import pandas as pd

# create a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Maths': [80, 70, 90],
    'Science': [90, 75, 85],
    'History': [85, 80, 95]
}

df = pd.DataFrame(data)

# calculate the correlation matrix between all columns
correlation_matrix = df.corr()

print(correlation_matrix)


            Maths   Science   History
Maths    1.000000  0.654654  0.981981
Science  0.654654  1.000000  0.500000
History  0.981981  0.500000  1.000000


# How do you handle outliers in a DataFrame using pandas?


In [645]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 100],
                   'B': [10, 20, 30, 40, 50, 200]})

# identify and trim outliers in column A
q1 = df['A'].quantile(0.25)
q3 = df['A'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5*iqr
upper_bound = q3 + 1.5*iqr
df = df[(df['A'] > lower_bound) & (df['A'] < upper_bound)]

print(df)

   A   B
0  1  10
1  2  20
2  3  30
3  4  40
4  5  50


# How do you extract unique values from a column in a DataFrame using pandas?


In [646]:
import pandas as pd

# create a sample DataFrame
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]})

# extract unique values from column A
unique_values = df['A'].unique()

print(unique_values)

[1 2 3 4 5]


# How do you calculate cumulative sum in a DataFrame using pandas?


In [647]:
import pandas as pd

# create a sample DataFrame
df = pd.DataFrame({'A': [1, 2, 3, 4, 5]})

# calculate the cumulative sum of column A
cumulative_sum = df['A'].cumsum()

print(cumulative_sum)

0     1
1     3
2     6
3    10
4    15
Name: A, dtype: int64


# How do you convert data types of columns in a DataFrame using pandas?


In [648]:
import pandas as pd

# create a sample DataFrame
df = pd.DataFrame({'A': ['1', '2', '3'],
                   'B': ['4.0', '5.0', '6.0']})

# convert column A to integer type
df['A'] = df['A'].astype(int)

# convert column B to float type
df['B'] = df['B'].astype(float)

print(df.dtypes)

A      int32
B    float64
dtype: object


# How do you handle datetime data in a DataFrame using pandas?


In [649]:
import pandas as pd

# create a DataFrame
df = pd.DataFrame({
    'date_str': ['2022-05-01', '2022-05-02', '2022-05-03', '2022-05-04', '2022-05-05'],
    'value': [10, 20, 30, 40, 50]
})

# convert 'date_str' column to datetime format
df['date'] = pd.to_datetime(df['date_str'])

# drop the original 'date_str' column
df.drop('date_str', axis=1, inplace=True)

# print the updated DataFrame
print(df)


   value       date
0     10 2022-05-01
1     20 2022-05-02
2     30 2022-05-03
3     40 2022-05-04
4     50 2022-05-05


# How do you handle multi-level indexing in pandas?


In [650]:
import pandas as pd

# create a DataFrame with multi-level indexing
data = {'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
        'C': [9, 10, 11, 12]}
index = pd.MultiIndex.from_tuples([('Group1', 'A'), ('Group1', 'B'), ('Group2', 'A'), ('Group2', 'B')], names=['Group', 'Letter'])
df = pd.DataFrame(data, index=index)

# output the DataFrame
print(df)


               A  B   C
Group  Letter          
Group1 A       1  5   9
       B       2  6  10
Group2 A       3  7  11
       B       4  8  12


In [32]:
import numpy as np

marks = [10,0,20, 40, 50, 30, 60, 80, 90, 100, 70]

# Calculate the 50th percentile
pct = np.percentile(marks, 50)

print("The 25th percentile marks are:", pct)

The 25th percentile marks are: 50.0
