# Everything about Pandas in one notebook

* Series
* DataFrames
* Missing Data
* GroupBy
* Merging, Joining, and Concatenating
* Operations
* Data Input and Output

In [1]:
import numpy as np
import pandas as pd

## Series

#### Creating a Series

In [2]:
# Shared sample inputs for the Series examples below.
labels = list('abc')                # index labels
my_list = [10, 20, 30]              # plain Python list of values
arr = np.array([10, 20, 30])        # the same values as a NumPy array
d = dict(zip(labels, my_list))      # label -> value mapping

#### 1) Using Lists

In [3]:
pd.Series(data=my_list)
# With no index given, pandas generates a default integer index (0, 1, 2, ...); we can supply our own.

0    10
1    20
2    30
dtype: int64

In [5]:
pd.Series(data=my_list, index=labels)
# Now the labels come from the `index` argument instead of the default integers.

a    10
b    20
c    30
dtype: int64

#### 2) Arrays

In [6]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

In [7]:
pd.Series(arr, labels)

a    10
b    20
c    30
dtype: int64

#### 3) Using Dictionary

In [8]:
pd.Series(d)
# A dictionary is a set of key-value pairs: its keys become the index labels and its values become the Series data.

a    10
b    20
c    30
dtype: int64

#### A pandas series can hold a variety of object types

In [9]:
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [10]:
# Functions in series
pd.Series([sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

#### Using an Index

In [11]:
ser1 = pd.Series([1,2,3,4], index = ['USA','Germany','USSR','Japan'])
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [12]:
ser2 = pd.Series([1,2,5,4], index=['USA','Germany','Italy','Japan'])
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [14]:
ser1['USA']

1

In [15]:
ser3 = pd.Series([1,2,3,4], index=['Japan','USA', 'Germany', 'USSR'])
ser3

Japan      1
USA        2
Germany    3
USSR       4
dtype: int64

In [19]:
ser1, ser3

(USA        1
 Germany    2
 USSR       3
 Japan      4
 dtype: int64,
 Japan      1
 USA        2
 Germany    3
 USSR       4
 dtype: int64)

In [17]:
ser1 + ser3
# Addition aligns on the index labels, not on position (e.g. 'USA': 1 + 2 = 3).

Germany    5
Japan      5
USA        3
USSR       7
dtype: int64

## Data Frames

In [21]:
from numpy.random import randn

# Seed NumPy's global random number generator so the examples are reproducible.
#
# The seed is the input to the pseudorandom number generator: setting it
# initializes the generator's internal state.
#
# Predictable and reproducible: with a fixed seed the generator produces the
# same sequence of pseudorandom numbers on every run, which is useful for
# testing, experimentation, and consistency between runs.
#
# No "true" randomness: with a fixed seed the numbers follow a pattern fully
# determined by the algorithm and the seed, so do not fix the seed where you
# explicitly rely on different random draws across trials.
#
# (Written as comments rather than a bare triple-quoted string: a string left
# as the cell's last expression is echoed back as the cell's output.)
np.random.seed(101)

In [25]:
df = pd.DataFrame(randn(5,4), index='A B C D E'.split(), columns='W X Y Z'.split())
df
# randn(5,4) generates a 5x4 array of random numbers; the rows are labelled 'A' to 'E' and the columns 'W' to 'Z'.
# .split() breaks each space-separated string into a list of labels.

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


#### Selection and Indexing

In [27]:
df['W'] # so we are selecting a column. 

A    0.386030
B    0.681209
C   -1.005187
D   -1.382920
E    0.992573
Name: W, dtype: float64

In [29]:
df[['W','Z']] # multiple columns

Unnamed: 0,W,Z
A,0.38603,0.230336
B,0.681209,1.939932
C,-1.005187,-0.732845
D,-1.38292,-2.141212
E,0.992573,1.292765


In [31]:
df.W

A    0.386030
B    0.681209
C   -1.005187
D   -1.382920
E    0.992573
Name: W, dtype: float64

In [32]:
type(df['W'])
# So a DataFrame is essentially multiple Series combined together: each column is a Series.

pandas.core.series.Series

#### Let's create a new column

In [33]:
df['new'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,0.38603,2.084019,-0.376519,0.230336,0.009512
B,0.681209,1.035125,-0.03116,1.939932,0.650049
C,-1.005187,-0.74179,0.187125,-0.732845,-0.818062
D,-1.38292,1.482495,0.961458,-2.141212,-0.421462
E,0.992573,1.192241,-1.04678,1.292765,-0.054206


## axis=0 means "rows"
## axis=1 means "columns"

In [36]:
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


In [37]:
df
# Even after dropping the 'new' column, original dataframe didn't change.

Unnamed: 0,W,X,Y,Z,new
A,0.38603,2.084019,-0.376519,0.230336,0.009512
B,0.681209,1.035125,-0.03116,1.939932,0.650049
C,-1.005187,-0.74179,0.187125,-0.732845,-0.818062
D,-1.38292,1.482495,0.961458,-2.141212,-0.421462
E,0.992573,1.192241,-1.04678,1.292765,-0.054206


In [38]:
df.drop('new',axis=1, inplace=True)
# inplace=True applies the change to the original DataFrame instead of returning a modified copy.
df
# Now the 'new' column has been dropped from df itself.

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


In [39]:
# lets drop row now.
df.drop('E', axis=0)

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212


In [40]:
df

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


#### Selecting Rows

In [41]:
# Bracket indexing on a DataFrame looks up column labels only, so a row label
# such as 'A' raises KeyError; rows need a different accessor.
# The error is caught and printed so this demonstration does not abort a
# "Restart & Run All" of the notebook.
try:
    df['A']
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'A'

In [43]:
df

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


In [42]:
df.loc['A'] # we can get rows using row indexes. Here we know indexes of rows so we can get it.

W    0.386030
X    2.084019
Y   -0.376519
Z    0.230336
Name: A, dtype: float64

In [44]:
df.iloc[2] # we are using numbers to get the row. Here we got row 'C'.

W   -1.005187
X   -0.741790
Y    0.187125
Z   -0.732845
Name: C, dtype: float64

### Selecting subset of rows and columns

In [45]:
df

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


In [46]:
df.loc['B','Y']

-0.031160481493099617

In [47]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,0.38603,-0.376519
B,0.681209,-0.03116


### Conditional Selection

In [48]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,,0.230336
B,0.681209,1.035125,,1.939932
C,,,0.187125,
D,,1.482495,0.961458,
E,0.992573,1.192241,,1.292765


In [55]:
df[df['W']>0]
# Where values in 'W' > 0. 

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
E,0.992573,1.192241,-1.04678,1.292765


In [57]:
# All values of column 'Y' for the rows where 'W' > 0.
# A single .loc[row_mask, column] call is used instead of chaining two [] lookups.
df.loc[df['W'] > 0, 'Y']

A   -0.376519
B   -0.031160
E   -1.046780
Name: Y, dtype: float64

In [59]:
# Columns 'Y' and 'X' (in that order) for the rows where 'Y' > 0,
# selected with one .loc[row_mask, columns] call instead of chained [] lookups.
df.loc[df['Y'] > 0, ['Y', 'X']]

Unnamed: 0,Y,X
C,0.187125,-0.74179
D,0.961458,1.482495


In [64]:
# Rows where both 'W' > 0 and 'X' > 0; each condition is parenthesised and combined with &.

df[(df['W']>0) & (df['X'] > 0)]

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
E,0.992573,1.192241,-1.04678,1.292765


In [65]:
df[(df['X']>0) & (df['W'] > 0)]

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
E,0.992573,1.192241,-1.04678,1.292765


## More Index Details


In [66]:
df

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


In [67]:
# Reset to default index
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,0.38603,2.084019,-0.376519,0.230336
1,B,0.681209,1.035125,-0.03116,1.939932
2,C,-1.005187,-0.74179,0.187125,-0.732845
3,D,-1.38292,1.482495,0.961458,-2.141212
4,E,0.992573,1.192241,-1.04678,1.292765


### Multi-index and index hierarchy

In [68]:
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside, inside))
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [71]:
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside, inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

# hier_index = list(zip(outside,inside)): This uses the zip function to pair corresponding elements from the outside and inside lists, creating a list of tuples. The resulting list looks like this: [('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)].

# hier_index = pd.MultiIndex.from_tuples(hier_index): This converts the list of tuples into a pandas MultiIndex using the from_tuples method. This MultiIndex will be used for hierarchical indexing in a DataFrame.

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [72]:
df = pd.DataFrame(np.random.randn(6,2), index= hier_index,columns=['A','B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-1.467514,-0.494095
G1,2,-0.162535,0.485809
G1,3,0.392489,0.221491
G2,1,-0.855196,1.54199
G2,2,0.666319,-0.538235
G2,3,-0.568581,1.407338


In [73]:
df.loc['G1']

Unnamed: 0,A,B
1,-1.467514,-0.494095
2,-0.162535,0.485809
3,0.392489,0.221491


In [76]:
df.loc['G1']['A']

1   -1.467514
2   -0.162535
3    0.392489
Name: A, dtype: float64

In [77]:
df.loc['G1'].loc[1]

A   -1.467514
B   -0.494095
Name: 1, dtype: float64

In [82]:
# Name the two index levels so they can be referred to by name (e.g. level='Num').
# Without this assignment the levels are unnamed on a fresh kernel.
df.index.names = ['Group', 'Num']
df.index.names

FrozenList(['Group', 'Num'])

In [81]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-1.467514,-0.494095
G1,2,-0.162535,0.485809
G1,3,0.392489,0.221491
G2,1,-0.855196,1.54199
G2,2,0.666319,-0.538235
G2,3,-0.568581,1.407338


In [84]:
df.xs('G1')
# Cross-section: returns the rows whose outer index level ('Group') equals 'G1', as a DataFrame indexed by the remaining inner level.

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-1.467514,-0.494095
2,-0.162535,0.485809
3,0.392489,0.221491


In [86]:
# Select the single row at outer level 'G1' and inner level 1.
# The multi-level key is passed as a tuple: a list key is deprecated and is
# rejected by newer pandas versions.
df.xs(('G1', 1))


A   -1.467514
B   -0.494095
Name: (G1, 1), dtype: float64

In [88]:
df.xs(1, level='Num')
# Get the rows from both outer groups where the 'Num' level equals 1.

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-1.467514,-0.494095
G2,-0.855196,1.54199


## Missing Data


In [89]:
# Example frame with missing values: NaN marks the gaps in columns 'A' and 'B'.
df = pd.DataFrame(dict(A=[1, 2, np.nan], B=[5, np.nan, np.nan], C=[1, 2, 3]))
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3
