# Import pandas

In [None]:
import numpy as np
import pandas as pd

### Check version

In [None]:
print(pd.__version__)

2.2.2


# Pandas Series

- 1 dimensional data can be stored

- all elements of series have same data type

- one series is treated as a single column data

- series can have a name (just like column name)

- series can have index

## Creating Series

- with list : every element in list is one element in series


- with dictionary : key goes as index of series and value goes as element

### Create Series using list / tuple

In [None]:

# here default integer index is created
s1 = pd.Series([1, 3, 5, 6, 8])# check daa type of s1 : int
s2 =pd.Series([1, 3, 5, np.nan, 6, 8])# check daa type of s2 : float bcz of np.nan

In [None]:
s1

Unnamed: 0,0
0,1
1,3
2,5
3,6
4,8


In [None]:
s2

Unnamed: 0,0
0,1.0
1,3.0
2,5.0
3,
4,6.0
5,8.0


In [None]:
#all elements in a series can have
# ONLY single data type
s1 = pd.Series((1, 'IACSD', 5, np.nan, 6, 8))

s1

Unnamed: 0,0
0,1
1,IACSD
2,5
3,
4,6
5,8


###Create Series using dictionary

with dictionary : key goes as index of series and value goes as element

In [None]:
d= {1:'A',2:'B',3:'C',4:'D'}
s1 = pd.Series(d)
print(s1)
print("########################")
print("Another example")
d= {'A':500,'B':600,'C':700,'D':800}
s1 = pd.Series(d)
print(s1)

1    A
2    B
3    C
4    D
dtype: object
########################
Another example
A    500
B    600
C    700
D    800
dtype: int64


# Pandas Data Frame

- used for 2D data

- any tabular data can be handled using data frame

- can store data where every column has different data types

- efficient for column wise operations

- internally python stores dataframe like a dictionary

## create data frame



### Create data frame using list of list

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("###########")
df = pd.DataFrame(data,columns=['c1','c2','c3','c4','c5'])
print(df)
print("###########")
df = pd.DataFrame(data,columns=list('ABCDE'))
print(df)
print("###########")
df = pd.DataFrame(data,index=[101,102,103,104], columns=list('ABCDE'))
print(df)
print("###########")
df = pd.DataFrame(data,index=range(12,16), columns=list('ABCDE'))
print(df)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
   c1  c2  c3  c4   c5
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
    A   B   C   D    E
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
      A   B   C   D    E
101   1   2   3   4    5
102   4   5   6  10   11
103   7   8   9  23   34
104  10  11  12  99  100
###########
     A   B   C   D    E
12   1   2   3   4    5
13   4   5   6  10   11
14   7   8   9  23   34
15  10  11  12  99  100


In [None]:
# No broad casting ... special case
data = [[1,2,3,4,5,6],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("###########")
data = [[1],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)


    0   1   2   3    4    5
0   1   2   3   4    5  6.0
1   4   5   6  10   11  NaN
2   7   8   9  23   34  NaN
3  10  11  12  99  100  NaN
###########
    0     1     2     3      4
0   1   NaN   NaN   NaN    NaN
1   4   5.0   6.0  10.0   11.0
2   7   8.0   9.0  23.0   34.0
3  10  11.0  12.0  99.0  100.0


### Create data frame using dictionary

In [None]:
# in this dictionary one key will represent one column, and value of that key will
# contain all elements of that column
d= {'c1':[1,2,3],
    'c2':[77,88,99]}
df=pd.DataFrame(d)
print(df)

   c1  c2
0   1  77
1   2  88
2   3  99


In [None]:
df2 = pd.DataFrame(
    {
        "A": 1.0, # value 1 will broadcast to all rows of this column
        "B": pd.Timestamp("20220102"), # value here will broadcast to all rows of this column
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo", # value "foo" will broadcast to all rows of this column
    }
)

print(df2)
print("#####################")
#The columns of the resulting DataFrame have different dtypes:
print(df2.dtypes)

     A          B    C  D      E    F
0  1.0 2022-01-02  1.0  3   test  foo
1  1.0 2022-01-02  1.0  3  train  foo
2  1.0 2022-01-02  1.0  3   test  foo
3  1.0 2022-01-02  1.0  3  train  foo
#####################
A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object


In [None]:
#Creating a DataFrame by passing a NumPy array,
#with a datetime index using date_range()
#and labeled columns:

dates = pd.date_range("20230315",
                      periods=6)

dates

DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')

In [None]:
from numpy.random import default_rng
rng = default_rng()

In [None]:
rng.standard_normal((6, 4) )

array([[-0.76032819, -0.81125718, -1.1521719 , -0.72729832],
       [ 2.21106257, -0.17958266,  0.59095793,  1.11390238],
       [ 0.80982926,  1.41311344,  0.18267154,  1.444382  ],
       [-0.11374347,  0.11684345,  1.43175535,  1.52591871],
       [ 0.35220529, -0.0478784 ,  1.79651381, -0.45104472],
       [-0.80918185,  1.62168852, -2.61827783,  0.44950027]])

In [None]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

df

Unnamed: 0,A,B,C,D
2023-03-15,-0.283659,-0.471235,1.521249,-0.23918
2023-03-16,-0.614487,0.253835,-1.836311,0.150907
2023-03-17,0.601968,-0.563523,0.940894,1.961701
2023-03-18,-1.752355,0.424707,0.548925,1.748731
2023-03-19,-0.679483,-0.526874,1.747119,-0.729122
2023-03-20,0.70979,0.471119,-2.665041,-0.839319


## Head and Tail working

- df.head() : returns first 5 rows, if n is passed then first n rows

- df.tail() : returns last 5 rows, if n is passed then first n rows

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

# head
# get first 5 rows from dataframe
#
print(df.head())
print("#####################")
#tail
# get last 5 rows
# if n is passed then last n rows
df.tail(3)
print("#####################")
#index
print("Index values are ....")
print(df.index)
print("#####################")
#columns
print("columns values are ....")
print(df.columns)

                   A         B         C         D
2023-03-15 -1.214033 -0.284601  2.601810 -1.563330
2023-03-16 -0.923019 -0.309807  0.043866 -0.153661
2023-03-17  0.973080 -0.836016  0.546328  0.333955
2023-03-18 -0.800578 -2.114974  0.092404  0.203345
2023-03-19  1.149040  0.153543  0.393401  0.465227
#####################
#####################
Index values are ....
DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')
#####################
columns values are ....
Index(['A', 'B', 'C', 'D'], dtype='object')


## Data frame to numpy array

DataFrame.to_numpy() gives a NumPy representation
of the underlying data.

DataFrame.to_numpy() does not include
the index or column labels in the output.

Note that this can be an expensive operation
when your DataFrame has columns with different data types,
which comes down to a fundamental difference between
pandas and NumPy

NumPy arrays have one dtype for the entire array,
while pandas DataFrames have one dtype per column.
When you call DataFrame.to_numpy(), pandas will find
the NumPy dtype that can hold all of the dtypes
in the DataFrame. This may end up being object,
which requires casting every value to a Python object.


For DataFrame of all floating-point values,
DataFrame.to_numpy() is fast  
Also it doesnâ€™t require copying data

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("#####################")
n_arr = df.to_numpy()
print(type(n_arr))
print(n_arr)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
#####################
<class 'numpy.ndarray'>
[[  1   2   3   4   5]
 [  4   5   6  10  11]
 [  7   8   9  23  34]
 [ 10  11  12  99 100]]


# Descriptive Statistics *describe()* function

## Series Descriptive Statistics

In [None]:
s1= pd.Series([10,20,30,20,10])
s1.describe()

Unnamed: 0,0
count,5.0
mean,18.0
std,8.3666
min,10.0
25%,10.0
50%,20.0
75%,20.0
max,30.0


## Dataframe Descriptive Statistics

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("#####################")
# statistic summary of data
print(df.describe())
print("#####################")
print("Transpose of DataFrame")
# Here index and column names are swapped
df.T

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
#####################
               0          1          2          3           4
count   4.000000   4.000000   4.000000   4.000000    4.000000
mean    5.500000   6.500000   7.500000  34.000000   37.500000
std     3.872983   3.872983   3.872983  44.052998   43.500958
min     1.000000   2.000000   3.000000   4.000000    5.000000
25%     3.250000   4.250000   5.250000   8.500000    9.500000
50%     5.500000   6.500000   7.500000  16.500000   22.500000
75%     7.750000   8.750000   9.750000  42.000000   50.500000
max    10.000000  11.000000  12.000000  99.000000  100.000000
#####################
Transpose of DataFrame


Unnamed: 0,0,1,2,3
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12
3,4,10,23,99
4,5,11,34,100


# Sorting



## Sort Dataframe values using given column or columns

- sort_values()

In [None]:
data = [[1,5,3,4,11],[4,2,6,10,11],[7,8,9,23,34],[10,11,12,99,2]] #4 by 5
df = pd.DataFrame(data, index=[101,105,110,102], columns=['A','E','D','C','B'])
print(df)
print("##############")
#or list of columns
df = df.sort_values(by="B",ascending=False)
print(df)
print("##############")
#or list of columns
df = df.sort_values(by=["B",'E'],ascending=False)
print(df)
df = df.sort_values(by=["B",'E'],ascending=[True,False])
print(df)

      A   E   D   C   B
101   1   5   3   4  11
105   4   2   6  10  11
110   7   8   9  23  34
102  10  11  12  99   2
##############
      A   E   D   C   B
110   7   8   9  23  34
101   1   5   3   4  11
105   4   2   6  10  11
102  10  11  12  99   2
##############
      A   E   D   C   B
110   7   8   9  23  34
101   1   5   3   4  11
105   4   2   6  10  11
102  10  11  12  99   2
      A   E   D   C   B
102  10  11  12  99   2
101   1   5   3   4  11
105   4   2   6  10  11
110   7   8   9  23  34


## Sort indexes by an axis

- sort_index()

- axis 0 is row direction

- axis 1 is column direction

In [None]:

data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("##############")
print("Sort index on axis=0")
df.sort_index(axis=0, ascending=False)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
##############
Sort index on axis=0


Unnamed: 0,0,1,2,3,4
3,10,11,12,99,100
2,7,8,9,23,34
1,4,5,6,10,11
0,1,2,3,4,5


## Sort column names of a dataframe

- sort_index()

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data, columns=['A','E','D','C','B'])
print(df)
print("##############")
print("Sort index on axis=1")
df.sort_index(axis=1, ascending=False)

    A   E   D   C    B
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
##############
Sort index on axis=1


Unnamed: 0,E,D,C,B,A
0,2,3,4,5,1
1,5,6,10,11,4
2,8,9,23,34,7
3,11,12,99,100,10


## Sort the row indexes of a data frame

- sort_index()

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data, index=[101,105,110,102], columns=['A','E','D','C','B'])
print(df)
print("##############")
print("Sort index on axis=0")
df = df.sort_index(axis=0)
print(df)
print("##############")
print("Sort index on axis=0")
df =df.sort_index(axis=0, ascending=False)
print(df)

      A   E   D   C    B
101   1   2   3   4    5
105   4   5   6  10   11
110   7   8   9  23   34
102  10  11  12  99  100
##############
Sort index on axis=0
      A   E   D   C    B
101   1   2   3   4    5
102  10  11  12  99  100
105   4   5   6  10   11
110   7   8   9  23   34
##############
Sort index on axis=0
      A   E   D   C    B
110   7   8   9  23   34
105   4   5   6  10   11
102  10  11  12  99  100
101   1   2   3   4    5


# Practice

Q1 . Create a data frame from dictionary. Names of the columns are module names (3) and row labels (index) are roll nos (5). Enter data in following order.

Rollno(index)     SQL   Python   AA


101     

109

102

125

110

In [None]:
d= {
    'SQL':[14,15,11,17,16],
    'DA':[34,23,36,25,16],
}
df = pd.DataFrame(d, index = [22,11,7,9,2])
print(df)

    SQL  DA
22   14  34
11   15  23
7    11  36
9    17  25
2    16  16


In [None]:
df['total']=df['SQL']+df['DA']
df.sort_values(by=['total'])

Unnamed: 0,SQL,DA,total
2,16,16,32
11,15,23,38
9,17,25,42
7,11,36,47
22,14,34,48


Q2 Sort all columns by their name

Q3 Print the data frame in a way that all roll nos are sorted

Q4 Print data frame in a way that, topper of python module comes as first row and then second topper and so on.

In [None]:
#Q2 Sort all columns by their name
print(df.sort_index(axis=1))
#Q3 Print the data frame in a way that all roll nos are sorted
print(df.sort_index(axis=0))
#Q4 Print data frame in a way that, topper of python module comes as first row and then second topper and so on.
print(df.sort_values(by='Python', ascending=False))

    DA  SQL  total
22  34   14     48
11  23   15     38
7   36   11     47
9   25   17     42
2   16   16     32
    SQL  DA  total
2    16  16     32
7    11  36     47
9    17  25     42
11   15  23     38
22   14  34     48


KeyError: 'Python'

# Inplace parameter in Pandas functions

when inplace = True , then original data frame is changed. No copy is created / returned

when inplace = False , then copy is created of given data frame and copy is modified and returned

By default inplace = False



Q5 Sort all columns by their name for original dataframe. Dont create a copy.

Q6 Print the data frame in a way that all roll nos are sorted

Q7 Print data frame in a way that, topper of python module comes as first row and then second topper and so on.


In [None]:
df.sort_index(axis=1, inplace =True)
print(df)
df.sort_index(axis=0, inplace =True)
print(df)
df.sort_values(by='Python', ascending=False, inplace = True)
print(df)

     AA  Python  SQL
101  30      34   10
102  32      36   30
109  28      23   20
110  25      16   25
125  16      25   35
     AA  Python  SQL
101  30      34   10
102  32      36   30
109  28      23   20
110  25      16   25
125  16      25   35
     AA  Python  SQL
102  32      36   30
101  30      34   10
125  16      25   35
109  28      23   20
110  25      16   25


# Accessing and Selecting Data from Dataframe

## Select a column

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print("Direct way df['A']")
print(df['A']) # note the data type
print("################")
print("loc : select column by name df.loc[:,'A']")
print(df.loc[:,'A'])
print("################")
print("iloc : select column by index df.iloc[:,0]")
print(df.iloc[:,0])

Direct way df['A']
2023-03-15    1.320009
2023-03-16    0.977240
2023-03-17    1.105996
2023-03-18    1.637962
2023-03-19   -0.594982
2023-03-20   -0.674834
Freq: D, Name: A, dtype: float64
################
loc : select column by name df.loc[:,'A']
2023-03-15    1.320009
2023-03-16    0.977240
2023-03-17    1.105996
2023-03-18    1.637962
2023-03-19   -0.594982
2023-03-20   -0.674834
Freq: D, Name: A, dtype: float64
################
iloc : select column by index df.iloc[:,0]
2023-03-15    1.320009
2023-03-16    0.977240
2023-03-17    1.105996
2023-03-18    1.637962
2023-03-19   -0.594982
2023-03-20   -0.674834
Freq: D, Name: A, dtype: float64


## select a row by index

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("loc: select a single row using label of row df.loc['2023-03-15']")
print(df.loc["2023-03-15"])
print("################")
print("iloc: select a single row using index of row df.iloc[0]")
print(df.iloc[0])

                   A         B         C         D
2023-03-15 -0.425496  1.569852  0.253590  0.389239
2023-03-16  0.135678  0.048230 -0.103572  0.184423
2023-03-17 -0.092790  0.530495  0.736643 -0.259811
2023-03-18 -0.754468 -0.040571  1.659652 -0.993765
2023-03-19  1.120366 -0.739553  1.705381  1.120448
2023-03-20  1.930682 -0.578636 -2.645407  0.768295
loc: select a single row using label of row df.loc['2023-03-15']
A   -0.425496
B    1.569852
C    0.253590
D    0.389239
Name: 2023-03-15 00:00:00, dtype: float64
################
iloc: select a single row using index of row df.iloc[0]
A   -0.425496
B    1.569852
C    0.253590
D    0.389239
Name: 2023-03-15 00:00:00, dtype: float64


# Slicing OR Finding subset of data frame OR Selecting multiple rows / columns

## Select Multiple columns

- by name (label)

- by index



In [None]:

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("Select columns B and C by name df[['B','C']]")
print(df[['B','C']])
print("##############")
print("loc : Select columns B and C by name df.loc[:,['B','C']]")
print(df.loc[:,['B','C']])
print("##############")
print("iloc : Select columns B and C by index (end is excluded) df.iloc[:,1:3]")
print(df.iloc[:,1:3])
print("##############")
print("iloc : Select columns B and D by index (end is excluded) df.iloc[:,[1,3]]")
print(df.iloc[:,[1,3]])

                   A         B         C         D
2023-03-15  1.423625 -0.704873  0.102576  0.024497
2023-03-16  0.576376 -1.792552 -0.339249  1.345366
2023-03-17  0.429096 -0.003459  0.227604 -0.381545
2023-03-18 -2.300929 -0.110601 -0.708734  0.398343
2023-03-19  1.119251  0.776631  1.507313 -3.074306
2023-03-20 -1.354255 -0.773607 -0.427368 -0.893150
Select columns B and C by name df[['B','C']]
                   B         C
2023-03-15 -0.704873  0.102576
2023-03-16 -1.792552 -0.339249
2023-03-17 -0.003459  0.227604
2023-03-18 -0.110601 -0.708734
2023-03-19  0.776631  1.507313
2023-03-20 -0.773607 -0.427368
##############
loc : Select columns B and C by name df.loc[:,['B','C']]
                   B         C
2023-03-15 -0.704873  0.102576
2023-03-16 -1.792552 -0.339249
2023-03-17 -0.003459  0.227604
2023-03-18 -0.110601 -0.708734
2023-03-19  0.776631  1.507313
2023-03-20 -0.773607 -0.427368
##############
iloc : Select columns B and C by index (end is excluded) df.iloc[:,1:3]
     

## Select rows

- by index

- by label

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("Selecting by label df['20230315':'20230318'] end is included")
print(df["20230315":"20230318"])
print("##############")
print("loc: Selecting by label df.loc['20230315':'20230318', ] end is included")
print(df.loc["20230315":"20230318",])
print("##############")
print("Selecting by index df[0:3] end is excluded")
print(df[0:3])
print("##############")
print("iloc: Selecting by index df.iloc[0:3,] end is excluded")
print(df.iloc[0:3,])

                   A         B         C         D
2023-03-15  1.685728  0.831602 -0.117946  1.436211
2023-03-16  2.165238 -1.415170  0.713006 -0.543027
2023-03-17  1.310325  1.030990 -0.092548  0.517207
2023-03-18  1.729686 -0.642621  0.202537  0.014272
2023-03-19  0.315413  0.911041  2.385910 -1.706535
2023-03-20  0.761118  0.015818  0.122562 -0.253136
Selecting by label df['20230315':'20230318'] end is included
                   A         B         C         D
2023-03-15  1.685728  0.831602 -0.117946  1.436211
2023-03-16  2.165238 -1.415170  0.713006 -0.543027
2023-03-17  1.310325  1.030990 -0.092548  0.517207
2023-03-18  1.729686 -0.642621  0.202537  0.014272
##############
loc: Selecting by label df.loc['20230315':'20230318', ] end is included
                   A         B         C         D
2023-03-15  1.685728  0.831602 -0.117946  1.436211
2023-03-16  2.165238 -1.415170  0.713006 -0.543027
2023-03-17  1.310325  1.030990 -0.092548  0.517207
2023-03-18  1.729686 -0.642621  0.20

# loc

construct for slicing DataFrame

this construct allows to access slice / part of the dataframe based on labels of rows or columns

row labels are user defined index and column labels are column names

labels are always strings

While slicing using loc, end is included

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("##############")
print("Select rows from date 15-03-2023 to 18-03-2023 \
and columns A & B df.loc['20230315':'20230318', ['A', 'B']]")

print(df.loc["20230315":"20230318", ["A", "B"]])

print("##############")

print("Access Single cell (return scalar value) using loc df.loc[dates[0], 'A']")
print(df.loc[dates[0], 'A'])

                   A         B         C         D
2023-03-15  0.536809  1.351359 -0.434289 -0.567217
2023-03-16 -0.606667 -0.897744  0.004610 -0.387571
2023-03-17 -0.316901 -0.174297 -0.133275  0.305414
2023-03-18 -0.345993 -0.263366  0.966780 -1.042862
2023-03-19 -2.379784  2.340416  1.342577  0.444983
2023-03-20  0.597469  0.632389  0.510123  2.113037
##############
Select rows from date 15-03-2023 to 18-03-2023 and columns A & B df.loc['20230315':'20230318', ['A', 'B']]
                   A         B
2023-03-15  0.536809  1.351359
2023-03-16 -0.606667 -0.897744
2023-03-17 -0.316901 -0.174297
2023-03-18 -0.345993 -0.263366
##############
Access Single cell (return scalar value) using loc df.loc[dates[0], 'A']
0.5368087574965121


# iloc

construct for slicing DataFrame

Selection by index (position)

uses rows number(always start from 0) and column number(always start from 0)

This returns a data frame / series

End is excluded

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("##############")
print(df.iloc[3:5, 0:2])

                   A         B         C         D
2023-03-15  0.518655 -1.692986 -1.567214 -1.074259
2023-03-16  0.331447 -0.203882 -0.760320  1.068118
2023-03-17  2.652918 -1.002382  0.889460  0.817000
2023-03-18  0.010814  0.095427  0.177513  0.490402
2023-03-19  2.812125 -0.062186  0.153920  0.228175
2023-03-20  1.648320  0.275245 -0.291960  0.370341
##############
                   A         B
2023-03-18  0.010814  0.095427
2023-03-19  2.812125 -0.062186



# Access single cell in a efficient way

## at

returns a single value based on label

## iat

returns a single value based on index

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("##############")
print(df.at['2023-03-15', 'A'])
print("##############")
print(df.iat[0, 0])

                   A         B         C         D
2023-03-15  1.940784 -0.670069 -1.212297  1.656952
2023-03-16  1.497002 -1.514099  1.154060 -0.809661
2023-03-17 -0.454865  0.591773  1.482441  0.106005
2023-03-18 -0.598530 -0.501846 -0.816558  0.640827
2023-03-19 -2.064065 -1.651633 -1.327569  0.052052
2023-03-20 -0.285122  0.589789 -0.344147  0.955206
##############
1.9407839453929223
##############
1.9407839453929223


# Practice

On previously used dataframe perform slicing operations

Q1. Select rows of last 3 students  
- use loc
- use iloc

Q2. Select python marks of roll no 101 and 110
- use loc
- use iloc

Q3. Print the row of student who is 3rd highest in AA



In [None]:
d= {
    'SQL':[10,20,30,35,25],
    'Python':[34,23,36,25,16],
    'AA':[30,28,32,16,25],
}
df = pd.DataFrame(d, index = [101,109,102,125,110])
print(df)
print("#############")
df.sort_values(by='AA',ascending=False,inplace=True)
print(df)
print("#############")
print(df.iloc[2])
print("#############")
print(df.sort_values(by='AA',ascending=False).iloc[2])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
     SQL  Python  AA
102   30      36  32
101   10      34  30
109   20      23  28
110   25      16  25
125   35      25  16
#############
SQL       20
Python    23
AA        28
Name: 109, dtype: int64
#############
SQL       20
Python    23
AA        28
Name: 109, dtype: int64


# Create shallow copy of data frame


In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(id(df))

df2 = df.copy()
print(id(df2))

138551301752592
138551303760848


# Add new column
## Direct assignment


In [None]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
df["E"] = ["one", "one", "two",
           "three", "four", "three",
           ]
print(df)
print("############")
df['F']=10 # Works
print(df)
print("############")
df['G']=[10] # error
print(df)

                   A         B         C         D
2023-03-15 -0.315836 -0.997688 -1.598943 -0.821370
2023-03-16 -1.149755 -0.743261 -0.194141  1.488230
2023-03-17 -0.360846 -1.297747  1.100502  2.192956
2023-03-18  0.305299  0.441104 -0.168190  0.084253
2023-03-19 -0.220179 -1.591807 -1.523124  0.383831
2023-03-20 -0.053104  2.180864 -1.588970 -1.173061
############
                   A         B         C         D      E
2023-03-15 -0.315836 -0.997688 -1.598943 -0.821370    one
2023-03-16 -1.149755 -0.743261 -0.194141  1.488230    one
2023-03-17 -0.360846 -1.297747  1.100502  2.192956    two
2023-03-18  0.305299  0.441104 -0.168190  0.084253  three
2023-03-19 -0.220179 -1.591807 -1.523124  0.383831   four
2023-03-20 -0.053104  2.180864 -1.588970 -1.173061  three
############
                   A         B         C         D      E   F
2023-03-15 -0.315836 -0.997688 -1.598943 -0.821370    one  10
2023-03-16 -1.149755 -0.743261 -0.194141  1.488230    one  10
2023-03-17 -0.360846 -1.2

ValueError: Length of values (1) does not match length of index (6)

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
#Using Series object with index
s1 = pd.Series([1, 2, 3, 4, 5, 6],
               index=pd.date_range("20230316", periods=6))
print(s1)
print("############")
df["Z"] = s1
print(df)

                   A         B         C         D
2023-03-15  1.332999  0.107905 -2.612704 -0.906423
2023-03-16  1.481725 -0.761254 -1.615807 -0.332888
2023-03-17  0.025721 -0.987778  1.290840 -1.968789
2023-03-18  0.804074 -0.376350 -0.082986  0.005882
2023-03-19  0.397967  0.833019  0.645963  0.857927
2023-03-20 -1.125494  0.198684  0.087305 -0.362865
############
2023-03-16    1
2023-03-17    2
2023-03-18    3
2023-03-19    4
2023-03-20    5
2023-03-21    6
Freq: D, dtype: int64
############
                   A         B         C         D    Z
2023-03-15  1.332999  0.107905 -2.612704 -0.906423  NaN
2023-03-16  1.481725 -0.761254 -1.615807 -0.332888  1.0
2023-03-17  0.025721 -0.987778  1.290840 -1.968789  2.0
2023-03-18  0.804074 -0.376350 -0.082986  0.005882  3.0
2023-03-19  0.397967  0.833019  0.645963  0.857927  4.0
2023-03-20 -1.125494  0.198684  0.087305 -0.362865  5.0


# Boolean Indexing (Filtering)

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
print(df[df["A"] > 0])
print("############")
print(df[[True,True,False,False,True,True]])

                   A         B         C         D
2023-03-15 -0.964428 -0.801885 -0.906518  0.092412
2023-03-16  0.166966 -1.346376  1.310914 -0.703432
2023-03-17 -0.480140 -1.939386  0.672398  1.614126
2023-03-18 -1.666450 -0.370255  1.457855  0.389111
2023-03-19 -0.115726  0.657977 -2.450609 -0.102892
2023-03-20 -0.107501  1.158994 -0.778654  0.099406
############
                   A         B         C         D
2023-03-16  0.166966 -1.346376  1.310914 -0.703432
############
                   A         B         C         D
2023-03-15 -0.964428 -0.801885 -0.906518  0.092412
2023-03-16  0.166966 -1.346376  1.310914 -0.703432
2023-03-19 -0.115726  0.657977 -2.450609 -0.102892
2023-03-20 -0.107501  1.158994 -0.778654  0.099406


In [None]:

print(df["A"] > 0)

2023-03-15    False
2023-03-16     True
2023-03-17    False
2023-03-18    False
2023-03-19    False
2023-03-20    False
Freq: D, Name: A, dtype: bool


# Select Cells by condition

- Cells which fulfill condition are returned as it is
- Cells which don't fulfill condition are given value NaN

In [None]:
print(df > 0)

                A      B      C      D
2023-03-15  False  False  False   True
2023-03-16   True  False   True  False
2023-03-17  False  False   True   True
2023-03-18  False  False   True   True
2023-03-19  False   True  False  False
2023-03-20  False   True  False   True


In [None]:
print(df[df > 0])

                   A         B         C         D
2023-03-15       NaN       NaN       NaN  0.092412
2023-03-16  0.166966       NaN  1.310914       NaN
2023-03-17       NaN       NaN  0.672398  1.614126
2023-03-18       NaN       NaN  1.457855  0.389111
2023-03-19       NaN  0.657977       NaN       NaN
2023-03-20       NaN  1.158994       NaN  0.099406


# Practice selecting using condition ( Boolean Indexing)

In [None]:
d= {
    'SQL':[10,20,30,35,25],
    'Python':[34,23,36,25,16],
    'AA':[30,28,32,16,25],
}
df = pd.DataFrame(d, index = [101,109,102,125,110])
print(df)
print("#############")
print("Select all students who have passed in SQL")
print(df[df['SQL'] > 16])
print("#############")
print("Select all students who have passed in SQL and Python")
#print(df[(df['SQL'] > 16) and (df['Python'] > 16)]) # Error
print(df[(df['SQL'] > 16) & (df['Python'] > 16)])
print("#############")
print("Select all students who have failed in SQL or AA")
#print(df[(df['SQL'] < 16) or (df['AA'] < 16)]) # Error
print(df[(df['SQL'] < 16) | (df['AA'] < 16)])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL and Python
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
#############
Select all students who have failed in SQL or AA
     SQL  Python  AA
101   10      34  30


In [None]:
print("Select all students who have passed in SQL and Python and AA")
print(df[(df['SQL']>16 ) & (df['Python']>16) & (df['AA']>16)])

Select all students who have passed in SQL and Python and AA
     SQL  Python  AA
109   20      23  28
102   30      36  32


## Using loc

In [None]:
d= {
    'SQL':[10,20,30,35,25],
    'Python':[34,23,36,25,16],
    'AA':[30,28,32,16,25],
}
df = pd.DataFrame(d, index = [101,109,102,125,110])
print(df)
print("#############")
print("Select all students who have passed in SQL")
print(df.loc[df['SQL'] > 16])
print("#############")
print("Select all students who have passed in SQL and Python")
#print(df[(df['SQL'] > 16) and (df['Python'] > 16)]) # Error
print(df.loc[(df['SQL'] > 16) & (df['Python'] > 16)])
print("#############")
print("Select all students who have failed in SQL or AA")
#print(df[(df['SQL'] < 16) or (df['AA'] < 16)]) # Error
print(df.loc[(df['SQL'] < 16) | (df['AA'] < 16)])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL and Python
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
#############
Select all students who have failed in SQL or AA
     SQL  Python  AA
101   10      34  30


In [None]:
d= {
    'SQL':[10,20,30,35,25],
    'Python':[34,23,36,25,16],
    'AA':[30,28,32,16,25],
}
df = pd.DataFrame(d, index = [101,109,102,125,110])
print(df)
print("#############")
print("Select all students who have failed in SQL or AA, display ONLY SQL and AA marks")
print(df.loc[(df['SQL'] < 16) | (df['AA'] < 16), ['SQL', 'AA'] ])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have failed in SQL or AA, display ONLY SQL and AA marks
     SQL  AA
101   10  30


# Filetering using multiple values

In [None]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
df["E"] = ["one", "one", "two",
           "three", "four", "three",
           ]
print(df)
print("############")
print(df[df["E"].isin(["two", "four"])])

                   A         B         C         D
2023-03-15  0.454967  0.762087 -0.906509 -0.800184
2023-03-16  0.114119 -0.153369 -1.844797 -1.125623
2023-03-17  1.494045 -0.397963 -0.033467 -0.983249
2023-03-18  0.867955 -2.105086 -1.344394 -0.142225
2023-03-19  0.360870 -3.018773 -0.625548  1.879637
2023-03-20 -0.610614 -0.301414 -1.152072  0.228265
############
                   A         B         C         D      E
2023-03-15  0.454967  0.762087 -0.906509 -0.800184    one
2023-03-16  0.114119 -0.153369 -1.844797 -1.125623    one
2023-03-17  1.494045 -0.397963 -0.033467 -0.983249    two
2023-03-18  0.867955 -2.105086 -1.344394 -0.142225  three
2023-03-19  0.360870 -3.018773 -0.625548  1.879637   four
2023-03-20 -0.610614 -0.301414 -1.152072  0.228265  three
############
                   A         B         C         D     E
2023-03-17  1.494045 -0.397963 -0.033467 -0.983249   two
2023-03-19  0.360870 -3.018773 -0.625548  1.879637  four


In [None]:
df["E"].isin(["two", "four"])

2023-03-15    False
2023-03-16    False
2023-03-17     True
2023-03-18    False
2023-03-19     True
2023-03-20    False
Freq: D, Name: E, dtype: bool

In [None]:
("two", "four") in df["E"]

InvalidIndexError: ('two', 'four')

# Update single cell

###using label -> date and column "A"

In [None]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")

df.at['2023-03-15', "A"] = 0

print(df)

#using index -> first row and third column
df.iat[0, 2] = 0

print(df)

# Using Numpy array
df.loc[:, "D"] = np.array([5] * len(df))

print(df)

# Update using condition
df2 = df.copy()
df2['E']=10
df2[df2 <= 0] = -df2

print(df2)



                   A         B         C         D
2023-03-15  0.239802 -0.580218  0.671724 -0.191127
2023-03-16 -0.462368  0.476623  0.201328 -0.209788
2023-03-17 -1.512886 -0.505985  1.449243  0.065673
2023-03-18  0.101725  1.642766 -1.386160  0.627465
2023-03-19  0.025715 -0.303390 -1.227474 -0.478679
2023-03-20  0.283054 -0.337952  0.472980  0.358749
############
                   A         B         C         D
2023-03-15  0.000000 -0.580218  0.671724 -0.191127
2023-03-16 -0.462368  0.476623  0.201328 -0.209788
2023-03-17 -1.512886 -0.505985  1.449243  0.065673
2023-03-18  0.101725  1.642766 -1.386160  0.627465
2023-03-19  0.025715 -0.303390 -1.227474 -0.478679
2023-03-20  0.283054 -0.337952  0.472980  0.358749
                   A         B         C         D
2023-03-15  0.000000 -0.580218  0.000000 -0.191127
2023-03-16 -0.462368  0.476623  0.201328 -0.209788
2023-03-17 -1.512886 -0.505985  1.449243  0.065673
2023-03-18  0.101725  1.642766 -1.386160  0.627465
2023-03-19  0.0257

# Mean Median Mode of All columns



### When all columns are numbers ( continuous)

In [None]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
print("Mean")
print(df.mean())
print("############")
print("Median")
print(df.median())

                   A         B         C         D
2023-03-15  0.936454 -0.522897 -0.662565  0.491822
2023-03-16  0.201702 -0.144814  1.171709 -0.465899
2023-03-17 -1.976228 -0.442338  2.584648 -1.740182
2023-03-18 -0.381005  0.611932  0.944455  1.856276
2023-03-19  0.460962  0.717926 -3.191161 -0.123803
2023-03-20  0.074121  0.363126 -1.365758  2.240260
############
Mean
A   -0.113999
B    0.097156
C   -0.086445
D    0.376412
dtype: float64
############
Median
A    0.137911
B    0.109156
C    0.140945
D    0.184010
dtype: float64


### When there is mix of categorical (String) and continuous ( number) columns

In [None]:
df1 = pd.DataFrame()
df1['cat1'] = ['A','B','B','B','C','D']
df1['cat2'] = [1,1,2,2,3,4]
df1['rno'] = [1,2,3,4,5,6]
df1['marks']= [23,34,39,16,10,25]
print(df1)
print("############")
print("Mean")
print(df1.iloc[:,2:].mean())
print("############")
print("Median")
print(df1.iloc[:,2:].median())
print("############")
print("Mode :: May return multiple values")
print(df1.mode())

  cat1  cat2  rno  marks
0    A     1    1     23
1    B     1    2     34
2    B     2    3     39
3    B     2    4     16
4    C     3    5     10
5    D     4    6     25
############
Mean
rno       3.5
marks    24.5
dtype: float64
############
Median
rno       3.5
marks    24.0
dtype: float64
############
Mode :: May return multiple values
  cat1  cat2  rno  marks
0    B   1.0    1     10
1  NaN   2.0    2     16
2  NaN   NaN    3     23
3  NaN   NaN    4     25
4  NaN   NaN    5     34
5  NaN   NaN    6     39
