# Pandas and Pandera

In [2]:
import pandas as pd

In [3]:
%pip install pandera

Note: you may need to restart the kernel to use updated packages.


# Pandas Terminology
* Data Frame, in Excel called Sheet, is consist of colum and row series.
* A Series is a collection of values, i.e. A single column or a single row is called Series.
* Title of a Series is called Column Index and Row Index, as the case may be.
## Pandas Core Components
* Series Types
* Data Frame Types

In [11]:
li:list[int] = [1, 2, 3, 4, 5] # It has no index numbers visible
li

[1, 2, 3, 4, 5]

In [4]:
import pandas as pd
import pandera as pa
s1: pd.Series = pd.Series([1, 2, 3, 4, 5]) # Series creation from a list
s1 # it has index numbers visible

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
import pandas as pd
import pandera as pa
s1: pd.Series = pd.Series({1, 2, 3, 4, 5}) # Series creation from a set
s1

TypeError: 'set' type is unordered

In [6]:
import pandas as pd
import pandera as pa
s1: pd.Series = pd.Series((1, 2, 3, 4, 5)) # series creation from a tuple
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [7]:
import pandas as pd
import pandera as pa
s1: pd.Series = pd.Series({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5})
s1 # series creation from a dictionary

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [8]:
# How to make index of our own choice
values: list[int] = [1, 2, 3, 4, 5]
desired_index: list[str] = ['a', 'b', 'c', 'd', 'e']
s1: pd.Series = pd.Series(values, index=desired_index)
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [9]:
# How to make multiple index of our own choice
values: list[int] = [1, 2, 3, 4, 5]
desired_index: list[list[str]] = [['a1', 'a1', 'a1', 'b1', 'b1'], ['a', 'b', 'c', 'd', 'e']]
s1: pd.Series = pd.Series(values, index=desired_index)
s1

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
dtype: int64

In [10]:
# How to apply extra parameters
values: list[int] = [1, 2, 3, 4, 5]
desired_index: list[list[str]] = [['a1', 'a1', 'a1', 'b1', 'b1'], ['a', 'b', 'c', 'd', 'e']]
s1: pd.Series = pd.Series(values, index=desired_index, name="Group Data", dtype="int32")
s1

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
Name: Group Data, dtype: int32

# Data Frame

In [21]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5])
s2: pd.Series = pd.Series([10, 20, 30, 40, 50])
s3: pd.Series = pd.Series(['Muhammad', 'Ali', 'Fatima', 'Hassan', 'Hussain'])
df: pd.DataFrame = pd.DataFrame({"Id": s1, "Student Name": s3, "Score": s2})
df

Unnamed: 0,Id,Student Name,Score
0,1,Muhammad,10
1,2,Ali,20
2,3,Fatima,30
3,4,Hassan,40
4,5,Hussain,50


In [22]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], name="Id")
s2: pd.Series = pd.Series([10, 20, 30, 40, 50], name="Score")
s3: pd.Series = pd.Series(['Muhammad', 'Ali', 'Fatima', 'Hassan', 'Hussain'], name="Student Name")
df: pd.DataFrame = pd.DataFrame({"Id": s1, "Student Name": s3, "Score": s2})
df

Unnamed: 0,Id,Student Name,Score
0,1,Muhammad,10
1,2,Ali,20
2,3,Fatima,30
3,4,Hassan,40
4,5,Hussain,50


In [26]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], name="Id")
s2: pd.Series = pd.Series([10, 20, 30, 40, 50], name="Score")
s3: pd.Series = pd.Series(['Muhammad', 'Ali', 'Fatima', 'Hassan', 'Hussain'], name="Student Name")
df: pd.DataFrame = pd.concat([s1, s3, s2], axis=1)
df

Unnamed: 0,Id,Student Name,Score
0,1,Muhammad,10
1,2,Ali,20
2,3,Fatima,30
3,4,Hassan,40
4,5,Hussain,50


In [27]:
data: list[list[int]] = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
df: pd.DataFrame = pd.DataFrame(data)
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [28]:
data: list[list[int]] = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
df: pd.DataFrame = pd.DataFrame(data, columns = ['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [29]:
data: list[list[int]] = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
df: pd.DataFrame = pd.DataFrame(data, columns = ['A', 'B', 'C'], index = ['x', 'y', 'z'])
df

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9


In [30]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [31]:
df.index

Index(['x', 'y', 'z'], dtype='object')

In [32]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

In [33]:
from nptyping import NDArray, Shape, Int64
import numpy as np
data: NDArray[Shape["10, 10"], Int64] = np.arange(10*10).reshape(10, 10)
data

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [34]:
from nptyping import NDArray, Shape, Int64
import numpy as np
data: NDArray[Shape["10, 10"], Int64] = np.arange(10*10).reshape(10, 10)
df: pd.DataFrame = pd.DataFrame(data, columns=list("ABCDEFGHIJ"), index=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
1,0,1,2,3,4,5,6,7,8,9
2,10,11,12,13,14,15,16,17,18,19
3,20,21,22,23,24,25,26,27,28,29
4,30,31,32,33,34,35,36,37,38,39
5,40,41,42,43,44,45,46,47,48,49
6,50,51,52,53,54,55,56,57,58,59
7,60,61,62,63,64,65,66,67,68,69
8,70,71,72,73,74,75,76,77,78,79
9,80,81,82,83,84,85,86,87,88,89
10,90,91,92,93,94,95,96,97,98,99


In [35]:
dfl: list[pd.DataFrame] = pd.read_html("http://www.w3schools.com/python/python_operators.asp")
dfl

[  Operator            Name Example    Try it
 0        +        Addition   x + y  Try it »
 1        -     Subtraction   x - y  Try it »
 2        *  Multiplication   x * y  Try it »
 3        /        Division   x / y  Try it »
 4        %         Modulus   x % y  Try it »
 5       **  Exponentiation  x ** y  Try it »
 6       //  Floor division  x // y  Try it »,
    Operator  Example     Same As    Try it
 0         =    x = 5       x = 5  Try it »
 1        +=   x += 3   x = x + 3  Try it »
 2        -=   x -= 3   x = x - 3  Try it »
 3        *=   x *= 3   x = x * 3  Try it »
 4        /=   x /= 3   x = x / 3  Try it »
 5        %=   x %= 3   x = x % 3  Try it »
 6       //=  x //= 3  x = x // 3  Try it »
 7       **=  x **= 3  x = x ** 3  Try it »
 8        &=   x &= 3   x = x & 3  Try it »
 9        |=   x |= 3   x = x | 3  Try it »
 10       ^=   x ^= 3   x = x ^ 3  Try it »
 11      >>=  x >>= 3  x = x >> 3  Try it »
 12      <<=  x <<= 3  x = x << 3  Try it »,
   Operator   

In [36]:
dfl[0]

Unnamed: 0,Operator,Name,Example,Try it
0,+,Addition,x + y,Try it »
1,-,Subtraction,x - y,Try it »
2,*,Multiplication,x * y,Try it »
3,/,Division,x / y,Try it »
4,%,Modulus,x % y,Try it »
5,**,Exponentiation,x ** y,Try it »
6,//,Floor division,x // y,Try it »


In [37]:
df: pd.DataFrame = pd.read_json("http://www.w3schools.com/python/pandas/data.js")
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.4
166,60,115,145,310.2
167,75,120,150,320.4


In [38]:
# How to use Pandera
df = pd.DataFrame({
    "column-1": [1, 4, 0, 10, 9], 
    "column-2": [-1.3, -1.4, -2.9, -10.1, -20.4], 
    "column-3": ['value_1', 'value_2', 'value_3', 'value_2', 'value_1'],
})
# define schema
schema = pa.DataFrameSchema({
    "column-1": pa.Column(int, checks = pa.Check.le(10)),
    "column-2": pa.Column(float, checks = pa.Check.lt(-1.2)),
    "column-3": pa.Column(str, checks = [
        pa.Check.str_startswith('value_'),
        pa.Check(lambda s: s.str.split("_", expand = True).shape[1] == 2),
        ])
})
validated_df = schema(df)
print(validated_df)

   column-1  column-2 column-3
0         1      -1.3  value_1
1         4      -1.4  value_2
2         0      -2.9  value_3
3        10     -10.1  value_2
4         9     -20.4  value_1


# slicing and indexing
* series_variable[index]
* DataFrame
    * loc
    * iloc
    * at
    * iat

In [39]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5])
display(s1)
# Applying indexing
display(s1[3])


0    1
1    2
2    3
3    4
4    5
dtype: int64

4

In [40]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5])
display(s1)
# Applying slicing
display(s1[1:4])


0    1
1    2
2    3
3    4
4    5
dtype: int64

1    2
2    3
3    4
dtype: int64

In [41]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], index = ['a', 'b', 'c', 'd', 'e'])
display(s1)
# Applying slicing
display(s1.iloc[1:4]) # index location (numbers only) same as numpy slicing

a    1
b    2
c    3
d    4
e    5
dtype: int64

b    2
c    3
d    4
dtype: int64

In [42]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], index = ['a', 'b', 'c', 'd', 'e'])
display(s1)
# Applying slicing
display(s1.loc['b':'d']) # index location (label) and end included

a    1
b    2
c    3
d    4
e    5
dtype: int64

b    2
c    3
d    4
dtype: int64

In [43]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], index = ['a', 'b', 'c', 'd', 'e'])
display(s1)
# Applying slicing
display(s1.iat[1]) # index location (numbers only) at particular index

a    1
b    2
c    3
d    4
e    5
dtype: int64

2

In [44]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], index = ['a', 'b', 'c', 'd', 'e'])
display(s1)
# Applying slicing
display(s1.at['b']) # index location (label) at a particular index

a    1
b    2
c    3
d    4
e    5
dtype: int64

2

## Manipulation with Data
* Add a new column in DataFrame
    * Single column
    * Multiple columns
* Delate an existing column in DataFrame
* Change Data Type of a column in DataFrame
* Map Function
* Apply Function
    * Single column
    * Multiple columns
* Concat
    * Axis
        * Axis = 0 (default) Top to Bottom
        * Axis = 1 Left to Right
* df.info()
* df.describe()
* df.head()
* df.tail()
* df.samples()

In [45]:
import pandas as pd
import pandera as pa
import numpy as np
from nptyping import NDArray, Shape, Int64
total: int = 10000
s1: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(80, 100, (total, 5), dtype=np.int64)
ss1: pd.DataFrame = pd.DataFrame(s1, columns=['s1', 's2','s3','s4','s5'])
s2: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(70, 79, (total, 5), dtype=np.int64)
ss2: pd.DataFrame = pd.DataFrame(s2, columns=['s1', 's2','s3','s4','s5'])
s3: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(60, 69, (total, 5), dtype=np.int64)
ss3: pd.DataFrame = pd.DataFrame(s3, columns=['s1', 's2','s3','s4','s5'])
s4: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(50, 59, (total, 5), dtype=np.int64)
ss4: pd.DataFrame = pd.DataFrame(s4, columns=['s1', 's2','s3','s4','s5'])
s5: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(40, 49, (total, 5), dtype=np.int64)
ss5: pd.DataFrame = pd.DataFrame(s5, columns=['s1', 's2','s3','s4','s5'])
s6: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(33, 39, (total, 5), dtype=np.int64)
ss6: pd.DataFrame = pd.DataFrame(s6, columns=['s1', 's2','s3','s4','s5'])
s7: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(0, 32, (total, 5), dtype=np.int64)
ss7: pd.DataFrame = pd.DataFrame(s7, columns=['s1', 's2','s3','s4','s5'])
display(ss1, ss2, ss3, ss4, ss5, ss6, ss7)

Unnamed: 0,s1,s2,s3,s4,s5
0,85,98,97,97,95
1,96,97,89,87,95
2,95,82,85,93,84
3,99,83,93,97,92
4,91,89,81,95,81
...,...,...,...,...,...
9995,99,88,95,91,99
9996,83,92,81,93,92
9997,80,80,84,82,95
9998,86,90,98,80,83


Unnamed: 0,s1,s2,s3,s4,s5
0,71,78,71,78,76
1,77,73,75,78,73
2,73,71,77,75,73
3,77,73,73,70,76
4,78,70,73,73,74
...,...,...,...,...,...
9995,78,71,73,71,71
9996,74,76,71,71,70
9997,73,77,75,73,75
9998,76,73,77,70,78


Unnamed: 0,s1,s2,s3,s4,s5
0,66,68,66,64,62
1,64,66,68,66,67
2,65,66,64,65,67
3,65,62,65,62,64
4,67,62,66,61,61
...,...,...,...,...,...
9995,67,61,62,67,61
9996,68,68,63,65,68
9997,65,68,62,60,64
9998,65,66,67,63,60


Unnamed: 0,s1,s2,s3,s4,s5
0,55,51,57,51,56
1,57,55,54,50,51
2,55,54,53,54,51
3,52,50,56,55,54
4,53,56,55,57,58
...,...,...,...,...,...
9995,53,58,50,55,56
9996,57,55,55,54,56
9997,57,55,53,56,55
9998,53,53,50,53,52


Unnamed: 0,s1,s2,s3,s4,s5
0,45,41,47,47,48
1,40,40,48,44,44
2,47,45,47,46,44
3,44,45,45,48,48
4,46,43,41,45,42
...,...,...,...,...,...
9995,40,47,45,46,43
9996,47,48,41,42,47
9997,43,41,41,48,43
9998,43,40,40,40,40


Unnamed: 0,s1,s2,s3,s4,s5
0,35,37,37,37,36
1,35,37,34,33,38
2,38,35,34,33,37
3,37,36,34,36,33
4,36,33,38,37,35
...,...,...,...,...,...
9995,35,35,35,33,33
9996,38,37,37,34,34
9997,37,36,36,36,37
9998,38,33,36,33,34


Unnamed: 0,s1,s2,s3,s4,s5
0,24,10,28,8,21
1,8,25,3,4,19
2,5,0,15,22,30
3,17,9,5,27,17
4,31,14,28,2,18
...,...,...,...,...,...
9995,3,19,14,18,11
9996,21,14,24,20,19
9997,27,0,9,17,26
9998,26,16,7,29,2


In [46]:
df: pd.DataFrame = pd.concat([ss1, ss2, ss3, ss4, ss5, ss6, ss7]).reset_index(drop=True)
display(df.info())
print("=================================================================================")
display(df.describe())
print("=================================================================================")
display(df.head())
print("=================================================================================")
display(df.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   s1      70000 non-null  int64
 1   s2      70000 non-null  int64
 2   s3      70000 non-null  int64
 3   s4      70000 non-null  int64
 4   s5      70000 non-null  int64
dtypes: int64(5)
memory usage: 2.7 MB


None



Unnamed: 0,s1,s2,s3,s4,s5
count,70000.0,70000.0,70000.0,70000.0,70000.0
mean,53.785829,53.774371,53.792743,53.795271,53.765029
std,23.394074,23.387239,23.423266,23.404295,23.428247
min,0.0,0.0,0.0,0.0,0.0
25%,37.0,37.0,37.0,37.0,37.0
50%,54.0,54.0,54.0,54.0,54.0
75%,72.0,72.0,72.0,72.0,72.0
max,99.0,99.0,99.0,99.0,99.0




Unnamed: 0,s1,s2,s3,s4,s5
0,85,98,97,97,95
1,96,97,89,87,95
2,95,82,85,93,84
3,99,83,93,97,92
4,91,89,81,95,81




Unnamed: 0,s1,s2,s3,s4,s5
69995,3,19,14,18,11
69996,21,14,24,20,19
69997,27,0,9,17,26
69998,26,16,7,29,2
69999,0,5,9,23,31


In [47]:
df['s1']

0        85
1        96
2        95
3        99
4        91
         ..
69995     3
69996    21
69997    27
69998    26
69999     0
Name: s1, Length: 70000, dtype: int64

In [48]:
df.s1 # cannot be used to add a new column

0        85
1        96
2        95
3        99
4        91
         ..
69995     3
69996    21
69997    27
69998    26
69999     0
Name: s1, Length: 70000, dtype: int64

In [49]:
df[['s1', 's3', 's5']]

Unnamed: 0,s1,s3,s5
0,85,97,95
1,96,89,95
2,95,85,84
3,99,93,92
4,91,81,81
...,...,...,...
69995,3,14,11
69996,21,24,19
69997,27,9,26
69998,26,7,2


In [50]:
df.head()

Unnamed: 0,s1,s2,s3,s4,s5
0,85,98,97,97,95
1,96,97,89,87,95
2,95,82,85,93,84
3,99,83,93,97,92
4,91,89,81,95,81


In [51]:
df['Total'] = 500
df.head()

Unnamed: 0,s1,s2,s3,s4,s5,Total
0,85,98,97,97,95,500
1,96,97,89,87,95,500
2,95,82,85,93,84,500
3,99,83,93,97,92,500
4,91,89,81,95,81,500


In [53]:
df['Total'] = 500 # add a new column to the dataframe
df['Total'] = 300 # update a column in the dataframe
del df['Total'] # delete a column from the dataframe
df

Unnamed: 0,s1,s2,s3,s4,s5
0,85,98,97,97,95
1,96,97,89,87,95
2,95,82,85,93,84
3,99,83,93,97,92
4,91,89,81,95,81
...,...,...,...,...,...
69995,3,19,14,18,11
69996,21,14,24,20,19
69997,27,0,9,17,26
69998,26,16,7,29,2


In [54]:
df['Marks Obtained'] = df['s1'] + df['s2'] + df['s3'] + df['s4'] + df['s5']
df['Total Marks'] = 500
df['Percentage'] = (df['Marks Obtained'] / df['Total Marks']) * 100
df.head()

Unnamed: 0,s1,s2,s3,s4,s5,Marks Obtained,Total Marks,Percentage
0,85,98,97,97,95,472,500,94.4
1,96,97,89,87,95,464,500,92.8
2,95,82,85,93,84,439,500,87.8
3,99,83,93,97,92,464,500,92.8
4,91,89,81,95,81,437,500,87.4


In [55]:
def grade(per: float) -> str:
    if per >= 85:
        return "A+"
    elif per >= 75:
        return "A"
    elif per >= 60:
        return "B"
    elif per >= 50:
        return "C"
    elif per >= 40:
        return "D"
    elif per >= 33:
        return "E"
    else:
        return "F"

In [58]:
df['Marks Obtained'] = df['s1'] + df['s2'] + df['s3'] + df['s4'] + df['s5']
df['Total Marks'] = 500
df['Percentage'] = (df['Marks Obtained'] / df['Total Marks']) * 100
df['Grade'] = df['Percentage'].apply(grade)
# df.sample()
df

Unnamed: 0,s1,s2,s3,s4,s5,Marks Obtained,Total Marks,Percentage,Grade
0,85,98,97,97,95,472,500,94.4,A+
1,96,97,89,87,95,464,500,92.8,A+
2,95,82,85,93,84,439,500,87.8,A+
3,99,83,93,97,92,464,500,92.8,A+
4,91,89,81,95,81,437,500,87.4,A+
...,...,...,...,...,...,...,...,...,...
69995,3,19,14,18,11,65,500,13.0,F
69996,21,14,24,20,19,98,500,19.6,F
69997,27,0,9,17,26,79,500,15.8,F
69998,26,16,7,29,2,80,500,16.0,F


In [61]:
import pandas as pd
import pandera as pa
import numpy as np
from nptyping import NDArray, Shape, Int64
total: int = 10000
s1: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(80, 100, (total, 5), dtype=np.int64)
ss1: pd.DataFrame = pd.DataFrame(s1, columns=['s1', 's2','s3','s4','s5'])
s2: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(70, 79, (total, 5), dtype=np.int64)
ss2: pd.DataFrame = pd.DataFrame(s2, columns=['s1', 's2','s3','s4','s5'])
s3: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(60, 69, (total, 5), dtype=np.int64)
ss3: pd.DataFrame = pd.DataFrame(s3, columns=['s1', 's2','s3','s4','s5'])
s4: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(50, 59, (total, 5), dtype=np.int64)
ss4: pd.DataFrame = pd.DataFrame(s4, columns=['s1', 's2','s3','s4','s5'])
s5: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(40, 49, (total, 5), dtype=np.int64)
ss5: pd.DataFrame = pd.DataFrame(s5, columns=['s1', 's2','s3','s4','s5'])
s6: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(33, 39, (total, 5), dtype=np.int64)
ss6: pd.DataFrame = pd.DataFrame(s6, columns=['s1', 's2','s3','s4','s5'])
s7: NDArray[Shape[str(total)+"5"], Int64] = np.random.randint(0, 32, (total, 5), dtype=np.int64)
ss7: pd.DataFrame = pd.DataFrame(s7, columns=['s1', 's2','s3','s4','s5'])
df: pd.DataFrame = pd.concat([ss1, ss2, ss3, ss4, ss5, ss6, ss7]).reset_index(drop=True)
display(df.head())

Unnamed: 0,s1,s2,s3,s4,s5
0,97,90,95,80,94
1,87,81,93,84,99
2,90,93,88,96,94
3,95,87,89,84,85
4,97,93,89,99,93


In [66]:
# How to apply multiple slicing
df.iloc[0:11, 0:3]

Unnamed: 0,s1,s2,s3
0,97,90,95
1,87,81,93
2,90,93,88
3,95,87,89
4,97,93,89
5,90,96,83
6,84,91,96
7,82,98,84
8,84,81,82
9,96,86,96


In [71]:
df.loc[0:10, 's2':'s4']

Unnamed: 0,s2,s3,s4
0,90,95,80
1,81,93,84
2,93,88,96
3,87,89,84
4,93,89,99
5,96,83,88
6,91,96,96
7,98,84,83
8,81,82,99
9,86,96,98


In [73]:
df.iat[0, 1]

90

In [74]:
df.at[0, 's2']

90

In [116]:
# How to apply function on a single column
def fn(s1: int, s2: int, s3: int, s4: int, s5: int) -> int:
    return s1 + s2 + s3 + s4 + s5
df[['s1', 's2','s3','s4','s5']].apply(fn(s1, s2, s3, s4,s5), axis=1)

TypeError: 'numpy.int64' object is not callable

In [120]:
# How to apply function on a single column
def fn(s1: int, s2: int, s3: int, s4: int, s5: int) -> int:
    return s1 + s2 + s3 + s4 + s5
df['Marks Obtained'] = df[['s1', 's2','s3','s4','s5']].apply(lambda x: fn(*x), axis=1)
df

Unnamed: 0,s1,s2,s3,s4,s5,Marks Obtained
0,97,90,95,80,94,456
1,87,81,93,84,99,444
2,90,93,88,96,94,461
3,95,87,89,84,85,440
4,97,93,89,99,93,471
...,...,...,...,...,...,...
69995,24,6,12,19,22,83
69996,29,12,0,0,22,63
69997,22,29,20,8,6,85
69998,14,18,13,22,14,81


In [133]:
# How to apply function on multiple columns
def fnn(s1: int, s2: int, s3: int, s4: int, s5: int) -> tuple[int, int, float, str]:
    total: int = 500
    obtained: int = s1 + s2 + s3 + s4 + s5
    per: float = obtained / total * 100
    grade: str = ""
    if per >= 85:
        grade = "A+"
    elif per >= 75:
        grade = "A"
    elif per >= 60:
        grade = "B"
    elif per >= 50:
        grade = "C"
    elif per >= 40:
        grade = "D"
    elif per >= 33:
        grade = "E"
    else:
        grade = "F"
    return total, obtained, per, grade
df[['Total Marks', 'Marks Obtained', 'Percentage', 'Grade']] =df[['s1', 's2', 's3', 's4', 's5']].apply(lambda x: fnn(*x), axis=1, result_type='expand')
df

Unnamed: 0,s1,s2,s3,s4,s5,Total Marks,Marks Obtained,Percentage,Grade
0,83,86,95,90,92,500,446,89.2,A+
1,89,83,97,98,95,500,462,92.4,A+
2,85,97,90,96,98,500,466,93.2,A+
3,81,97,80,99,97,500,454,90.8,A+
4,93,90,84,87,87,500,441,88.2,A+
...,...,...,...,...,...,...,...,...,...
69995,4,11,2,0,21,500,38,7.6,F
69996,29,7,22,4,27,500,89,17.8,F
69997,27,23,11,9,17,500,87,17.4,F
69998,10,31,25,10,5,500,81,16.2,F


In [157]:
# How to apply map function
Remarks: dict = {'A+': 'Excellent', 'A': 'Very Good', 'B': 'Good', 'C': 'Fair', 'D': 'Poor', 'E': 'Very Poor', 'F': 'Fail'}
df['Remarks'] = df.Grade.map(Remarks) 
df.sample(10)

Unnamed: 0,s1,s2,s3,s4,s5,Total Marks,Marks Obtained,Percentage,Grade,Remarks
65087,30,18,0,31,9,500,88,17.6,F,Fail
49469,43,46,48,45,42,500,224,44.8,D,Poor
8168,98,84,99,98,88,500,467,93.4,A+,Excellent
26550,64,60,63,65,63,500,315,63.0,B,Good
1619,95,89,85,83,90,500,442,88.4,A+,Excellent
21564,65,65,67,67,62,500,326,65.2,B,Good
64937,15,24,5,27,20,500,91,18.2,F,Fail
1520,80,89,83,82,90,500,424,84.8,A,Very Good
39055,54,58,54,51,50,500,267,53.4,C,Fair
11120,72,76,76,76,76,500,376,75.2,A,Very Good


## Type Casting
* astype
* apply

In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   s1              70000 non-null  int64  
 1   s2              70000 non-null  int64  
 2   s3              70000 non-null  int64  
 3   s4              70000 non-null  int64  
 4   s5              70000 non-null  int64  
 5   Total Marks     70000 non-null  int64  
 6   Marks Obtained  70000 non-null  int64  
 7   Percentage      70000 non-null  float64
 8   Grade           70000 non-null  object 
 9   Remarks         70000 non-null  object 
dtypes: float64(1), int64(7), object(2)
memory usage: 5.3+ MB


In [159]:
df['s1'] = df['s1'].astype(np.int32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   s1              70000 non-null  int32  
 1   s2              70000 non-null  int64  
 2   s3              70000 non-null  int64  
 3   s4              70000 non-null  int64  
 4   s5              70000 non-null  int64  
 5   Total Marks     70000 non-null  int64  
 6   Marks Obtained  70000 non-null  int64  
 7   Percentage      70000 non-null  float64
 8   Grade           70000 non-null  object 
 9   Remarks         70000 non-null  object 
dtypes: float64(1), int32(1), int64(6), object(2)
memory usage: 5.1+ MB


In [161]:
df['s3'] = df['s3'].apply(np.int32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   s1              70000 non-null  int32  
 1   s2              70000 non-null  int64  
 2   s3              70000 non-null  int32  
 3   s4              70000 non-null  int64  
 4   s5              70000 non-null  int64  
 5   Total Marks     70000 non-null  int64  
 6   Marks Obtained  70000 non-null  int64  
 7   Percentage      70000 non-null  float64
 8   Grade           70000 non-null  object 
 9   Remarks         70000 non-null  object 
dtypes: float64(1), int32(2), int64(5), object(2)
memory usage: 4.8+ MB
