# Pandas

In [2]:
import pandas as pd
import pandera as pa

## Pandas Series

In [3]:
# pandas Series using list
s1: pd.Series = pd.Series([1, 2, 3, 4, 5])
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
# Series using Dictionary
s1: pd.Series = pd.Series({"name": "Sarmad", "email": "sarmad@email.com", "age": 20})

s1

name               Sarmad
email    sarmad@email.com
age                    20
dtype: object

In [5]:
# Series using list with index
values: list[int] = [1, 2, 3, 4]
index: list[str] = ["a", "b", "c", "d"]

s1: pd.Series = pd.Series(values, index=index)
s1

a    1
b    2
c    3
d    4
dtype: int64

In [6]:
values: list[str] = list("ABCDEFGH")
series: pd.Series = pd.Series(values, index=[1, 2, 3, 4, 5, 6, 7, 8])

series

1    A
2    B
3    C
4    D
5    E
6    F
7    G
8    H
dtype: object

In [7]:
values: list[int] = [1, 2, 3, 4]
# 2D index list
index: list[list[str]] = [["a", "b", "c", "d"], ["e", "f", "g", "h"]]

s1: pd.Series = pd.Series(values, index=index, name="Simple pandas series")
s1

a  e    1
b  f    2
c  g    3
d  h    4
Name: Simple pandas series, dtype: int64

In [8]:
# Series using a python tuple
series = pd.Series(("Sarmad", "sarmad@gmail.com", 20))
series

0              Sarmad
1    sarmad@gmail.com
2                  20
dtype: object

In [9]:
# Series using a python set
series = pd.Series(list({"Sarmad", "sarmad@gmail.com", 20}))
series

0    sarmad@gmail.com
1                  20
2              Sarmad
dtype: object

## DataFrame

In [40]:
import numpy as np

# dictionary with two series

data = {
    "Student name": pd.Series(
        ["Sarmad", "Hammad", "Ali", "Akmal", "Jawad"], name="Student name"
    ),
    "Student Score": pd.Series(np.random.randint(33, 100, 5), name="Student Score"),
}

# Create a dataframe from the dictionary
df: pd.DataFrame = pd.DataFrame(data)
df

Unnamed: 0,Student name,Student Score
0,Sarmad,97
1,Hammad,86
2,Ali,42
3,Akmal,99
4,Jawad,98


In [45]:
from typing import Literal

Grades = Literal["A+", "A", "B+", "B", "C", "D", "F"]


def get_grade(marks: int) -> Grades:
    """
    Returns the grade based on the marks
    """
    if marks >= 90:
        return "A+"
    elif marks < 90 and marks >= 80:
        return "A"
    elif marks >= 75:
        return "B+"
    elif marks >= 70:
        return "B"
    elif marks >= 65:
        return "C"
    elif marks >= 50:
        return "D"
    else:
        return "F"


df["Grade"] = [get_grade(marks) for marks in df["Student Score"]]

df

Unnamed: 0,Student name,Student Score,Grade
0,Sarmad,97,A+
1,Hammad,86,A
2,Ali,42,F
3,Akmal,99,A+
4,Jawad,98,A+


In [46]:
df["Remarks"] = ["Pass" if num >= 50 else "Fail" for num in df["Student Score"]]

df

Unnamed: 0,Student name,Student Score,Grade,Remarks
0,Sarmad,97,A+,Pass
1,Hammad,86,A,Pass
2,Ali,42,F,Fail
3,Akmal,99,A+,Pass
4,Jawad,98,A+,Pass


In [11]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], name="Student Id")
s2: pd.Series = pd.Series([10, 20, 30, 40, 50], name="Student Score")
s3: pd.Series = pd.Series(
    ["Sarmad", "Hammad", "Ali", "Akmal", "Jawad"], name="Student name"
)

df1: pd.DataFrame = pd.DataFrame({"Student Id": s1, "Score": s2, "Student Name": s3})

df1

Unnamed: 0,Student Id,Score,Student Name
0,1,10,Sarmad
1,2,20,Hammad
2,3,30,Ali
3,4,40,Akmal
4,5,50,Jawad


In [12]:
# read json

# Error due to internet disconnection

df: pd.DataFrame = pd.read_json("https://www.w3schools.com/python/pandas/data.js")
df

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [13]:
# data to validate

df = pd.DataFrame(
    {
        "column1": [1, 4, 0, 10, 9],
        "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
        "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
    }
)

# define schema
schema = pa.DataFrameSchema(
    {
        "column1": pa.Column(int, checks=pa.Check.le(10)),
        "column2": pa.Column(float, checks=pa.Check.lt(-1.2)),
        "column3": pa.Column(
            str,
            checks=[
                pa.Check.str_startswith("value_"),
                # define custom checks as functions that take a series as input and
                # outputs a boolean or boolean Series
                pa.Check(lambda s: s.str.split("_", expand=True).shape[1] == 2),
            ],
        ),
    }
)

validated_df = schema(df)
print(validated_df)

   column1  column2  column3
0        1     -1.3  value_1
1        4     -1.4  value_2
2        0     -2.9  value_3
3       10    -10.1  value_2
4        9    -20.4  value_1


In [14]:
# 2D list data frame

data: list[list[int]] = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

table: pd.DataFrame = pd.DataFrame(data, columns=["A", "B", "C"], index=[1, 2, 3])
table

Unnamed: 0,A,B,C
1,1,2,3
2,4,5,6
3,7,8,9


In [15]:
table.keys

<bound method NDFrame.keys of    A  B  C
1  1  2  3
2  4  5  6
3  7  8  9>

In [16]:
table.columns

Index(['A', 'B', 'C'], dtype='object')

In [17]:
table.index

Index([1, 2, 3], dtype='int64')

In [18]:
table.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

In [47]:
import numpy as np
from nptyping import NDArray, Shape, Int64

arr: NDArray[Shape["10"], Int64] = np.arange(8 * 10).reshape(8, 10)

table: pd.DataFrame = pd.DataFrame(arr, columns=list("ABCDEFGHIJ"))

table

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79


## Regex

In [20]:
chat_data: str = """
21:41:45 From Altaf Hussain to Everyone:
	PIAIC-173738
21:41:51 From Hamza to Everyone:
	PIAIC-201785
21:41:52 From jhon wick to Everyone:
	piaic 223880
21:41:54 From Hina Zargham to Everyone:
	PIAIC101499
21:41:54 From Hatif Humayun to Everyone:
	PIAIC-52822
21:41:54 From Ahmed Siddiqui to Everyone:
	PIAIC123456
21:41:56 From Arif Najmi to Everyone:
	125657
21:42:00 From Rehan Baig to Everyone:
	PIAIC73919
21:42:00 From STONE to Everyone:
	ZAM - 786
21:42:01 From M. Waheed Iqbal (PIAIC_126369) to Everyone:
	PIAIC_126369
21:42:03 From Anonymous to Everyone:
	PIAIC210905
21:42:06 From ABDUL KHALIQ to Everyone:
	PIAIC-604031
21:42:11 From Arshad Siddiqui to Everyone:
	PIAIC120702
21:42:13 From Ali Zar FSD to Everyone:
	PIaic 223972
21:42:13 From Azfar Suhail to Everyone:
	PIAIC218333
21:42:14 From Kamran Ahmed to Everyone:
	PIAIC139495
21:42:18 From Ahmed to Everyone:
	216511
21:42:20 From Ayesha Arshad to Everyone:
	PIAIC-225620
21:42:25 From Kamal Hassan to Everyone:
	PIAIC58320
21:42:29 From Ahmed to Everyone:
	PIAIC-2165111
"""

import re

patterns: str = r"""
(\d{2}:\d{2}:\d{2}) From (.*) to Everyone:
	(PIAIC-? ?\d{5,6})
"""

data: list[list[str]] = re.findall(patterns, chat_data)

data

[('21:41:45', 'Altaf Hussain', 'PIAIC-173738'),
 ('21:41:54', 'Hina Zargham', 'PIAIC101499'),
 ('21:41:54', 'Ahmed Siddiqui', 'PIAIC123456'),
 ('21:42:00', 'Rehan Baig', 'PIAIC73919'),
 ('21:42:03', 'Anonymous', 'PIAIC210905'),
 ('21:42:11', 'Arshad Siddiqui', 'PIAIC120702'),
 ('21:42:13', 'Azfar Suhail', 'PIAIC218333'),
 ('21:42:20', 'Ayesha Arshad', 'PIAIC-225620')]

In [21]:
df: pd.DataFrame = pd.DataFrame(data, columns=["Time", "Name", "Roll Number"])

df

Unnamed: 0,Time,Name,Roll Number
0,21:41:45,Altaf Hussain,PIAIC-173738
1,21:41:54,Hina Zargham,PIAIC101499
2,21:41:54,Ahmed Siddiqui,PIAIC123456
3,21:42:00,Rehan Baig,PIAIC73919
4,21:42:03,Anonymous,PIAIC210905
5,21:42:11,Arshad Siddiqui,PIAIC120702
6,21:42:13,Azfar Suhail,PIAIC218333
7,21:42:20,Ayesha Arshad,PIAIC-225620


In [45]:
# Concatenate

names: pd.Series = pd.Series(
    ["Sarmad", "Hammad", "Akmal", "Nawaz", "Babar"], name="Name"
)
emails: pd.Series = pd.Series(
    [name.lower() + "@gmail.com" for name in names], name="Email"
)
majors: pd.Series = pd.Series(["BSSE", "BSCS", "BSIT", "BSCS", "BSSE"], name="Major")

table: pd.DataFrame = pd.concat([names, emails, majors], axis=1)

table

Unnamed: 0,Name,Email,Major
0,Sarmad,sarmad@gmail.com,BSSE
1,Hammad,hammad@gmail.com,BSCS
2,Akmal,akmal@gmail.com,BSIT
3,Nawaz,nawaz@gmail.com,BSCS
4,Babar,babar@gmail.com,BSSE


## Slicing

In [23]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
display(s1.iloc[1:4])  # index location (numbers) same as numpy

b    2
c    3
d    4
dtype: int64

In [24]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
display(s1.loc["a":"d"])  # index location (label) end included

a    1
b    2
c    3
d    4
dtype: int64

In [25]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
display(
    s1.iat[1]
)  # index location (numbers) extract one cell value and it can be updated.

2

In [26]:
s1: pd.Series = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
display(s1.at["d"])

4