#  Data Science Learning Journey  
*Curiosity to Capability — One Notebook at a Time*

---
Compiled and authored by **Partho Sarothi Das**   
	Dhaka, Bangladesh  
	Bachelor's & Master's in Statistics  
	Investment Banking Professional → Aspiring Data Scientist 
    
---

# Pandas Introduction

- Pandas is a Python library used for working with data sets.
- It has functions for analyzing, cleaning, exploring, and manipulating data.
- The name "Pandas" refers to both "Panel Data" and "Python Data Analysis"; the library was created by Wes McKinney in 2008.

# Why Use Pandas?

- Pandas allows us to analyze big data and draw conclusions based on statistical theories.
- Pandas can clean messy data sets, and make them readable and relevant.
- Relevant data is very important in data science.

In [19]:
# Importing Pandas

import pandas as pd

In [21]:
# Print the installed pandas version so readers can match the environment
# that produced the outputs saved in this notebook.

print(pd.__version__)

2.2.3


# Pandas Series

- A Pandas Series is like a column in a table.
- It is a one-dimensional array holding data of any type.

### Series from List

In [25]:
# Series from a list of integers: pandas attaches a default integer index
# (0, 1, 2, ...) to the values.
# NOTE(review): pandas is already imported in the first code cell, so the
# import below is a redundant repeat.

import pandas as pd

a = [1,2,4,2,7]
s = pd.Series(a)
s

0    1
1    2
2    4
3    2
4    7
dtype: int64

In [27]:
# Series from a list of strings: the values are stored with dtype `object`.

L = ['Bangladesh','India', 'Nepal']
pd.Series(L)

0    Bangladesh
1         India
2         Nepal
dtype: object

In [29]:
# Custom index: pass `index=` to label each value explicitly (here each mark
# is labelled with its subject) instead of using the default 0..n-1 integers.

marks = [90, 80, 70]
subjects = ['Statistics','ML', 'Math']
pd.Series(marks, index = subjects)

Statistics    90
ML            80
Math          70
dtype: int64

In [31]:
# Setting a name: `name=` attaches a label to the Series itself.
# Note that `marks` is rebound here from the plain list defined in the
# previous cell to the resulting Series.

marks = pd.Series(marks, index=subjects, name='student1')
marks

Statistics    90
ML            80
Math          70
Name: student1, dtype: int64

### Series from dict

In [21]:
# Series from a dict: the keys become the index labels and the values become
# the data, in the order the dict was written.
age = dict(Partho=43, Sheren=43, Abhik=5)

ages = pd.Series(data=age)
ages

Partho    43
Sheren    43
Abhik      5
dtype: int64

### Series Attributes

In [24]:
# size: the number of elements in the Series.

ages.size

3

In [28]:
# dtype: the data type of the stored values.

ages.dtype

dtype('int64')

In [32]:
# is_unique: True only when no value is repeated. The value 43 occurs twice
# in `ages`, so this evaluates to False.

ages.is_unique

False

In [36]:
# index: the labels of the Series (here, the keys of the dict it was built from).

ages.index

Index(['Partho', 'Sheren', 'Abhik'], dtype='object')

In [40]:
# values: the underlying data as a NumPy array, without the index labels.

ages.values

array([43, 43,  5], dtype=int64)

### Series using read_csv

In [52]:
# CSV with one column: read_csv always returns a DataFrame, so squeeze the
# single column down to a Series. Passing "columns" restricts the squeeze to
# the column axis, so a file holding just one data row still gives a Series
# rather than a bare scalar.
# NOTE(review): the saved output of this cell shows `Name: 1` and 15 rows,
# while a later load of the same file shows `Name: Number` and 16 rows. The
# file presumably lacked a header row when this cell was last run; re-run to
# refresh the output and confirm.

s = pd.read_csv('./data/series1.csv').squeeze("columns")
s

0     2
1     6
2     4
3     8
4     6
5     9
6     5
7     3
8     1
9     6
10    8
11    9
12    8
13    7
14    3
Name: 1, dtype: int64

In [59]:
# CSV with two columns: use `subject` as the index so only one data column
# remains, then squeeze that column into a Series. Passing "columns" restricts
# the squeeze to the column axis, so a file holding just one data row still
# gives a Series rather than a bare scalar.

s = pd.read_csv('./data/series2.csv', index_col='subject').squeeze("columns")
s

subject
ML               90
stat             95
math             80
Deep Learning    95
Name: marks, dtype: int64

### Series methods

In [78]:
# head: reload the one-column file as a Series and show its first five
# elements. "columns" restricts the squeeze to the column axis, so the result
# is a Series even if the file holds only one data row.
s = pd.read_csv('./data/series1.csv').squeeze("columns")
s.head()

0    1
1    2
2    6
3    4
4    8
Name: Number, dtype: int64

In [80]:
# tail: show the last five elements of the Series.
s.tail()

11    8
12    9
13    8
14    7
15    3
Name: Number, dtype: int64

In [82]:
# sample: draw 3 elements at random. No seed is passed, so the selection
# (and therefore the saved output) changes from run to run.

s.sample(3)

7     5
3     4
15    3
Name: Number, dtype: int64

In [84]:
# value_counts: how many times each distinct value occurs, most frequent first.

s.value_counts()

Number
6    3
8    3
1    2
9    2
3    2
2    1
4    1
5    1
7    1
Name: count, dtype: int64

In [104]:
# sort_values: load the marks as a Series indexed by subject, then show them
# ordered from highest to lowest. The sort returns a new Series; `marks`
# itself keeps the file order. "columns" restricts the squeeze to the column
# axis, so the result is a Series even if the file holds only one data row.

marks = pd.read_csv('./data/series2.csv', index_col='subject').squeeze("columns")
marks.sort_values(ascending=False)

subject
stat             95
Deep Learning    95
ML               90
calculas         86
math             80
Name: marks, dtype: int64

In [108]:
# sort_index: order the Series by its index labels instead of its values.
# The result is only displayed here; `marks` is not reassigned.

marks.sort_index()

subject
Deep Learning    95
ML               90
calculas         86
math             80
stat             95
Name: marks, dtype: int64

In [112]:
# sort_index, keeping the result: rebind `marks` to the label-sorted Series so
# later cells see the sorted order. Reassignment is used instead of
# `inplace=True`; it leaves `marks` with the same final value, but does not
# mutate an object that earlier cells have already displayed.

marks = marks.sort_index(ascending=True)
marks

subject
Deep Learning    95
ML               90
calculas         86
math             80
stat             95
Name: marks, dtype: int64

### Series Maths Methods

In [115]:
# count: the number of non-missing values in the Series.

marks.count()

5

In [117]:
# sum: the total of all values.

marks.sum()

446

In [119]:
# Central tendency and spread, printed one result per line in this order:
# mean, median, mode, standard deviation, variance.
# mode() returns a Series (there can be ties), so it prints as a small table.

print(
    marks.mean(),
    marks.median(),
    marks.mode(),
    marks.std(),
    marks.var(),
    sep='\n',
)

89.2
90.0
0    95
Name: marks, dtype: int64
6.379655163094632
40.699999999999996


In [121]:
# min: the smallest value in the Series.

marks.min()

80

In [123]:
# max: the largest value in the Series.

marks.max()

95

In [131]:
# describe: summary statistics in one call (count, mean, std, min,
# quartiles and max).

marks.describe()

count     5.000000
mean     89.200000
std       6.379655
min      80.000000
25%      86.000000
50%      90.000000
75%      95.000000
max      95.000000
Name: marks, dtype: float64

### Series Indexing

In [15]:
# Integer indexing: build a Series with the default 0..n-1 index to use in
# the indexing examples that follow.
x = pd.Series([12,13,14,35,46,57,58,79,9])
x

0    12
1    13
2    14
3    35
4    46
5    57
6    58
7    79
8     9
dtype: int64

In [35]:
# Select a single element by its index label. With the default integer index,
# label 1 is also the second position.
x[1]

13

In [37]:
# Slicing: like a Python list, the stop position is excluded, so this
# returns the elements at positions 3, 4 and 5.

x[3:6]

3    35
4    46
5    57
dtype: int64

In [41]:
# Negative slicing: take the last five elements.

x[-5:]

4    46
5    57
6    58
7    79
8     9
dtype: int64

In [49]:
# Step slicing: take every second element, starting from the first.
x[::2]

0    12
2    14
4    46
6    58
8     9
dtype: int64

In [51]:
# Fancy indexing: pass a list of index labels to pick several elements at once.

x[[1,3,7]]

1    13
3    35
7    79
dtype: int64

### Editing Series

In [58]:
# Show the Series as it is before any edits are made.
x

0    12
1    13
2    14
3    35
4    46
5    57
6    58
7    79
8     9
dtype: int64

In [60]:
# Overwrite the element at index label 1 (this modifies `x` in place).
x[1] = 10
x

0    12
1    10
2    14
3    35
4    46
5    57
6    58
7    79
8     9
dtype: int64

In [66]:
# What if the index label does not exist? Assignment does not raise; it
# appends a new element under that label. The index therefore jumps from 8
# to 10, since no label 9 was ever created.

x[10] = 100
x

0      12
1      10
2      14
3      35
4      46
5      57
6      58
7      79
8       9
10    100
dtype: int64

In [69]:
# Edit by fancy indexing: assign to several index labels in one statement.

x[[0,1,2]] = [0,0,0]
x

0       0
1       0
2       0
3      35
4      46
5      57
6      58
7      79
8       9
10    100
dtype: int64

In [71]:
# Edit by slicing: replace the first three elements with the listed values.
x[0:3] =  [100,200,300]
x

0     100
1     200
2     300
3      35
4      46
5      57
6      58
7      79
8       9
10    100
dtype: int64

### Series with Python Functionalities

In [78]:
# len: the number of elements in the Series.
len(x)

10

In [80]:
# type: confirms that `x` is a pandas Series object.
type(x)

pandas.core.series.Series

In [82]:
# sorted: returns the values as a plain Python list in ascending order
# (the index labels are dropped and `x` itself is unchanged).
sorted(x)

[9, 35, 46, 57, 58, 79, 100, 100, 200, 300]

In [89]:
# Type conversion: list() keeps only the values, in their current order.

list(x)

[100, 200, 300, 35, 46, 57, 58, 79, 9, 100]

In [91]:
# Type conversion: dict() maps each index label to its value.
dict(x)

{0: 100, 1: 200, 2: 300, 3: 35, 4: 46, 5: 57, 6: 58, 7: 79, 8: 9, 10: 100}

In [95]:
# A Series labelled by city name: the names become the index and the paired
# numbers become the data.
city = [
    'Dhaka',
    'Dinajpur',
    'Sylhet',
    'Khulna',
]
value = [
    40,
    30,
    35,
    28,
]
cities = pd.Series(data=value, index=city)
cities

Dhaka       40
Dinajpur    30
Sylhet      35
Khulna      28
dtype: int64

In [101]:
# Series to dictionary: each index label becomes a key mapped to its value.

dict(cities)

{'Dhaka': 40, 'Dinajpur': 30, 'Sylhet': 35, 'Khulna': 28}

In [113]:
# Membership operator: `in` applied directly to a Series checks the index
# labels, so test against `.values` to look for a data value.

28 in cities.values

True

In [111]:
# Membership on the Series itself: checks whether 'Dhaka' is an index label.
'Dhaka' in cities

True

In [115]:
# Looping over the index: prints each label (city name) on its own line.
for i in cities.index:
    print(i)

Dhaka
Dinajpur
Sylhet
Khulna


In [117]:
# Looping over the values: prints each data value on its own line.
for i in cities.values:
    print(i)

40
30
35
28


In [123]:
# Arithmetic operators (broadcasting): the scalar 100 is added to every
# element. The result is a new Series with the same index; `cities` is unchanged.

100 + cities

Dhaka       140
Dinajpur    130
Sylhet      135
Khulna      128
dtype: int64

In [125]:
# Relational operators broadcast too: the result is a boolean Series with
# True wherever the value is at least 35.
cities>= 35

Dhaka        True
Dinajpur    False
Sylhet       True
Khulna      False
dtype: bool

### Boolean Indexing on Series

In [144]:
# Load the batting data used for the boolean-indexing examples.
# NOTE(review): the displayed result has three columns (`Unnamed: 0`,
# `match_id`, `batsman_runs`), so `.squeeze()` has nothing to collapse and
# `vk` stays a DataFrame, not a Series. If a Series of runs indexed by match
# is intended, load with `usecols=['match_id', 'batsman_runs']` and
# `index_col='match_id'`. Confirm against the cells that use `vk`.
vk = pd.read_csv('data/vk.csv').squeeze()

In [146]:
# Display the loaded data (pandas truncates the middle rows of long output).
vk

Unnamed: 0,match_id,batsman_runs
0,12,62
1,17,28
2,20,64
3,27,0
4,30,10
...,...,...
136,624,75
137,626,113
138,632,54
139,633,0
