## pandas

In [1]:
import pandas as pd

In [3]:
# print version
print(pd.__version__)

1.5.3


In [4]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [6]:
# series from a list
arr = [1,2,3,4,5,6]
s1 = pd.Series(arr)
s1

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [7]:
order = [1,2,3,4,5,6]
s2 = pd.Series(arr, index=order)
s2

1    1
2    2
3    3
4    4
5    5
6    6
dtype: int64

In [8]:
import numpy as np

In [10]:
n = np.random.randn(5)
index = ['a','b','c','d','e']
s = pd.Series(n, index=index)

In [11]:
s

a   -0.279354
b   -0.179537
c    0.064936
d   -0.167244
e    0.828624
dtype: float64

In [14]:
d = {'a':1, 'b':2, 'c':'samuel'}
p = pd.Series(d)
p

a         1
b         2
c    samuel
dtype: object

In [15]:
# changes indices
p.index = [1,2,3]
p

1         1
2         2
3    samuel
dtype: object

In [16]:
p[0:1]

1    1
dtype: object

In [17]:
p[1:]

2         2
3    samuel
dtype: object

In [24]:
s3 = np.concatenate((s1,s2))
s3

array([1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6])

In [25]:
s4 = pd.Series(s3)
s4

0     1
1     2
2     3
3     4
4     5
5     6
6     1
7     2
8     3
9     4
10    5
11    6
dtype: int64

In [26]:
s4.drop(6)
s4

0     1
1     2
2     3
3     4
4     5
5     6
6     1
7     2
8     3
9     4
10    5
11    6
dtype: int64

# series operations

In [27]:
arr1 = [0,1,2,3,4,5,7]
arr2 = [6,7,8,9,5]


In [28]:
s5 = pd.Series(arr2)
s5

0    6
1    7
2    8
3    9
4    5
dtype: int64

In [29]:
s6 = pd.Series(arr1)
s6

0    0
1    1
2    2
3    3
4    4
5    5
6    7
dtype: int64

In [30]:
s5.add(s6)

0     6.0
1     8.0
2    10.0
3    12.0
4     9.0
5     NaN
6     NaN
dtype: float64

In [31]:
s5.sub(s6)

0    6.0
1    6.0
2    6.0
3    6.0
4    1.0
5    NaN
6    NaN
dtype: float64

In [32]:
s5.mul(s6)

0     0.0
1     7.0
2    16.0
3    27.0
4    20.0
5     NaN
6     NaN
dtype: float64

In [33]:
s5.div(s6)

0     inf
1    7.00
2    4.00
3    3.00
4    1.25
5     NaN
6     NaN
dtype: float64

In [35]:
s7 = s6
print('median', s7.median())

median 3.0


In [36]:
print('mean', s7.mean())

mean 3.142857142857143


In [37]:
print('max', s7.max())

max 7


In [38]:
print('min', s7.min())

min 0


# dataframe

In [40]:
dates = pd.date_range('today', periods=6)

In [41]:
dates

DatetimeIndex(['2023-07-12 07:20:15.034249', '2023-07-13 07:20:15.034249',
               '2023-07-14 07:20:15.034249', '2023-07-15 07:20:15.034249',
               '2023-07-16 07:20:15.034249', '2023-07-17 07:20:15.034249'],
              dtype='datetime64[ns]', freq='D')

In [42]:
num_arr = np.random.randn(6,4)
num_arr

array([[-0.21116583,  0.44235567,  0.3987694 ,  0.92617702],
       [ 0.07282357,  0.51302582, -0.20717685, -0.23358987],
       [ 0.65949212,  0.26418146,  1.3919404 , -0.46971849],
       [-0.78586813, -0.53541046,  1.99928238, -0.60266577],
       [-1.14563128, -1.01144518, -0.62884767, -1.41417275],
       [-0.35669414,  0.88685266, -0.87436438,  0.38079776]])

In [43]:
columns = ['A','B','C','D']
columns

['A', 'B', 'C', 'D']

In [44]:
df1 = pd.DataFrame(num_arr, index=dates, columns=columns)
df1

Unnamed: 0,A,B,C,D
2023-07-12 07:20:15.034249,-0.211166,0.442356,0.398769,0.926177
2023-07-13 07:20:15.034249,0.072824,0.513026,-0.207177,-0.23359
2023-07-14 07:20:15.034249,0.659492,0.264181,1.39194,-0.469718
2023-07-15 07:20:15.034249,-0.785868,-0.53541,1.999282,-0.602666
2023-07-16 07:20:15.034249,-1.145631,-1.011445,-0.628848,-1.414173
2023-07-17 07:20:15.034249,-0.356694,0.886853,-0.874364,0.380798


In [45]:
data = { 'names': ['samuel', 'vinald', 'okiror'],
       'age': [1,2,3],
       'gender': ['male', 'female','male']}

labels = ['a', 'b', 'c']

df2 = pd.DataFrame(data, index=labels)
df2

Unnamed: 0,names,age,gender
a,samuel,1,male
b,vinald,2,female
c,okiror,3,male


In [46]:
df2.dtypes

names     object
age        int64
gender    object
dtype: object

In [47]:
df2.head()

Unnamed: 0,names,age,gender
a,samuel,1,male
b,vinald,2,female
c,okiror,3,male


In [48]:
df2.head(2)

Unnamed: 0,names,age,gender
a,samuel,1,male
b,vinald,2,female


In [49]:
df2.tail(2)

Unnamed: 0,names,age,gender
b,vinald,2,female
c,okiror,3,male


In [51]:
df2.index

Index(['a', 'b', 'c'], dtype='object')

In [52]:
df2.columns

Index(['names', 'age', 'gender'], dtype='object')

In [53]:
df2.values

array([['samuel', 1, 'male'],
       ['vinald', 2, 'female'],
       ['okiror', 3, 'male']], dtype=object)

In [54]:
df2.describe()

Unnamed: 0,age
count,3.0
mean,2.0
std,1.0
min,1.0
25%,1.5
50%,2.0
75%,2.5
max,3.0


In [55]:
df2.T

Unnamed: 0,a,b,c
names,samuel,vinald,okiror
age,1,2,3
gender,male,female,male


In [57]:
df2.sort_values(by='names')

Unnamed: 0,names,age,gender
c,okiror,3,male
a,samuel,1,male
b,vinald,2,female


In [58]:
# slicing
df2[1:]

Unnamed: 0,names,age,gender
b,vinald,2,female
c,okiror,3,male


In [59]:
# select a column
df2[['names']]

Unnamed: 0,names
a,samuel
b,vinald
c,okiror


In [60]:
df3 = df2.copy()

In [61]:
df3.isnull()

Unnamed: 0,names,age,gender
a,False,False,False
b,False,False,False
c,False,False,False


In [63]:
df3.mean()

  df3.mean()


age    2.0
dtype: float64

In [64]:
df3['age'].sum()

6

# dataframe file operations

In [65]:
# save local file
df3.to_csv('name.csv')

In [67]:
# reads csv from the local computer
df4 = pd.read_csv('name.csv')

In [68]:
df4.head()

Unnamed: 0.1,Unnamed: 0,names,age,gender
0,a,samuel,1,male
1,b,vinald,2,female
2,c,okiror,3,male


In [69]:
df4.to_excel('name.xlsx', sheet_name='names')

In [70]:
df5 = pd.read_excel('name.xlsx', 'names', index_col=None, na_values=['NA'])

In [71]:
df5

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,names,age,gender
0,0,a,samuel,1,male
1,1,b,vinald,2,female
2,2,c,okiror,3,male


# visualization