In [1]:
import pandas as pd
import numpy as np

# SERIES

In [2]:
pd.Series?

In [3]:
# Five draws from the standard normal distribution; no index is given,
# so pandas labels the values with the default integer index.
s = pd.Series(data=np.random.standard_normal(size=5))
s

0    0.321669
1    1.398236
2   -0.067967
3   -0.040422
4   -0.180593
dtype: float64

In [4]:
# A Series with explicit string labels and a name.
s = pd.Series(
    data=np.random.randn(5),
    index=list("abcde"),
    name="example",  # the name is used as the column name inside a DataFrame
)
s

a   -0.477214
b    0.358082
c    1.480471
d    1.725782
e   -0.865645
Name: example, dtype: float64

In [5]:
# A scalar value is repeated for every label in the index.
pd.Series(5, index=["a", "b", "c", "d", "e"])

a    5
b    5
c    5
d    5
e    5
dtype: int64

In [6]:
# First element by position. `.iloc` states explicitly that 0 is a position:
# plain `s[0]` on a string-labelled Series relies on a deprecated fallback
# and is treated as a label lookup by newer pandas.
s.iloc[0]

-0.47721422815242803

In [7]:
# First three elements, selected by position.
s.iloc[:3]

a   -0.477214
b    0.358082
c    1.480471
Name: example, dtype: float64

In [8]:
# Pick elements by position, in the requested order (fifth, then fourth).
# `.iloc` is required because the index holds string labels, so integer
# keys passed to plain `[]` are a deprecated positional fallback.
s.iloc[[4, 3]]

e   -0.865645
d    1.725782
Name: example, dtype: float64

In [9]:
s.values

array([-0.47721423,  0.35808156,  1.48047143,  1.72578205, -0.8656455 ])

In [10]:
s["e"] = 1488

In [11]:
s

a      -0.477214
b       0.358082
c       1.480471
d       1.725782
e    1488.000000
Name: example, dtype: float64

In [12]:
# A list of booleans (one flag per element) acts as a mask:
# only the elements whose flag is True are kept.
s[[True, False,True,False,True]] # boolean mask selection

a      -0.477214
c       1.480471
e    1488.000000
Name: example, dtype: float64

In [13]:
s > 0

a    False
b     True
c     True
d     True
e     True
Name: example, dtype: bool

In [14]:
s[s>0], s>0

(b       0.358082
 c       1.480471
 d       1.725782
 e    1488.000000
 Name: example, dtype: float64,
 a    False
 b     True
 c     True
 d     True
 e     True
 Name: example, dtype: bool)

In [15]:
s[s>0] *= -1

In [16]:
s

a      -0.477214
b      -0.358082
c      -1.480471
d      -1.725782
e   -1488.000000
Name: example, dtype: float64

In [17]:
s > 0

a    False
b    False
c    False
d    False
e    False
Name: example, dtype: bool

In [18]:
s + s

a      -0.954428
b      -0.716163
c      -2.960943
d      -3.451564
e   -2976.000000
Name: example, dtype: float64

In [19]:
np.exp(s)

a    0.620510
b    0.699016
c    0.227530
d    0.178034
e    0.000000
Name: example, dtype: float64

In [20]:
s.abs()

a       0.477214
b       0.358082
c       1.480471
d       1.725782
e    1488.000000
Name: example, dtype: float64

In [21]:
# Arithmetic aligns the operands on their index labels; a label missing on
# either side produces NaN. All values of s were made negative earlier, so
# s[s>0] is empty here and no label finds a partner.
s + s[s>0] # when indexes don't match

a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
Name: example, dtype: float64

# DataFrames

In [22]:
# Two Series of different lengths: "two" has an extra label "d".
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=list("abc")),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")),
}
d

{'one': a    1.0
 b    2.0
 c    3.0
 dtype: float64,
 'two': a    1.0
 b    2.0
 c    3.0
 d    4.0
 dtype: float64}

In [23]:
# Build a DataFrame from the dict of Series: each key becomes a column and
# the rows cover the union of the Series indexes. "one" has no entry for
# label "d", so that cell is left empty (NaN).
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [24]:
# NumPy arrays and scalars can also be passed as column values

In [25]:
# A scalar column value is repeated to match the length of the array column.
d = dict(one="Hellow", two=np.array([1, 2, 3, 4]))
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
0,Hellow,1
1,Hellow,2
2,Hellow,3
3,Hellow,4


In [26]:
df.columns = ["1", "2"]
df.index = ["a", "b", "c", "d"]
df

Unnamed: 0,1,2
a,Hellow,1
b,Hellow,2
c,Hellow,3
d,Hellow,4


In [27]:
# Rebuild the frame with float values, then label the rows a..d.
d = dict(one="Hellow", two=np.array([1.0, 2.0, 3.0, 4.0]))
df = pd.DataFrame(data=d)
df.index = list("abcd")

# Selecting a single column returns a Series named after that column.
df["one"]

a    Hellow
b    Hellow
c    Hellow
d    Hellow
Name: one, dtype: object

In [28]:
del df["one"]

In [29]:
df

Unnamed: 0,two
a,1.0
b,2.0
c,3.0
d,4.0


In [30]:
# New columns can be computed from existing ones or filled with a scalar.
df["three"] = df["two"] + df["two"]
df["four"] = "four"
# Only the first three rows of "four" are assigned; the remaining row "d"
# gets no value and is left empty (NaN).
df["five"] = df["four"][:3]
df

Unnamed: 0,two,three,four,five
a,1.0,2.0,four,four
b,2.0,4.0,four,four
c,3.0,6.0,four,four
d,4.0,8.0,four,


In [31]:
# First value of column "two", selected by position. The rows are labelled
# "a".."d", so an integer key in plain `[]` is a deprecated positional
# fallback; `.iloc` makes the positional intent explicit.
df["two"].iloc[0]

1.0

In [32]:
df[["two", "five"]]

Unnamed: 0,two,five
a,1.0,four
b,2.0,four
c,3.0,four
d,4.0,


In [33]:
# selection by indexes and column names
df.loc["a", "five"]

'four'

In [34]:
df.loc[["a", "d"], ["five", "two"]]

Unnamed: 0,five,two
a,four,1.0
d,,4.0


In [35]:
# Label slices with .loc include both endpoints ("d" and "a", "two" and "three").
df.loc["d":"a":-1, "two":"three"] #-1 reverses rows

Unnamed: 0,two,three
d,4.0,8.0
c,3.0,6.0
b,2.0,4.0
a,1.0,2.0


In [36]:
# select rows and columns by their ordering
df.iloc[1:3, 0]

b    2.0
c    3.0
Name: two, dtype: float64

In [37]:
df.iloc[1:3]

Unnamed: 0,two,three,four,five
b,2.0,4.0,four,four
c,3.0,6.0,four,four


# DataFrame Functions

In [38]:
df.copy()

Unnamed: 0,two,three,four,five
a,1.0,2.0,four,four
b,2.0,4.0,four,four
c,3.0,6.0,four,four
d,4.0,8.0,four,


In [39]:
x = df.copy()

In [40]:
x.loc["a", "two"] = -2

In [41]:
x

Unnamed: 0,two,three,four,five
a,-2.0,2.0,four,four
b,2.0,4.0,four,four
c,3.0,6.0,four,four
d,4.0,8.0,four,


In [42]:
df

Unnamed: 0,two,three,four,five
a,1.0,2.0,four,four
b,2.0,4.0,four,four
c,3.0,6.0,four,four
d,4.0,8.0,four,


In [43]:
df.two

a    1.0
b    2.0
c    3.0
d    4.0
Name: two, dtype: float64

In [44]:
# astype: convert the float column to integers.
# The builtin `int` is used because the `np.int` alias was deprecated and
# later removed from NumPy, where it now raises AttributeError.
df.two.astype(int)

a    1
b    2
c    3
d    4
Name: two, dtype: int32

In [45]:
df.T

Unnamed: 0,a,b,c,d
two,1,2,3,4
three,2,4,6,8
four,four,four,four,four
five,four,four,four,


In [46]:
df.head(2)

Unnamed: 0,two,three,four,five
a,1.0,2.0,four,four
b,2.0,4.0,four,four


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   two     4 non-null      float64
 1   three   4 non-null      float64
 2   four    4 non-null      object 
 3   five    3 non-null      object 
dtypes: float64(2), object(2)
memory usage: 320.0+ bytes


In [48]:
df.describe(include="all")

Unnamed: 0,two,three,four,five
count,4.0,4.0,4,3
unique,,,1,1
top,,,four,four
freq,,,4,3
mean,2.5,5.0,,
std,1.290994,2.581989,,
min,1.0,2.0,,
25%,1.75,3.5,,
50%,2.5,5.0,,
75%,3.25,6.5,,


In [49]:
# Sometimes all columns don't fit the screen width
# Add 20 constant columns labelled 0..19 so the frame becomes too wide
# to be displayed in full.
for i in range(20):
    df[i] = i
    
df.head()

Unnamed: 0,two,three,four,five,0,1,2,3,4,5,...,10,11,12,13,14,15,16,17,18,19
a,1.0,2.0,four,four,0,1,2,3,4,5,...,10,11,12,13,14,15,16,17,18,19
b,2.0,4.0,four,four,0,1,2,3,4,5,...,10,11,12,13,14,15,16,17,18,19
c,3.0,6.0,four,four,0,1,2,3,4,5,...,10,11,12,13,14,15,16,17,18,19
d,4.0,8.0,four,,0,1,2,3,4,5,...,10,11,12,13,14,15,16,17,18,19


In [50]:
#Transpose helps
df.head().T

Unnamed: 0,a,b,c,d
two,1,2,3,4
three,2,4,6,8
four,four,four,four,four
five,four,four,four,
0,0,0,0,0
1,1,1,1,1
2,2,2,2,2
3,3,3,3,3
4,4,4,4,4
5,5,5,5,5


In [51]:
# Show up to 100 rows before pandas truncates the display.
pd.set_option("display.max_rows", 100)
# Use the fully qualified key: the bare pattern "precision" also matches
# "styler.format.precision" in current pandas and raises OptionError.
pd.set_option("display.precision", 7)

In [52]:
s

a      -0.4772142
b      -0.3580816
c      -1.4804714
d      -1.7257820
e   -1488.0000000
Name: example, dtype: float64