In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
## data can be many different things:

# - a Python dict
# - an ndarray
# - a scalar value

## The passed index is a list of axis labels.



In [4]:
s = pd.Series(np.random.randn(5), index = ['a', 'b', 'c', 'd', 'e'])

In [5]:
# From ndarray
## If data is an ndarray, an index must be the same length as the data. If no index is passed, one will be created having values [0, ..., len(data) - 1].

In [6]:
s

a   -0.348409
b    0.559159
c   -1.050144
d    0.225322
e    0.052592
dtype: float64

In [7]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [8]:
pd.Series(np.random.randn(5))

0   -0.526563
1   -0.141065
2    0.762211
3    1.109981
4    1.164537
dtype: float64

In [9]:
# From dict
## Series can be created from dicts:

In [10]:
d = {'b' : 1, 'a' : 0, 'c' : 2}

In [11]:
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [12]:
## When data is a dict, and an index is not passed, the Series index will be ordered by the dict’s insertion order. 
## There is no sorting if you have Python version >= 3.6 and Pandas version >= 0.23.

In [13]:
# From scalar value
## If data is a scalar value, an index must be provided. The value will be repeated to match the length of the index.



In [14]:
pd.Series(5., index = ['a', 'b', 'c', 'd', 'e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [15]:
# Series is ndarray-like
## Series acts very similarly to a ndarray from NumPy and is a valid argument to most NumPy functions. Operations such as slicing will also slice the index.

In [16]:
s[0]

-0.3484090897364835

In [17]:
s[:3]

a   -0.348409
b    0.559159
c   -1.050144
dtype: float64

In [18]:
s[s > s.median()]

b    0.559159
d    0.225322
dtype: float64

In [19]:
s[[4, 3, 1]]

e    0.052592
d    0.225322
b    0.559159
dtype: float64

In [20]:
np.exp(s)

a    0.705810
b    1.749200
c    0.349887
d    1.252725
e    1.053999
dtype: float64

In [21]:
## Each series has a dtype 
s.dtype

dtype('float64')

In [22]:
## While Series in ndarray-like, if you need an actual ndarray, then use Series.to_numpy()

s.to_numpy()

array([-0.34840909,  0.55915871, -1.05014398,  0.22532155,  0.05259176])

In [23]:
# Series is dict-like
## A Series is like a fixed-size dict in which you can get and set values by an index label.

In [24]:
s['a']

-0.3484090897364835

In [25]:
s['e'] = 12

In [26]:
s

a    -0.348409
b     0.559159
c    -1.050144
d     0.225322
e    12.000000
dtype: float64

In [27]:
'e' in s

True

In [28]:
'f' in s

False

In [29]:
## If a label is not contained, an exception is raised:
s['f']

KeyError: 'f'

In [30]:
# Vectorized operations
## When working with raw NumPy arrays, looping through value-by-value is usually not necessary. The same is true when working with Series in Pandas. 
## Series can also be passed into most NumPy methods expecting an ndarray.

In [31]:
s + s

a    -0.696818
b     1.118317
c    -2.100288
d     0.450643
e    24.000000
dtype: float64

In [32]:
s * 2

a    -0.696818
b     1.118317
c    -2.100288
d     0.450643
e    24.000000
dtype: float64

In [33]:
np.exp(s)

a         0.705810
b         1.749200
c         0.349887
d         1.252725
e    162754.791419
dtype: float64

In [34]:
## A key difference between Series and ndarray is that operations between Series automatically align data based on the label. 
## Thus, you can write computations without considering whether the Series involved have the same labels.

In [35]:
s1 = s[1:]

In [36]:
s2 = s[:-1]

In [37]:
s1 + s2

a         NaN
b    1.118317
c   -2.100288
d    0.450643
e         NaN
dtype: float64

In [38]:
# Name attribute
## Series can also have a name attribute.

In [39]:
s = pd.Series(np.random.randn(5), name = 'something') 

In [40]:
s

0    0.414818
1   -0.612958
2    0.439991
3    0.216739
4   -1.312098
Name: something, dtype: float64

In [41]:
s.name

'something'

In [42]:
## DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. 
## You can think of it like a spreadsheet or SQL table, or a dictionary of Series objects. It is generally the most commonly used Pandas object. 
## Like Series, DataFrame accepts many different kinds of input:

# - dictionary of 1D ndarrays, lists, dictionaries, or Series
# - 2D NumPy nd.array
# - Series
# - DataFrame

In [43]:
# From Dictionary of Series or Dictionaries
## The resulting index will be the union of the indexes of the various Series.

In [44]:
d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
         'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} 

In [45]:
df = pd.DataFrame(d)

In [46]:
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [47]:
pd.DataFrame(d, index = ['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [48]:
pd.DataFrame(d, index = ['d', 'b', 'a'], columns = ['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [49]:
## When data is a dictionary, and a columns is not passed, the DataFrame columns will be ordered by the dictionary’s insertion order. 
## There is no sorting if you have Python version >= 3.6 and Pandas version >= 0.23.

# Warning
## If you pass an index and/or columns, you are guaranteeing the index and/or columns of the resulting DataFrame. 
##Thus, a dictionary of Series plus a specific index and/or columns will discard all data not matching to the passed index and/or columns. 
##See the last example above with an empty column labeled three.

In [50]:
# From Dictionary of ndarrays or lists

## The ndarrays must all be of the same length. If an index is passed, it must also be of the same length as the arrays. 
## If no index is passed, the result will be range(n), where n is the array length.

In [51]:
d = {'one': [1., 2., 3., 4.],
         'two': [4., 3., 2., 1.]}

In [52]:
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [53]:
pd.DataFrame(d, index = ['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [54]:
# From a Series


## The result will be a DataFrame with the same index as the input Series, and with one column whose name is the original name of the Series (only if no other column name is provided).

In [55]:
 pd.DataFrame(pd.Series(np.random.randn(5), name='something'))

Unnamed: 0,something
0,0.190262
1,0.860553
2,-0.157193
3,-0.315345
4,0.353224


In [56]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [57]:
df['three'] = df['one'] * df['two']

In [58]:
df['flag'] = df['one'] > 2

In [59]:
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [60]:
## Columns can be deleted like with a dictionary:

In [61]:
del df['two']

In [62]:
## When inserting a scalar value, it will naturally be propagated to fill the column:

In [63]:
df['foo'] = 'bar'


In [64]:
df

Unnamed: 0,one,three,flag,foo
a,1.0,1.0,False,bar
b,2.0,4.0,False,bar
c,3.0,9.0,True,bar
d,,,False,bar


In [65]:
## When inserting a Series that does not have the same index as the DataFrame, it will be conformed to the DataFrame’s index.

In [66]:
df['one_trunc'] = df['one'][:2]

In [67]:
df

Unnamed: 0,one,three,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,4.0,False,bar,2.0
c,3.0,9.0,True,bar,
d,,,False,bar,


In [68]:
## Operations with scalars are just as you would expect:



In [69]:
df = pd.DataFrame(np.random.randn(8, 3), index = index, columns=list('ABC'))

NameError: name 'index' is not defined

In [70]:
df * 5 + 2

TypeError: can only concatenate str (not "int") to str

In [71]:
df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool)

In [72]:
df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool)

In [73]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [74]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [75]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [76]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [78]:
# Pandas dtypes

## In this activity, we will learn about the data types that Pandas is using in Series and DataFrames. 
## This is very important in order to further understand what we can do with our data in Pandas.



## For the most part, Pandas uses NumPy arrays and dtypes for Series or individual columns of a DataFrame. 
## NumPy provides support for float, int, bool, timedelta64[ns] and datetime64[ns].

## However, NumPy doesn't allow non-numeric data types, therefore, Pandas has to extend NumPy's type system in a few places. 
## The following table lists most of Pandas extension types (the most common ones):

In [79]:

## Kind of Data	               Data Type	                          String Aliases
## Categorical	             CategoricalDtype	                       'category'
## nullable integer	            Int64Dtype	             'Int8', 'UInt8', 'Int16', 'UInt16'...
## Strings	                    StringDtype	                          'string'
## Boolean (with NA)	        BooleanDtype	                 'boolean','bool'
## any	                        object dtype	                       'object'


In [80]:
## A convenient dtypes attribute for DataFrame returns a Series with the data type of each column.

In [81]:
dft = pd.DataFrame({'A': np.random.rand(3),
                        'B': 1,
                        'C': 'foo',
                        'D': pd.Timestamp('20010102'),
                        'E': pd.Series([1.0] * 3).astype('float32'),
                        'F': False,
                        'G': pd.Series([1] * 3, dtype='int8')})


In [82]:
dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.75026,1,foo,2001-01-02,1.0,False,1
1,0.994165,1,foo,2001-01-02,1.0,False,1
2,0.677307,1,foo,2001-01-02,1.0,False,1


In [84]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [85]:
## Series has the same attribute as well:

In [86]:
dft['A'].dtype

dtype('float64')

In [87]:
## Pandas has two ways of storing strings.

## 1)object dtype, which can hold any Python object, including strings.
## 2)StringDtype, which is dedicated to strings (introduced in 2020, only in the Pandas 1.0.0 version)

## It is recommended to use StringDtype for strings because an object can hide any data type inside.

In [88]:
pd.Series([1, 2, 3, 6., 'foo'])

0      1
1      2
2      3
3    6.0
4    foo
dtype: object

In [89]:
# Converting

## You can use the astype() method to explicitly convert dtypes from one to another. 
## These will by default return a copy, even if the dtype was unchanged (pass copy=False to change this behavior).
## In addition, they will raise an exception if the astype() operation is invalid.



In [90]:
df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')

In [91]:
df1.dtypes

A    float32
dtype: object

In [92]:
 df1 = df1.astype('float64')

In [93]:
df1.dtypes

A    float64
dtype: object

In [94]:
## Convert certain columns to a specific dtype by passing a dict to astype().

In [95]:
 dft1 = pd.DataFrame({'a': [1, 0, 1], 'b': [4, 5, 6], 'c': [7, 8, 9]})

In [96]:
 dft1 = dft1.astype({'a': np.bool, 'c': np.float64})

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dft1 = dft1.astype({'a': np.bool, 'c': np.float64})


In [97]:
dft1

Unnamed: 0,a,b,c
0,True,4,7.0
1,False,5,8.0
2,True,6,9.0


In [98]:
dft1.dtypes

a       bool
b      int64
c    float64
dtype: object

In [99]:
# Pandas Basics I


## We will start with the practice of basic operations in Pandas. 
## It is very important to get familiar with this stuff because we will be using it again and again throughout this course.

In [100]:
# We will cover an introduction to Pandas, specifically:

## - Attributes of Pandas objects
## - Counting values in Series
## - Altering labels
## - .dt and .str accessors
## - Sorting

In [101]:
## One of the great things about the frequently used Python packages is that their documentation is really good. 
## We can usually easily google anything we want to do in Pandas. 
## We will also be working intensively with the official documentation throughout this course.

In [102]:
# Attributes of Pandas objects

## Pandas objects have a number of attributes enabling us to access metadata:

## - Shape: gives the axis dimensions of the object, consistent with ndarray

## - Axis labels:

## a) Series: index (only axis)
## b)DataFrame: index (rows) and columns



In [103]:
df = pd.DataFrame(np.random.randn(8, 3), index=index,
                      columns=['A', 'B', 'C'])

NameError: name 'index' is not defined

In [104]:
df.columns = [x.lower() for x in df.columns]

In [105]:
df

Unnamed: 0,one,three,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,4.0,False,bar,2.0
c,3.0,9.0,True,bar,
d,,,False,bar,


In [106]:
## We can think of the Pandas objects (Index, Series, DataFrame) as containers for arrays, which hold the actual data and do the actual computation. 
## To get the actual data inside an Index or Series, use the attribute .array.

In [107]:
df['a'].array

KeyError: 'a'

In [108]:
# Counting values in Series

## The value_counts() Series method and top-level function computes a histogram of a 1D array of values.

In [109]:
data = np.random.randint(0, 7, size=50)

In [110]:
data

array([0, 6, 1, 0, 2, 4, 4, 2, 2, 2, 2, 5, 2, 2, 2, 3, 6, 5, 2, 0, 2, 5,
       4, 2, 2, 2, 0, 0, 3, 1, 4, 6, 3, 2, 1, 3, 6, 1, 4, 5, 0, 1, 4, 5,
       1, 5, 3, 6, 2, 0])

In [111]:
s = pd.Series(data)

In [112]:
s.value_counts()

2    15
0     7
1     6
4     6
5     6
6     5
3     5
dtype: int64

In [113]:
## Similarly, we can get the most frequently occurring value(s) (mode()) of the values in a Series or DataFrame.

In [114]:
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])


In [115]:
s5.mode()

0    3
1    7
dtype: int64

In [116]:
df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50),
                        "B": np.random.randint(-10, 15, size=50)})

In [117]:
df5.mode()

Unnamed: 0,A,B
0,3,14.0
1,4,


In [118]:
# .dt and .str accessors

# .dt accessor

###Series has an accessor to succinctly return datetime-like properties for the values of the Series, if it is a datetime/period-like Series. This will return a Series, indexed like an existing Series.



In [119]:
s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))

In [120]:
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [121]:
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [122]:
s.dt.second

0    12
1    12
2    12
3    12
dtype: int64

In [123]:
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [124]:
s.dt.dayofweek

0    1
1    2
2    3
3    4
dtype: int64

In [125]:
## We can easily produce timezone-aware transformations:

In [126]:
stz = s.dt.tz_localize('US/Eastern')

In [127]:
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [128]:
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [129]:
## We can also chain these types of operations:

In [130]:
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [131]:
# .str accessor


## Series is equipped with a set of string processing methods that make it easy to operate on each element of the array. 
## These are accessed via the Series’s str attribute and generally have names matching the equivalent (scalar) built-in string methods.

In [132]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
                  dtype="string")

In [133]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

In [134]:
# Sorting
## There are three types of sorting in Pandas: 1. Sorting by index labels 2. Sorting by column values 3. Sorting by a combination of both

In [135]:
# By index
## The Series.sort_index() and DataFrame.sort_index() methods are used to sort a Pandas object by its index levels.

In [136]:
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})


In [137]:
unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'],
                          columns=['three', 'two', 'one'])

In [138]:
unsorted_df

Unnamed: 0,three,two,one
a,,1.767152,-2.093752
d,0.390976,0.927584,
c,0.359196,-0.051403,-1.270354
b,0.448217,-0.483422,-1.004761


In [139]:
# Sort DataFrame by index
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,1.767152,-2.093752
b,0.448217,-0.483422,-1.004761
c,0.359196,-0.051403,-1.270354
d,0.390976,0.927584,


In [140]:
# Sort DataFrame by index
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,0.390976,0.927584,
c,0.359196,-0.051403,-1.270354
b,0.448217,-0.483422,-1.004761
a,,1.767152,-2.093752


In [141]:
# Sort DataFrame by column names
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,-2.093752,,1.767152
d,,0.390976,0.927584
c,-1.270354,0.359196,-0.051403
b,-1.004761,0.448217,-0.483422


In [143]:
# Sort Series by index
unsorted_df['three'].sort_index()

a         NaN
b    0.448217
c    0.359196
d    0.390976
Name: three, dtype: float64

In [144]:
# By values


## The Series.sort_values() method is used to sort a Series by its values. The DataFrame.sort_values() method is used to sort a DataFrame by its column or row values.



In [145]:
df1 = pd.DataFrame({'one': [2, 1, 1, 1],
                        'two': [1, 3, 2, 4],
                        'three': [5, 4, 3, 2]})

In [146]:
# Sort DataFrame by column "two"
df1.sort_values(by='two')

Unnamed: 0,one,two,three
0,2,1,5
2,1,2,3
1,1,3,4
3,1,4,2


In [147]:
# Sort DataFrame by columns "one" and "two"
df1[['one', 'two', 'three']].sort_values(by=['one', 'two'])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [148]:
## These methods have a special treatment of NA values via the na_position argument:

In [149]:
s[2] = np.nan

In [150]:
s.sort_values()

0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
2    <NA>
5    <NA>
dtype: string

In [151]:
s.sort_values(na_position='first')

2    <NA>
5    <NA>
0       A
3    Aaba
1       B
4    Baca
6    CABA
8     cat
7     dog
dtype: string

In [152]:
## We can use the name of the index to sort by both an index and a column.

In [153]:
# Build MultiIndex
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2),
                                   ('b', 2), ('b', 1), ('b', 1)])

In [154]:
idx.names = ['first', 'second']

In [155]:
# Build DataFrame
df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)},
                            index=idx)

In [156]:
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
a,1,6
a,2,5
a,2,4
b,2,3
b,1,2
b,1,1


In [157]:
# Sort DataFrame by 'second' (index) and 'A' (column)
df_multi.sort_values(by=['second', 'A'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A
first,second,Unnamed: 2_level_1
b,1,1
b,1,2
a,1,6
b,2,3
a,2,4
a,2,5
