In [5]:
import pandas as pd


In [2]:
students = ['Alice', 'Jack', 'Molly']

pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [4]:
# Let's create a little list of numbers

numbers = [1, 2, 3]
# And turn into series
pd.Series(numbers)

0    1
1    2
2    3
dtype: int64

In [5]:
students = ['Alice', 'Jack', None]
pd.Series(students)

0    Alice
1     Jack
2     None
dtype: object

In [6]:
# However, if we create a list of numbers, integers or floats, and put in the None type, pandas automatically converts this
# to a special floating point value designated as NaN, which stands for "Not a Number"
numbers = [1, 2, None]
pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [7]:
# NaN is "NOT" equivalent to None and when we try the equality test, the result is False.

# Let's bring in numpy which allows us to generate an NaN value
import numpy as np
# And lets compare it to None
np.nan == None

False

In [8]:
# Instead, you need to use special functions to test for the presence of not a number, such as the Numpy library isnan().

np.isnan(np.nan)

True

In [9]:
# A series can be created directly from dictionary data. If you do this, the index is automatically assigned to the keys of 
# the dictionary data. If you do this, the index is automatically assigned to the keys of the dictionary that you provided
# and not just incrementing integers

students_scores = {'Alice': 'Physics',
                  'Jack': 'Chemistry',
                  'Molly': 'English'}
s = pd.Series(students_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [10]:
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [11]:
# Let's create a list of tuples
students = [('Alice', 'Brown'), ('Jack', 'White'), ('Molly', 'Green')]
pd.Series(students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [12]:
# You can also separate your index creation from the data by passing in the index as a list explicitly to the series

s = pd.Series(['Physics', 'Chemistry', 'English'], index=['Alice', 'Jack', 'Molly'])
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [13]:
students_scores = {'Alice': 'Physics',
                  'Jack': 'Chemistry',
                  'Molly': 'English'}
s = pd.Series(students_scores, index=['Alice', 'Molly', 'Sam'])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

# Pandas: Second lesson

In [6]:
# A pandas Series can be queried either by the index position or the index label. If you don't give an index
# to the series when querying, the position and the label are effectively the same values. To query by numeric location,
# starting at zero, use the iloc attribute. To query by the index label, you can use the loc attribute

# Let's start with an example. We'll use students enrolled in classes coming from a dictionary 
student_classes = {'Alic': 'Physics',
                  'Jack': 'Chemistry',
                  'Molly': 'English',
                  'Sam': 'History'}
s = pd.Series(student_classes)
s

Alic       Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [15]:
# So this series, if you wanted to see the fourth entry we would use the iloc attribute with the parameter 3.
s.iloc[3]

'History'

In [16]:
# If you wanted to see what class Molly has, we would use the loc attribute with a parameter of Molly
s.loc['Molly']

'English'

In [1]:
# Keep in mind that iloc and loc are not methods, they are attributes. So you don't use parenthesis to query them,
# but square brackets instead, which is called the indexing operator. In Python this calls get or set for an item depending
# on the context of its use.

# This might seem a bit confusing if you're used to languages where encapsulation of attributes, variables, and properties 
# is common, such as in Java.

In [8]:
# Pandas tries to make our code a bit more readable and provides a sort of smart syntax using the indexing operator
# directly on the series itself. For instance, if you pass in an integer parameter, the operator will behave as if you want it
# to query via the iloc attribute
s[3]

'History'

In [9]:
# If you pass in an object, it will query as if you wanted to use the label based loc attribute.
s['Molly']

'English'

In [10]:
# So what happens if your index is a list of integers ? This is a bit complicated and Pandas can't determine automatically
# whether you're intending to query by index position or index label. So you need to be more explicit and use the iloc or
# loc attributes directly.

# Here's an example using class and their classcode information, where classes are indexed by classcodes, in the form of
# integers
class_code = { 99: 'Physics',
             100: 'Chemistry',
             101: 'English',
             102: 'History'}
s = pd.Series(class_code)

In [11]:
# If we try and call s[0] we get a key error because there's no item in the classes list with an index of zero,
# instead we have to call iloc explicitly if we want the first item.
s[0]

KeyError: 0

In [12]:
# Let's talk about working with the data

grades = pd.Series([90, 80, 70, 60])

total = 0
for grade in grades:
    total += grade
print(total/len(grades))

75.0


In [13]:
# This works, but it's slow. 
# Pandas and the underlying numpy libraries support a method of computation called vectorization.
# Vectorization works with the most of the functions in the numpy library, including the sum function

In [14]:
# Here's how we would really write the code using the numpy sum method. First we need to import 
# the numpy module
import numpy as np

# Then we just call np.sum and pass an iterable item. In this case, our pandas series.
total = np.sum(grades)
print(total/len(grades))


75.0


In [15]:
# Let's create a series of random numbers. This is used a lot when demonstraing techniques with Pandas
numbers = pd.Series(np.random.randint(0,1000,10000))

# Now lets look at the top five items in that series to make sure they actually seem random. We can do this with the head()
# function
numbers.head()

0    302
1    247
2    879
3    810
4    670
dtype: int32

In [16]:
# We can actually verify that length of the series is correct using len function
len(numbers)

10000

In [17]:
# Ok, we're confident now that we have a big series. The ipython interpreter has something called magic functions begin 
# with a percentage sign. If we type this sign and then hit the Tab key, you can see a list of the available functions. You 
# could write your own magic functions too

In [18]:
# Here, we're actually going to use what's called a cellular magic function. These start with two percentage signs and wrap the 
# code in the current Jupyter cell. The function we're going to use is called timeit. This function will run our code a few
# times to determine, on average, how long it takes.

# Let's run timeit with our original iterative code. You can give timeit the number of loops that you would like to run.
# By default, it is 1,000 loops.



In [19]:
%%timeit -n 100
total = 0
for number in numbers:
    total+=number
total/len(numbers)

2.43 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
# Not bad. Timeit ran the code and it doesn't seem to take very long at all. Now let's try with vectorization.


In [21]:
%%timeit -n 100
total = np.sum(numbers)
total/len(numbers)

183 µs ± 54.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
# Wow! This is a pretty shocking difference in the speed and demonstrates why one should be aware of parallel computing
# features and start thinking in functional programming terms. Put more simply, vectorization is the ability for a computer 
# to execute multiple instructions at once, and with high performance chips, especially graphics cards, you can get dramatic
# speedups. Modern graphics cards can run thousands of instructions in parallel.

In [23]:
# A related feature in pandas and numpy is called broadcasting. With broadcasting, you can apply an operation to every value
# in the series, changing the series. For instance, if we wanted to increase every random variable by 2, we could do so quickly
# using += operator directly on the Series object.

# Let's look at the head of our series
numbers.head()

0    302
1    247
2    879
3    810
4    670
dtype: int32

In [24]:
# And now let's just increase everything in the series by 2
numbers+=2
numbers.head()

0    304
1    249
2    881
3    812
4    672
dtype: int32

In [30]:
# The procedural way of doing this would be to iterate through all of the items in the series and increase the values directly 
# Pandas does support iterating through a series much like a dictionary, allowing you to unpack values easily.

# We can use the iteritems() function which returns a label and value
for label, value in numbers.iteritems():
    # now for the item which is returned, lets call replace() 
    numbers.replace(label, value+2)
numbers.head()

0    304
1    249
2    881
3    812
4    672
dtype: int32

In [31]:
# Lets look at some speed comparisons. First, lets try five loops using the iterative approach


In [40]:
%%timeit -n 10
# we'll create a blank new series of items to deal with
s = pd.Series(np.random.randint(0,1000,10000))

for label, value in s.iteritems():
    s.loc[label] = value+2
    


426 ms ± 65.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [36]:
# Now, lets try that using the broadcasting methods


In [41]:
%%timeit -n 10

s = pd.Series(np.random.randint(0,1000,1000))
# And we just broadcast with +=
s+=2

399 µs ± 170 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [42]:
# Amazing. Not only is it significantly faster, but it's more concise and even easier  to read too. The typical mathematical 
# operations you would expect are vectorized, and numpy documentation outlines what it takes to create functions on your own.


In [43]:
# Ones last note on using the indexing operators to access series data. The .loc attribute lets you not only modify data in 
# place, but also add new data as well. If the value you pass in as the index doesn't exist, then a new entry is added.
# And keep in mind, indices can have mixed types. While it's important to be aware of the typing going on underneath,
# Pandas will automatically change the underlying NumPy types as appropriate.


In [44]:
# Here's an example using a Series of a few numbers.
s = pd. Series([1, 2, 3])

# We could add some new value, maybe a unversity course
s.loc['History'] = 102

s

0            1
1            2
2            3
History    102
dtype: int64

In [45]:
# We see that mixed types for data values or index labels are no problem for Pandas. Since "History" is not in the
# original list of indices, s.loc['History'] essentially creates a new element in the series, with the index named
# "History", amd the value of 102

In [46]:
# Up until now I've shown only examples of a series where the index values were unique. I want to end this lecture 
# by showing an example where index values are not unique, and this makes pandas Series a little different from conceptually 
# then for instance, a relational database.

students_classes = pd.Series({'Alice': 'Physics',
                             'Jack': 'Chemistry',
                             'Molly': 'English',
                             'Sam': 'History'})

students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [47]:
kelly_classes = pd.Series(['Philosophy', 'Arts', 'Math'], index=['Kelly', 'Kelly', 'Kelly'])
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [49]:
all_students_classes = students_classes.append(kelly_classes)

all_students_classes

Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [50]:
# append() is not a in place modifier
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [51]:
# Finally, we see that when we query the appended series for Kelly, we don't get a single velue, but a series itself
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object