# Introduction to Pandas and Series Data

In [1]:
import pandas as pd
import numpy as np

## None and NaN

If a Series is composed of strings, missing values are stored as `None`.

However, in a Series of integers or floats a missing value is stored as `NaN`, and integer values are converted to floats as a result.

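These two values are not equivalent to each other. In fact, `NaN` is not even equal to itself, so it has to be detected with a function such as `np.isnan` rather than with `==`.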

In [18]:
# Two small lists, each with one missing entry, to compare how pandas stores the gap.
students = ['Alice', 'Jack', None]
numbers = [1, 2, None]

# Convert both lists and display the resulting pair of Series together.
tuple(pd.Series(values) for values in (students, numbers))

(0    Alice
 1     Jack
 2     None
 dtype: object,
 0    1.0
 1    2.0
 2    NaN
 dtype: float64)

In [19]:
# Equality is useless for spotting NaN; np.isnan is the check that works.
(
    np.nan == None,      # NaN compared with None
    np.nan == np.nan,    # NaN compared with itself
    np.isnan(np.nan),    # explicit NaN test
)

(False, False, True)

## Series

In [22]:
# Map each student to a subject; the dictionary keys label the Series entries.
students = dict(Alice='Physics', Jack='Chemistry', Molly='English')

s = pd.Series(data=students)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [25]:
# Show the index object of `s`, i.e. the labels attached to its values.
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [30]:
# Build the Series from a list of values and a separate list of labels.
s = pd.Series(
    data=['Physics', 'Chemistry', 'English'],
    index=['Alice', 'Jack', 'Molly'],
)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [34]:
# Labels wanted in the result, in this order.
students = ['Alice', 'Molly', 'Sam']
students_scores = dict(Alice='Physics', Jack='Chemistry', Molly='English')
# With an explicit index only the listed labels are kept: 'Jack' is left out,
# and 'Sam', which has no entry in the dictionary, comes back as a missing value.
s = pd.Series(data=students_scores, index=students)
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

### Querying Series

A Series can be queried either by index position or by index label. If you don't supply an index, pandas assigns the default integer index (0, 1, 2, ...), so positions and labels coincide.

You can use the following attributes:

Numeric location
- `iloc[x: int]`

Index label
- `loc[x: label]` (the label has the same type as the index, e.g. `str` or `int`)

In [35]:
# Four students and their subjects, used for the lookup examples that follow.
students = dict(
    Alice='Physics',
    Jack='Chemistry',
    Molly='English',
    Sam='History',
)

s = pd.Series(data=students)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [44]:
# Four ways to fetch the same element: by position with `iloc`, by label with `loc`,
# and with plain `[]` given either an integer or a label.
# The 3rd and 4th forms can be confusing and lead to errors when the index labels are numbers,
# because `[]` then has to guess whether an integer means a position or a label.
# NOTE(review): newer pandas releases may warn on (or reject) `s[3]` when the index is not
# integer-based — confirm against the installed version.
s.iloc[3], s.loc['Sam'], s[3], s['Sam']

('History', 'History', 'History', 'History')

In [49]:
# Class codes are consecutive integers starting at 99, so number the subjects from there.
class_code = dict(enumerate(['Physics', 'Chemistry', 'English', 'History'], start=99))
# The integer keys become the index labels of the Series.
s = pd.Series(data=class_code)
s

99       Physics
100    Chemistry
101      English
102      History
dtype: object

In [54]:
# Four grades with no explicit index, so pandas labels them with the default 0..3.
grades = pd.Series(data=[90, 80, 70, 60])
grades

0    90
1    80
2    70
3    60
dtype: int64

In [58]:
# Add the grades up by hand, one element at a time, to compare with the built-in reductions.
total = 0
for grade in grades:
    total += grade

# Display the manual total next to the Series method and the NumPy function for the same sum.
total, grades.sum(), np.sum(grades)

(300, 300, 300)

In [69]:
# 10,000 random integers in the half-open range [0, 1000).
# No seed is set in this cell, so the sample changes between runs.
numbers = pd.Series(np.random.randint(low=0, high=1000, size=10000))
# Preview only the first rows instead of printing the whole Series.
numbers.head()

0     26
1    488
2    283
3     38
4    400
dtype: int32

In [63]:
%%timeit -n 100
# Time the mean of `numbers` computed with an explicit Python loop
# (the cell magic must stay on the first line; -n 100 runs the body 100 times per measurement).
total = 0
for number in numbers:
    total += number
# Mean = accumulated sum divided by the number of elements.
total/len(numbers)

1.32 ms ± 351 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [72]:
%%timeit -n 100
# Time the same mean computed by the Series method, using the same -n 100 setting as the loop cell.
numbers.mean()

214 µs ± 78.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [75]:
# Work on a copy so that the assignment below does not modify `s`.
a = s.copy()
# Overwrite the third element (position 2) of the copy only.
a.iloc[2] = 2
# Display the copy and the original side by side to compare them.
a, s

(99       Physics
 100    Chemistry
 101            2
 102      History
 dtype: object,
 99       Physics
 100    Chemistry
 101      English
 102      History
 dtype: object)