In [1]:
# %load command1.py
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

%config InlineBackend.figure_format='svg'
plt.rcParams['figure.dpi']=120

pd.options.display.float_format='{:,.2f}'.format
pd.set_option('display.max_colwidth', None)


## Create a Series

**from a list**

In [2]:
# from a list
companies = ['Google', 'Microsoft', 'Facebook', 'Apple']

pd.Series(companies)

# Custom Index
pd.Series(companies,index=[100,101,102,103])

# Custom Index
pd.Series(companies,index=['GOOGL','MSFT','FB','AAPL'])

0       Google
1    Microsoft
2     Facebook
3        Apple
dtype: object

100       Google
101    Microsoft
102     Facebook
103        Apple
dtype: object

GOOGL       Google
MSFT     Microsoft
FB        Facebook
AAPL         Apple
dtype: object

**from a dict**

In [3]:
companies = {
    'a': 'Google',
    'b': 'Microsoft',
    'c': 'Facebook',
    'd': 'Apple'
}
pd.Series(companies)

# When index is specified
pd.Series(
    companies, 
    index=['a', 'b', 'd']
)

a       Google
b    Microsoft
c     Facebook
d        Apple
dtype: object

a       Google
b    Microsoft
d        Apple
dtype: object

**from a scalar**

In [4]:
pd.Series(10, index=[100, 101, 102, 103])

100    10
101    10
102    10
103    10
dtype: int64

**with the read_csv()**

In [5]:
%%writefile ./pandasData/dataSeries.csv
date,product,price,cost,profit
2019/1/1,A,10,5,1
2019/1/2,B,20,12,2
2019/1/3,C,30,20,3
2019/1/4,D,40,30,4

Overwriting ./pandasData/dataSeries.csv


In [6]:
pd.read_csv('./pandasData/dataSeries.csv')

Unnamed: 0,date,product,price,cost,profit
0,2019/1/1,A,10,5,1
1,2019/1/2,B,20,12,2
2,2019/1/3,C,30,20,3
3,2019/1/4,D,40,30,4


If we want the data to be imported into a Series instead of a DataFrame, we can provide additional arguments usecols and squeeze. The squeeze=True will convert a DataFrame of one column into a Series.

In [7]:
pd.read_csv('./pandasData/dataSeries.csv', usecols=['product'], squeeze=True)



  pd.read_csv('./pandasData/dataSeries.csv', usecols=['product'], squeeze=True)


0    A
1    B
2    C
3    D
Name: product, dtype: object

In [8]:
pd.read_csv('./pandasData/dataSeries.csv', usecols=['product']).squeeze()

0    A
1    B
2    C
3    D
Name: product, dtype: object

## Retrieving data

**with the read_csv()**

In [9]:
s = pd.Series([1,2,3,4,5],index = ['a','b','c','d','e'])

s[0]
s[:3]
s[2:4]
s[::2]

1

a    1
b    2
c    3
dtype: int64

c    3
d    4
dtype: int64

a    1
c    3
e    5
dtype: int64

**with index/label**

In [10]:
s = pd.Series([1,2,3,4,5],index = ['a','b','c','d','e'])
s['a']
s[['b','c','d']]

1

b    2
c    3
d    4
dtype: int64

## Attributes

**Values and indexes**

In [11]:
companies = ['Google', 'Microsoft', 'Facebook', 'Apple']
s = pd.Series(companies)
s
s.values
s.index
s.is_unique

0       Google
1    Microsoft
2     Facebook
3        Apple
dtype: object

array(['Google', 'Microsoft', 'Facebook', 'Apple'], dtype=object)

RangeIndex(start=0, stop=4, step=1)

True

**Data type and size**

In [12]:
s.dtype
s.size
s.shape
s.ndim

dtype('O')

4

(4,)

1

## Methods

**Showing rows**

In [13]:
prices = [10, 5, 3, 2.5, 8, 11]

s = pd.Series(prices)
s.head(2)
s.tail(2)

0   10.00
1    5.00
dtype: float64

4    8.00
5   11.00
dtype: float64

**Aggregations**

In [14]:
s.mean()
s.sum()
s.product()
s.agg(['mean', 'sum', 'product'])

6.583333333333333

39.5

33000.0

mean           6.58
sum           39.50
product   33,000.00
dtype: float64

**Counting values**

In [15]:
s = pd.Series(['a','b','b','a','a'])
s.unique()
s.nunique()
s.value_counts()

array(['a', 'b'], dtype=object)

2

a    3
b    2
dtype: int64

**Sorting by values or index labels**

In [16]:
prices = [10, 5, 3, 2.5, 8, 11]

s = pd.Series(prices)
s.sort_values()
s.sort_values(ascending=False)

3    2.50
2    3.00
1    5.00
4    8.00
0   10.00
5   11.00
dtype: float64

5   11.00
0   10.00
4    8.00
1    5.00
2    3.00
3    2.50
dtype: float64

In [17]:
# To modify the original series
s.sort_values(inplace=True)
s

# ascending by default
s.sort_index()

3    2.50
2    3.00
1    5.00
4    8.00
0   10.00
5   11.00
dtype: float64

0   10.00
1    5.00
2    3.00
3    2.50
4    8.00
5   11.00
dtype: float64

In [18]:
# To sort it in descenting order
s.sort_index(ascending=False)
s

5   11.00
4    8.00
3    2.50
2    3.00
1    5.00
0   10.00
dtype: float64

3    2.50
2    3.00
1    5.00
4    8.00
0   10.00
5   11.00
dtype: float64

In [19]:
# To modify the original series
s.sort_index(inplace=True)
s

0   10.00
1    5.00
2    3.00
3    2.50
4    8.00
5   11.00
dtype: float64

**Working with missing values**

In [20]:
s = pd.Series([1, 2, 3, np.nan, np.nan])
s.isna()
s.isna().sum()
s.count()

0    False
1    False
2    False
3     True
4     True
dtype: bool

2

3

**Searching values**

In [21]:
prices = [10, 5, 3, 2.5, 8, 11]

s = pd.Series(prices)
s.nlargest()
s.nlargest(2)
s.le(5, fill_value=0) # less than or equal
s <= 5

5   11.00
0   10.00
4    8.00
1    5.00
2    3.00
dtype: float64

5   11.00
0   10.00
dtype: float64

0    False
1     True
2     True
3     True
4    False
5    False
dtype: bool

0    False
1     True
2     True
3     True
4    False
5    False
dtype: bool

## Working with PYthon built-in functions

In [22]:
prices = [10, 5, 3, 2.5, 8, 11]

s = pd.Series(prices)
len(s)
type(s)
dir(s)
list(s)
dict(s)

6

pandas.core.series.Series

['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__redu

[10.0, 5.0, 3.0, 2.5, 8.0, 11.0]

{0: 10.0, 1: 5.0, 2: 3.0, 3: 2.5, 4: 8.0, 5: 11.0}

Python `in` keyword returns a boolean value that compares the value you provide to the values in the list. It's going to return `True` if it exists among those values and `False` if it does not.

In [23]:
# by default Pandas is going to look among the index labels not the actual values within the Series. 
2.5 in s

False

In [24]:
2.5 in s.values

True