In [7]:
import pandas as pd
import numpy as np

In [8]:
# Series: The object of the pandas library designed to represent one-dimensional data structures.
s = pd.Series([12,-4,7,9])
s

0    12
1    -4
2     7
3     9
dtype: int64

In [9]:
#if we want to include index exclusively then,
s = pd.Series([12,-4,7,9], index = ['a', 'b', 'c', 'd'])
s

a    12
b    -4
c     7
d     9
dtype: int64

In [10]:
# Selcting internal elements
s[2]

7

In [11]:
s['b']

-4

In [12]:
# Selecting multiple elements
s[0:2]

a    12
b    -4
dtype: int64

In [13]:
s[['b','c']]

b   -4
c    7
dtype: int64

In [14]:
#assigning values to the elemnets
s[1] = 0
s

a    12
b     0
c     7
d     9
dtype: int64

In [15]:
s['b'] = 1
s

a    12
b     1
c     7
d     9
dtype: int64

In [16]:
# Defining a series from NumPy arrays and other series
arr = np.array([1,2,3,4])
s3 = pd.Series(arr)
s3

0    1
1    2
2    3
3    4
dtype: int32

In [17]:
arr

array([1, 2, 3, 4])

In [18]:
arr[2] = -2
s3

0    1
1    2
2   -2
3    4
dtype: int32

In [19]:
# To select the element in the series that are greater than 8
s[s>8]

a    12
d     9
dtype: int64

In [20]:
# Evaluating values: To check the existence of duplicate values in a series, we use the function unique()
serd = pd.Series([1,0,2,1,2,3], index = ['White','White','Blue','Green','Green','Yellow'])
serd

White     1
White     0
Blue      2
Green     1
Green     2
Yellow    3
dtype: int64

In [21]:
serd.unique()

array([1, 0, 2, 3], dtype=int64)

In [22]:
# value_counts(): This function not only returns unique values but also calculates the occurence in series
serd.value_counts()

1    2
2    2
0    1
3    1
dtype: int64

In [23]:
# isin(): This function evaluates the relationship theta, whether the values presernt in series. It returns the boolean values
serd.isin([0,3])

White     False
White      True
Blue      False
Green     False
Green     False
Yellow     True
dtype: bool

In [24]:
serd.isin([1,2])

White      True
White     False
Blue       True
Green      True
Green      True
Yellow    False
dtype: bool

In [25]:
serd[serd.isin([0,3])]

White     0
Yellow    3
dtype: int64

In [26]:
# NaN values (Not A Number): To indicate the presence of an empty field or something that is not defined numeric.
s2 = pd.Series([5, -3, np.NAN, 14])
s2

0     5.0
1    -3.0
2     NaN
3    14.0
dtype: float64

In [27]:
s2.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [28]:
s2.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [29]:
s2[s2.notnull()]

0     5.0
1    -3.0
3    14.0
dtype: float64

In [30]:
s2[s2.isnull()]

2   NaN
dtype: float64

In [34]:
frame1 = pd.DataFrame(np.arange(16).reshape((4,4)), index = ['red','blue','yellow','white'], columns = ['ball', 'pen', 'pencil', 'paper'])
frame2 = pd.DataFrame(np.arange(12).reshape((4,3)), index = ['blue','green','white','yellow'], columns = ['mug','pen', 'ball'])
print(frame1)
print("")
print(frame2)

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15

        mug  pen  ball
blue      0    1     2
green     3    4     5
white     6    7     8
yellow    9   10    11


In [35]:
frame1.add(frame2)

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [36]:
frame1 + frame2

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [38]:
ser = pd.Series(np.arange(4), index = ['ball', 'pen', 'pencil', 'paper'])
print(ser)

ball      0
pen       1
pencil    2
paper     3
dtype: int32


In [40]:
frame1 - ser

Unnamed: 0,ball,pen,pencil,paper
red,0,0,0,0
blue,4,4,4,4
yellow,8,8,8,8
white,12,12,12,12


In [41]:
# if an index is not present in one of the two data strucures, the result will
ser['mug'] = 9
ser

ball      0
pen       1
pencil    2
paper     3
mug       9
dtype: int64

In [42]:
frame1 - ser

Unnamed: 0,ball,mug,paper,pen,pencil
red,0,,0,0,0
blue,4,,4,4,4
yellow,8,,8,8,8
white,12,,12,12,12


# FUNCTION APPLICATION AND MAPPING

1 fucntion by element
2 fucntion by row and column
3 statistical fucntions

In [43]:
#1 The pandas library is built on the foundations of Numpy and then extends many of it's features by adapting them to new
# data structures as series and dataframe. Among these are the universal functions,called unfunc.
np.sqrt(frame1)

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,1.414214,1.732051
blue,2.0,2.236068,2.44949,2.645751
yellow,2.828427,3.0,3.162278,3.316625
white,3.464102,3.605551,3.741657,3.872983


In [59]:
#2
f = lambda x:x.max()-x.min()
def f(x):
    return x.max()-x.min()
print(f(frame1))
print("")
print(frame1.apply(f))
print("")
print(frame1.apply(f,axis=1))
print("")
def f(x):
    return pd.Series([x.min(), x.max()], index=['min','max'])
print(frame1.apply(f))

ball      12
pen       12
pencil    12
paper     12
dtype: int32

ball      12
pen       12
pencil    12
paper     12
dtype: int64

red       3
blue      3
yellow    3
white     3
dtype: int64

     ball  pen  pencil  paper
min     0    1       2      3
max    12   13      14     15


In [60]:
#3
print(frame1.sum())
print("")
print(frame1.mean())
print("")
print(frame1.describe())

ball      24
pen       28
pencil    32
paper     36
dtype: int64

ball      6.0
pen       7.0
pencil    8.0
paper     9.0
dtype: float64

            ball        pen     pencil      paper
count   4.000000   4.000000   4.000000   4.000000
mean    6.000000   7.000000   8.000000   9.000000
std     5.163978   5.163978   5.163978   5.163978
min     0.000000   1.000000   2.000000   3.000000
25%     3.000000   4.000000   5.000000   6.000000
50%     6.000000   7.000000   8.000000   9.000000
75%     9.000000  10.000000  11.000000  12.000000
max    12.000000  13.000000  14.000000  15.000000


# Sorting and Ranking

Another fundamental operation that uses indexing is sorting. Sorting the data is often 
a necessity and it is very important to be able to do it easily. pandas provides the sort_
index() function, which returns a new object that’s identical to the start, but in which 
the elements are ordered.

In [62]:
s1 = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
print(s1)
print("")
print(s1.sort_index())

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

blue      0
green     4
red       5
white     8
yellow    3
dtype: int64


In [63]:
s1.sort_index(ascending = False)

yellow    3
white     8
red       5
green     4
blue      0
dtype: int64

In [65]:
print(frame1.sort_index())
print("")
print(frame1.sort_index(axis = 1))

        ball  pen  pencil  paper
blue       4    5       6      7
red        0    1       2      3
white     12   13      14     15
yellow     8    9      10     11

        ball  paper  pen  pencil
red        0      3    1       2
blue       4      7    5       6
yellow     8     11    9      10
white     12     15   13      14


In [66]:
s1.sort_values()

blue      0
yellow    3
green     4
red       5
white     8
dtype: int64

In [68]:
print(frame1.sort_values(by = 'pen'))
print("")
print(frame1.sort_values(by = ['pen', 'pencil']))

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


The ranking is an operation closely related to sorting. It mainly consists of assigning 
a rank (that is, a value that starts at 0 and then increase gradually) to each element of the 
series. The rank will be assigned starting from the lowest value to the highest

In [70]:
print(s1.rank())
print("")
print(s1.rank(method = 'first'))
print("")
print(s1.rank(ascending = False))

red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

red       2.0
blue      5.0
yellow    4.0
white     1.0
green     3.0
dtype: float64


# Correlation and Covariance

Two important statistical calculations are correlation and covariance, expressed in 
pandas by the corr() and cov() functions. These kind of calculations normally involve 
two series.
Covariance shows you how two variables differ, whereas correlation shows you how the two variablea are related.

In [71]:
seq2 = pd.Series([3,4,3,4,5,4,3,2],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq = pd.Series([1,2,3,4,4,3,2,1],['2006','2007','2008','2009','2010','2011','2012','2013'])
print(seq.corr(seq2))
print("")
print(seq.cov(seq2))

0.7745966692414835

0.8571428571428571


In [76]:
print(seq.corr(seq))
print(seq.cov(seq))

1.0
1.4285714285714284


In [80]:
frame2 = pd.DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]], index=['red','blue','yellow','white'], columns=['ball','pen','pencil','paper'])
print(frame2.corr())
print("")
print(frame2.cov())

            ball       pen    pencil     paper
ball    1.000000 -0.276026  0.577350 -0.763763
pen    -0.276026  1.000000 -0.079682 -0.361403
pencil  0.577350 -0.079682  1.000000 -0.692935
paper  -0.763763 -0.361403 -0.692935  1.000000

            ball       pen    pencil     paper
ball    2.000000 -0.666667  2.000000 -2.333333
pen    -0.666667  2.916667 -0.333333 -1.333333
pencil  2.000000 -0.333333  6.000000 -3.666667
paper  -2.333333 -1.333333 -3.666667  4.666667


In [81]:
ser = pd.Series([0,1,2,3,9], index=['red','blue','yellow','white','green'])
print(frame2.corrwith(ser))
print("")
print( frame2.corrwith(frame1))

ball      0.730297
pen      -0.831522
pencil    0.210819
paper    -0.119523
dtype: float64

ball      0.730297
pen      -0.831522
pencil    0.210819
paper    -0.119523
dtype: float64


# “Not a Number” Data

In the previous sections, you saw how easily missing data can be formed. They are 
recognizable in the data structures by the NaN (Not a Number) value. So, having values 
that are not defined in a data structure is quite common in data analysis.
However, pandas is designed to better manage this eventuality. In fact, in this 
section, you will learn how to treat these values so that many issues can be obviated. 
For example, in the pandas library, calculating descriptive statistics excludes NaN values 
implicitly

In [82]:
ser = pd.Series([0,1,2,np.NaN,9], index=['red','blue','yellow','white','green'])
ser

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

In [84]:
ser['white'] = None
ser

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64

# Filtering Out NaN Values
There are various ways to eliminate the NaN values during data analysis. Eliminating them 
by hand, element by element, can be very tedious and risky, and you’re never sure that 
you eliminated all the NaN values. This is where the dropna() function comes to your aid

In [85]:
print(ser.dropna())
print("")
print(ser.notnull())

red       0.0
blue      1.0
yellow    2.0
green     9.0
dtype: float64

red        True
blue       True
yellow     True
white     False
green      True
dtype: bool


In [87]:
frame3 = pd.DataFrame([[6,np.nan,6],[np.nan,np.nan,np.nan],[2,np.nan,5]], index = ['blue','green','red'], columns = ['ball','mug','pen'])
print(frame3)
print("")
print(frame3.dropna())
print("")
print(frame3.dropna(how='all'))

       ball  mug  pen
blue    6.0  NaN  6.0
green   NaN  NaN  NaN
red     2.0  NaN  5.0

Empty DataFrame
Columns: [ball, mug, pen]
Index: []

      ball  mug  pen
blue   6.0  NaN  6.0
red    2.0  NaN  5.0


# Filling in NaN Occurrences
Rather than filter NaN values within data structures, with the risk of discarding them 
along with values that could be relevant in the context of data analysis, you can replace 
them with other numbers. For most purposes, the fillna() function is a great choice. 
This method takes one argument, the value with which to replace any NaN. It can be the 
same for all cases

In [88]:
print(frame3.fillna(0))
print("")
print(frame3.fillna({'ball':1,'mug':0,'pen':99}))

       ball  mug  pen
blue    6.0  0.0  6.0
green   0.0  0.0  0.0
red     2.0  0.0  5.0

       ball  mug   pen
blue    6.0  0.0   6.0
green   1.0  0.0  99.0
red     2.0  0.0   5.0


# Hierarchical Indexing and Leveling
Hierarchical indexing is a very important feature of pandas, as it allows you to have 
multiple levels of indexes on a single axis. It gives you a way to work with data in multiple 
dimensions while continuing to work in a two-dimensional structure.

In [91]:
s2 = pd.Series(np.random.rand(8), index=[['white','white','white','blue','blue','red','red','red'], ['up','down','right','up','down','up','down','left']])
print(s2)
print("")
print(s2.index)
print("")
print(s2['white'])
print("")
print(s2[:, 'up'])
print("")
print(s2['white', 'up'])

white  up       0.877203
       down     0.484977
       right    0.288400
blue   up       0.431322
       down     0.686827
red    up       0.815620
       down     0.944900
       left     0.644017
dtype: float64

MultiIndex([('white',    'up'),
            ('white',  'down'),
            ('white', 'right'),
            ( 'blue',    'up'),
            ( 'blue',  'down'),
            (  'red',    'up'),
            (  'red',  'down'),
            (  'red',  'left')],
           )

up       0.877203
down     0.484977
right    0.288400
dtype: float64

white    0.877203
blue     0.431322
red      0.815620
dtype: float64

0.8772026431572727


Hierarchical indexing plays a critical role in reshaping data and group-based 
operations such as a pivot-table. For example, the data could be rearranged and used 
in a dataframe with a special function called unstack(). This function converts the 
series with a hierarchical index to a simple dataframe, where the second set of indexes is 
converted into a new set of columns.

In [92]:
s2.unstack()

Unnamed: 0,down,left,right,up
blue,0.686827,,,0.431322
red,0.9449,0.644017,,0.81562
white,0.484977,,0.2884,0.877203


If what you want is to perform the reverse operation, which is to convert a dataframe 
to a series, you use the stack() function

In [94]:
frame1.stack()

red     ball       0
        pen        1
        pencil     2
        paper      3
blue    ball       4
        pen        5
        pencil     6
        paper      7
yellow  ball       8
        pen        9
        pencil    10
        paper     11
white   ball      12
        pen       13
        pencil    14
        paper     15
dtype: int32