In [2]:
#Data Analysis using Pandas 

In [48]:
##Python for Finance, 2nd Edition. This notebook summarizes important concepts and functions of the pandas library that you will need to analyze and summarize data.


In [None]:
'''
Frequently used functions of pandas
DataFrame()
date_range()
'''

In [5]:
import pandas as pd
# Build a one-column frame: four integers labelled by a custom string index.
df = pd.DataFrame(
    data=[10, 20, 30, 40],
    index=['a', 'b', 'c', 'd'],
    columns=['numbers'],
)
df

Unnamed: 0,numbers
a,10
b,20
c,30
d,40


In [7]:
#Things to be noted:

## Data itself can be provided in different shapes and types (list, tuple, ndarray, and dict objects are candidates).
## Data is organized in columns, which can have custom names (labels).
## There is an index that can take on different formats (e.g., numbers, strings, time information).



In [8]:
df.columns

Index(['numbers'], dtype='object')

In [9]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [13]:
df.loc['a']

numbers    10
Name: a, dtype: int64

In [15]:
df.sum()

numbers    100
dtype: int64

In [16]:
# Square every value; apply() hands each column to the function in turn.
df.apply(lambda column: column ** 2)

Unnamed: 0,numbers
a,100
b,400
c,900
d,1600


In [17]:
df**2

Unnamed: 0,numbers
a,100
b,400
c,900
d,1600


In [18]:
# Add a second column; the four values are assigned to the rows in order.
df['floats'] = (1.5, 2.5, 3.5, 4.5)

In [19]:
df

Unnamed: 0,numbers,floats
a,10,1.5
b,20,2.5
c,30,3.5
d,40,4.5


In [20]:
df['floats']

a    1.5
b    2.5
c    3.5
d    4.5
Name: floats, dtype: float64

In [21]:
# A single column is a Series, so build one directly instead of a whole
# DataFrame. The assignment aligns on the index labels, not on position,
# which is why the names end up reordered to match rows a, b, c, d.
df['names'] = pd.Series(['Yves', 'Sandra', 'Lilli', 'Henry'],
                        index=['d', 'a', 'b', 'c'])

In [22]:
df

Unnamed: 0,numbers,floats,names
a,10,1.5,Sandra
b,20,2.5,Lilli
c,30,3.5,Henry
d,40,4.5,Yves


In [26]:
 # DataFrame.append() was removed in pandas 2.0; pd.concat() is its replacement.
 # Wrapping the dict in a one-element list turns it into a single row, and
 # ignore_index=True renumbers the result 0..n-1 (the a-d labels are dropped).
 # The result is only displayed here; df itself is left unchanged.
 pd.concat(
     [df, pd.DataFrame([{'numbers': 100, 'floats': 5.75, 'names': 'Jil'}])],
     ignore_index=True,
 )

Unnamed: 0,numbers,floats,names
0,10,1.5,Sandra
1,20,2.5,Lilli
2,30,3.5,Henry
3,40,4.5,Yves
4,100,5.75,Jil


In [29]:
# DataFrame.append() was removed in pandas 2.0; pd.concat() is its replacement.
# All-scalar values need an explicit index to form a one-row frame labelled 'y'.
# Note: df is rebound to the extended frame, so running this cell again adds
# another 'y' row each time.
df = pd.concat([df, pd.DataFrame({'numbers': 100, 'floats': 5.75,
                                  'names': 'Jil'}, index=['y'])])

In [30]:
df

Unnamed: 0,numbers,floats,names
a,10,1.5,Sandra
b,20,2.5,Lilli
c,30,3.5,Henry
d,40,4.5,Yves
y,100,5.75,Jil
y,100,5.75,Jil


In [31]:
# DataFrame.append() was removed in pandas 2.0; pd.concat() is its replacement.
# The new row only provides 'names', so 'numbers' and 'floats' are NaN for 'z'
# (which turns the integer column into floats); sort=False keeps column order.
df = pd.concat([df, pd.DataFrame({'names': 'Liz'}, index=['z'])],
               sort=False)

In [32]:
df

Unnamed: 0,numbers,floats,names
a,10.0,1.5,Sandra
b,20.0,2.5,Lilli
c,30.0,3.5,Henry
d,40.0,4.5,Yves
y,100.0,5.75,Jil
y,100.0,5.75,Jil
z,,,Liz


In [33]:
df.dtypes

numbers    float64
floats     float64
names       object
dtype: object

In [34]:
df[['numbers', 'floats']].mean()

numbers    50.000000
floats      3.916667
dtype: float64

In [35]:
df[['numbers', 'floats']].std()

numbers    40.000000
floats      1.736855
dtype: float64

In [36]:
#Second Steps with the DataFrame Class

In [37]:
 # Seed NumPy's global random number generator so that the random draws made
 # further down are reproducible when the cells are run in order.
 import numpy as np
 np.random.seed(100)

In [38]:
# Draw a 9x4 array of standard normally distributed numbers and display it.
a = np.random.standard_normal(size=(9, 4))
a

array([[-1.74976547,  0.3426804 ,  1.1530358 , -0.25243604],
       [ 0.98132079,  0.51421884,  0.22117967, -1.07004333],
       [-0.18949583,  0.25500144, -0.45802699,  0.43516349],
       [-0.58359505,  0.81684707,  0.67272081, -0.10441114],
       [-0.53128038,  1.02973269, -0.43813562, -1.11831825],
       [ 1.61898166,  1.54160517, -0.25187914, -0.84243574],
       [ 0.18451869,  0.9370822 ,  0.73100034,  1.36155613],
       [-0.32623806,  0.05567601,  0.22239961, -1.443217  ],
       [-0.75635231,  0.81645401,  0.75044476, -0.45594693]])

In [39]:
# Wrap the ndarray in a DataFrame (default integer row and column labels).
# This rebinds df: the frame built earlier in the notebook is no longer
# reachable under this name.
df = pd.DataFrame(a)

In [40]:
df

Unnamed: 0,0,1,2,3
0,-1.749765,0.34268,1.153036,-0.252436
1,0.981321,0.514219,0.22118,-1.070043
2,-0.189496,0.255001,-0.458027,0.435163
3,-0.583595,0.816847,0.672721,-0.104411
4,-0.53128,1.029733,-0.438136,-1.118318
5,1.618982,1.541605,-0.251879,-0.842436
6,0.184519,0.937082,0.731,1.361556
7,-0.326238,0.055676,0.2224,-1.443217
8,-0.756352,0.816454,0.750445,-0.455947


In [41]:
df.dtypes

0    float64
1    float64
2    float64
3    float64
dtype: object

In [42]:
# Replace the default integer column labels with No1..No4, then display.
df.columns = [f'No{i}' for i in range(1, 5)]
df

Unnamed: 0,No1,No2,No3,No4
0,-1.749765,0.34268,1.153036,-0.252436
1,0.981321,0.514219,0.22118,-1.070043
2,-0.189496,0.255001,-0.458027,0.435163
3,-0.583595,0.816847,0.672721,-0.104411
4,-0.53128,1.029733,-0.438136,-1.118318
5,1.618982,1.541605,-0.251879,-0.842436
6,0.184519,0.937082,0.731,1.361556
7,-0.326238,0.055676,0.2224,-1.443217
8,-0.756352,0.816454,0.750445,-0.455947


In [43]:
df['No2'].mean()

0.7010330941456459

In [45]:
# Nine month-end timestamps starting in January 2019. The offset object is used
# instead of the string alias 'M', which pandas 2.2 deprecated in favour of 'ME';
# pd.offsets.MonthEnd() yields the same index on both old and new versions.
dates = pd.date_range('2019-1-1', periods=9, freq=pd.offsets.MonthEnd())
dates

DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
               '2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31',
               '2019-09-30'],
              dtype='datetime64[ns]', freq='M')

In [46]:
# Use the month-end dates as the row labels (one date per row, in order).
df.index = dates

In [47]:
df

Unnamed: 0,No1,No2,No3,No4
2019-01-31,-1.749765,0.34268,1.153036,-0.252436
2019-02-28,0.981321,0.514219,0.22118,-1.070043
2019-03-31,-0.189496,0.255001,-0.458027,0.435163
2019-04-30,-0.583595,0.816847,0.672721,-0.104411
2019-05-31,-0.53128,1.029733,-0.438136,-1.118318
2019-06-30,1.618982,1.541605,-0.251879,-0.842436
2019-07-31,0.184519,0.937082,0.731,1.361556
2019-08-31,-0.326238,0.055676,0.2224,-1.443217
2019-09-30,-0.756352,0.816454,0.750445,-0.455947


In [49]:
df.values

array([[-1.74976547,  0.3426804 ,  1.1530358 , -0.25243604],
       [ 0.98132079,  0.51421884,  0.22117967, -1.07004333],
       [-0.18949583,  0.25500144, -0.45802699,  0.43516349],
       [-0.58359505,  0.81684707,  0.67272081, -0.10441114],
       [-0.53128038,  1.02973269, -0.43813562, -1.11831825],
       [ 1.61898166,  1.54160517, -0.25187914, -0.84243574],
       [ 0.18451869,  0.9370822 ,  0.73100034,  1.36155613],
       [-0.32623806,  0.05567601,  0.22239961, -1.443217  ],
       [-0.75635231,  0.81645401,  0.75044476, -0.45594693]])

In [50]:
np.array(df)

array([[-1.74976547,  0.3426804 ,  1.1530358 , -0.25243604],
       [ 0.98132079,  0.51421884,  0.22117967, -1.07004333],
       [-0.18949583,  0.25500144, -0.45802699,  0.43516349],
       [-0.58359505,  0.81684707,  0.67272081, -0.10441114],
       [-0.53128038,  1.02973269, -0.43813562, -1.11831825],
       [ 1.61898166,  1.54160517, -0.25187914, -0.84243574],
       [ 0.18451869,  0.9370822 ,  0.73100034,  1.36155613],
       [-0.32623806,  0.05567601,  0.22239961, -1.443217  ],
       [-0.75635231,  0.81645401,  0.75044476, -0.45594693]])

In [55]:
#Things to be noted for ARRAYS AND DATAFRAMES
###One can generate a DataFrame object from an ndarray object, but one can also easily generate an ndarray object out of a DataFrame by using the values attribute of the DataFrame class or the function np.array() of NumPy.

In [56]:
#Basic Analytics

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9 entries, 2019-01-31 to 2019-09-30
Freq: M
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   No1     9 non-null      float64
 1   No2     9 non-null      float64
 2   No3     9 non-null      float64
 3   No4     9 non-null      float64
dtypes: float64(4)
memory usage: 360.0 bytes


In [58]:
df.describe()

Unnamed: 0,No1,No2,No3,No4
count,9.0,9.0,9.0,9.0
mean,-0.150212,0.701033,0.289193,-0.387788
std,0.988306,0.457685,0.57992,0.877532
min,-1.749765,0.055676,-0.458027,-1.443217
25%,-0.583595,0.34268,-0.251879,-1.070043
50%,-0.326238,0.816454,0.2224,-0.455947
75%,0.184519,0.937082,0.731,-0.104411
max,1.618982,1.541605,1.153036,1.361556
