## Creating DataFrame from scratch

#### Import Pandas and set some display options for output

In [1]:
# Reference Pandas and Numpy
import numpy as np
import pandas as pd
# Output configuration: plain-text reprs instead of HTML tables, and
# truncate any frame wider than 10 columns or longer than 10 rows.
for option_name, option_value in (
    ('display.notebook_repr_html', False),
    ('display.max_columns', 10),
    ('display.max_rows', 10),
):
    pd.set_option(option_name, option_value)

### Creating a DataFrame from scratch

#### Create DataFrame from 2-D ndarray

In [2]:
pd.DataFrame(np.array([[10,20],[21,22]]))

    0   1
0  10  20
1  21  22

#### Create DataFrame object by passing a list of Series object

In [3]:
# Every Series in the list becomes one row of the resulting frame.
df1 = pd.DataFrame(
    [pd.Series(np.arange(start, start + 10)) for start in (10, 30)]
)
df1

    0   1   2   3   4   5   6   7   8   9
0  10  11  12  13  14  15  16  17  18  19
1  30  31  32  33  34  35  36  37  38  39

#### Dimensions of DataFrame objects

In [4]:
df1.shape

(2, 10)

### Specifying the column name

In [5]:
# Column labels are given explicitly through the `columns` argument.
df2 = pd.DataFrame(
    data=np.array([[10, 20], [30, 40]]),
    columns=list("ab"),
)
df2

    a   b
0  10  20
1  30  40

#### Access the names of columns for the DataFrame

In [6]:
df2.columns

Index(['a', 'b'], dtype='object')

#### Accessing the default column names

In [7]:
df1.columns

RangeIndex(start=0, stop=10, step=1)

### Retrieve just the names of DataFrame columns by position

In [8]:
"{0}, {1}".format(df2.columns[0],df2.columns[1])

'a, b'

#### The names of the columns can be changed by assigning to the .columns property

In [9]:
# Assigning a new list of labels to .columns relabels df2 itself.
df2.columns = list("cd")
"{0},{1}".format(*df2.columns)

'c,d'

#### Assignment of index labels of the DataFrame

In [10]:
# Both axes get explicit labels: `index` names the rows, `columns` the columns.
df3 = pd.DataFrame(
    np.array([[1, 2, 3], [4, 5, 6]]),
    index=list("de"),
    columns=list("abc"),
)
df3

   a  b  c
d  1  2  3
e  4  5  6

#### Access the index of the DataFrame object

In [11]:
df3.index

Index(['d', 'e'], dtype='object')

In [12]:
# Report the row labels and the column labels of df3 by position.
# print() joins its two arguments with a single space, which is why the
# second sentence starts with a leading blank in the output.
print("""The name of index of Data Frame are:- 
         {0},{1}\n""".format(df3.index[0],df3.index[1]),
     """The name of columns of the Data Frame are:-
         {0},{1},{2}""".format(df3.columns[0],df3.columns[1],df3.columns[2]))

The name of index of Data Frame are:- 
         d,e
 The name of columns of the Data Frame are:-
         a,b,c


#### Creation of DataFrame by a dictionary and Series object

In [13]:
# Two five-element Series; the dict keys become the column labels of df4.
s1, s2 = (pd.Series(np.arange(first, first + 5)) for first in (1, 7))
df4 = pd.DataFrame(dict(c1=s1, c2=s2))
df4

   c1  c2
0   1   7
1   2   8
2   3   9
3   4  10
4   5  11

#### Demonstration of automatic alignment in DataFrame

In [14]:
# s3 only has labels 1 and 3; building the frame aligns it with s1/s2 on the
# index, so the rows it does not cover show up as NaN in column c3.
s3 = pd.Series(data=np.arange(13, 15), index=[1, 3])
df5 = pd.DataFrame(dict(c1=s1, c2=s2, c3=s3))
df5

   c1  c2    c3
0   1   7   NaN
1   2   8  13.0
2   3   9   NaN
3   4  10  14.0
4   5  11   NaN

### Read Data from file

#### To view the entire content of the file on Windows

In [15]:
# !type data\constituents-financials_csv.csv
# commented because creates a long output

### Read the csv file and Examine the first 5 records using '.head()' method

In [16]:
# Load the S&P 500 constituents file, keeping four columns picked by position
# and using the "Symbol" column as the row index.
# The path uses a forward slash: the original backslash form contained the
# invalid escape sequence "\c" and only resolved on Windows.
sp505 = pd.read_csv(
    "data/constituents-financials_csv.csv",
    index_col="Symbol",
    usecols=[0, 2, 3, 12],  # yields Symbol (index), Sector, Price, Price/Book
)
sp505.head()

                        Sector   Price  Price/Book
Symbol                                            
MMM                Industrials  222.89       11.34
AOS                Industrials   60.24        6.35
ABT                Health Care   56.27        3.19
ABBV               Health Care  108.48       26.14
ACN     Information Technology  150.51       10.62

### Examine the last 5 records using '.tail()' method

In [17]:
sp505.tail()

                        Sector   Price  Price/Book
Symbol                                            
XYL                Industrials   70.24        5.31
YUM     Consumer Discretionary   76.30      212.08
ZBH                Health Care  115.53        2.39
ZION                Financials   50.71        1.42
ZTS                Health Care   71.51       18.09

### Verify the number of records/rows in the data frame

In [18]:
len(sp505)

505

### Examine the index of DataFrame by .index attribute

In [19]:
sp505.index

Index(['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ATVI', 'AYI', 'ADBE', 'AAP', 'AMD',
       ...
       'WYNN', 'XEL', 'XRX', 'XLNX', 'XL', 'XYL', 'YUM', 'ZBH', 'ZION', 'ZTS'],
      dtype='object', name='Symbol', length=505)

### Get the columns of the Data Frame

In [20]:
sp505.columns

Index(['Sector', 'Price', 'Price/Book'], dtype='object')

#### Next data set to be used

In [21]:
# Show the raw CSV with plain Python so the cell also runs outside Windows;
# the `type` shell command only prints file contents under the Windows shell.
with open("data/omh.csv") as csv_file:
    print(csv_file.read())

Date,MSFT,AAPL
2014-12-01,48.62,115.07
2014-12-02,48.46,114.63
2014-12-03,48.08,115.93
2014-12-04,48.84,115.49
2014-12-05,48.42,115.0
2014-12-08,47.7,112.4
2014-12-09,47.59,114.12
2014-12-10,46.9,111.95
2014-12-11,47.17,111.62
2014-12-12,46.95,109.73
2014-12-15,46.67,108.23
2014-12-16,45.16,106.75
2014-12-17,45.74,109.41
2014-12-18,47.52,112.65
2014-12-19,47.66,111.78
2014-12-22,47.98,112.94
2014-12-23,48.45,112.54
2014-12-24,48.14,112.01
2014-12-26,47.88,113.99
2014-12-29,47.45,113.91
2014-12-30,47.02,112.52
2014-12-31,46.45,110.38


In [22]:
# Read the one-month price history with the default integer index.
# Forward slash instead of "data\omh.csv": the backslash form contained the
# invalid escape sequence "\o" and only resolved on Windows.
hist_one_mon = pd.read_csv("data/omh.csv")
# Preview the first four rows
hist_one_mon[:4]

         Date   MSFT    AAPL
0  2014-12-01  48.62  115.07
1  2014-12-02  48.46  114.63
2  2014-12-03  48.08  115.93
3  2014-12-04  48.84  115.49

#### Columns are selected with [ ] by label, not by position

In [23]:
sp505[['Price', 'Price/Book']].head()

         Price  Price/Book
Symbol                    
MMM     222.89       11.34
AOS      60.24        6.35
ABT      56.27        3.19
ABBV    108.48       26.14
ACN     150.51       10.62

In [24]:
# Relabel the columns of a copy with the integers 0, 1, 2 so that an integer
# inside [ ] is looked up as a column *label*; sp505 keeps its own labels.
df = sp505.copy()
df.columns = list(range(len(df.columns)))
df[[1]].head()

             1
Symbol        
MMM     222.89
AOS      60.24
ABT      56.27
ABBV    108.48
ACN     150.51

### Type of DataFrame column

In [25]:
type(df[[1]])

pandas.core.frame.DataFrame

In [26]:
type(df[1])

pandas.core.series.Series

#### Columns retrieved via attribute access

#### The following does not apply to columns that have spaces in their names

In [27]:
sp505.Price.head()

Symbol
MMM     222.89
AOS      60.24
ABT      56.27
ABBV    108.48
ACN     150.51
Name: Price, dtype: float64

### Get location of specified column

In [28]:
print(sp505.columns.get_loc("Price/Book"))

2


### Index selection in a DataFrame

#### Slicing the DataFrame with use of [ ] operator  

In [38]:
sp505[:5]

                        Sector   Price  Price/Book
Symbol                                            
MMM                Industrials  222.89       11.34
AOS                Industrials   60.24        6.35
ABT                Health Care   56.27        3.19
ABBV               Health Care  108.48       26.14
ACN     Information Technology  150.51       10.62

In [29]:
print(sp505[5:12])

                        Sector   Price  Price/Book
Symbol                                            
ATVI    Information Technology   65.83        5.16
AYI                Industrials  145.41        3.55
ADBE    Information Technology  185.16       11.06
AAP     Consumer Discretionary  109.63        2.51
AMD     Information Technology   11.22       21.47
AES                  Utilities   10.06        2.20
AET                Health Care  178.00        3.79


In [30]:
print(sp505["ATVI":"AET"])

                        Sector   Price  Price/Book
Symbol                                            
ATVI    Information Technology   65.83        5.16
AYI                Industrials  145.41        3.55
ADBE    Information Technology  185.16       11.06
AAP     Consumer Discretionary  109.63        2.51
AMD     Information Technology   11.22       21.47
AES                  Utilities   10.06        2.20
AET                Health Care  178.00        3.79


#### Get the row with index label "ATVI" as a series

In [37]:
sp505.loc["ATVI"]

Sector        Information Technology
Price                          65.83
Price/Book                      5.16
Name: ATVI, dtype: object

In [32]:
type(sp505.loc["ATVI"])

pandas.core.series.Series

#### When two or more index labels of sp505 are retrieved using .loc[ ], the result is a DataFrame

In [33]:
sp505.loc[["ATVI","AMD"]]

                        Sector  Price  Price/Book
Symbol                                           
ATVI    Information Technology  65.83        5.16
AMD     Information Technology  11.22       21.47

In [34]:
type(sp505.loc[["ATVI","AMD"]])

pandas.core.frame.DataFrame

#### Get the location of any particular symbol in the index and use .iloc[ ] to retrieve it

In [40]:
# Translate two index labels into their integer positions, then fetch the
# matching rows by position with .iloc.
i1, i2 = (sp505.index.get_loc(symbol) for symbol in ("YUM", "ATVI"))
print("The index location of 'YUM' and 'ATVI' are {0},{1}\n \n".format(i1, i2))
print(sp505.iloc[[i1, i2]])

The index location of 'YUM' and 'ATVI' are 501,5
 

                        Sector  Price  Price/Book
Symbol                                           
YUM     Consumer Discretionary  76.30      212.08
ATVI    Information Technology  65.83        5.16


### Scalar Value lookup by label or location

##### Scalar Value lookup by label 

In [47]:
sp505.at['XRX','Price']

29.8

#### Scalar Value lookup by position -- Preferred method

In [49]:
sp505.iat[0,2]

11.34

### Boolean Selection of rows in DataFrame

In [62]:
sp505[sp505.Price < 15]

                        Sector  Price  Price/Book
Symbol                                           
AMD     Information Technology  11.22       21.47
AES                  Utilities  10.06        2.20
CHK                     Energy   2.82        1.84
F       Consumer Discretionary  10.43        1.26
GE                 Industrials  14.45        1.70
KIM                Real Estate  14.01        1.20
NAVI                Financials  13.38        1.02
RRC                     Energy  12.82        0.59
UAA     Consumer Discretionary  13.14        2.72
UA      Consumer Discretionary  11.95        2.50

#### Boolean selection based on multiple condition

#### Using & operator

In [63]:
sp505[(sp505.Price < 15)&(sp505.Sector == "Consumer Discretionary")]

                        Sector  Price  Price/Book
Symbol                                           
F       Consumer Discretionary  10.43        1.26
UAA     Consumer Discretionary  13.14        2.72
UA      Consumer Discretionary  11.95        2.50

#### Using | operator

In [65]:
sp505[(sp505.Price < 15)|(sp505.Sector == "Consumer Discretionary")]

                        Sector    Price  Price/Book
Symbol                                             
AAP     Consumer Discretionary   109.63        2.51
AMD     Information Technology    11.22       21.47
AES                  Utilities    10.06        2.20
AMZN    Consumer Discretionary  1350.50       24.28
APTV    Consumer Discretionary    89.27        7.56
...                        ...      ...         ...
VIAB    Consumer Discretionary    32.71        2.08
WHR     Consumer Discretionary   164.95        2.57
WYN     Consumer Discretionary   113.56       18.91
WYNN    Consumer Discretionary   169.28       51.69
YUM     Consumer Discretionary    76.30      212.08

[91 rows x 3 columns]

#### Specific column selection using boolean selection

In [73]:
sp505[(sp505.Price < 15)&(sp505.Sector == "Consumer Discretionary")][["Price/Book"]]

        Price/Book
Symbol            
F             1.26
UAA           2.72
UA            2.50

In [74]:
type(sp505[(sp505.Price < 15)&(sp505.Sector == "Consumer Discretionary")][["Price/Book"]])

pandas.core.frame.DataFrame

In [75]:
type(sp505[(sp505.Price < 15)&(sp505.Sector == "Consumer Discretionary")]["Price/Book"])

pandas.core.series.Series

#### Renaming a column

In [78]:
# rename() hands back a new frame with the column relabelled; the result is
# bound to df and its first two rows are shown.
df = sp505.rename(columns={'Price/Book': 'BookValue'})
df.head(2)

             Sector   Price  BookValue
Symbol                                
MMM     Industrials  222.89      11.34
AOS     Industrials   60.24       6.35

#### No renaming in original DataFrame

In [79]:
sp505.columns

Index(['Sector', 'Price', 'Price/Book'], dtype='object')