In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read sp500.csv, keeping only the columns at positions 0, 2, 3 and 7, and use
# the 'Symbol' column as the row index.
sp500 = pd.read_csv(
    "sp500.csv",
    usecols=[0, 2, 3, 7],
    index_col='Symbol',
)

In [4]:
    #Rename 'Book Value' columns:
# rename() hands back a new DataFrame with 'Book Value' relabelled as 'FY';
# the original sp500 keeps its column names.
newSP500 = sp500.rename({'Book Value': 'FY'}, axis='columns')
newSP500

Unnamed: 0_level_0,Sector,Price,FY
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.60,15.573
ABBV,Health Care,53.95,2.954
ACN,Information Technology,79.79,8.326
ACE,Financials,102.91,86.897
...,...,...,...
YHOO,Information Technology,35.02,12.768
YUM,Consumer Discretionary,74.77,5.147
ZMH,Health Care,101.84,37.181
ZION,Financials,28.43,30.191


In [8]:
# Rename 'Book Value' to 'FK' on the frame that the rest of the notebook uses.
# Rebinding the result of rename() replaces the earlier inplace=True call, and
# re-running the cell is harmless because rename() ignores labels that are
# no longer present.
sp500 = sp500.rename(columns={'Book Value': 'FK'})
# 'FK' is a valid Python identifier, so the column can now be reached with
# attribute access. Only the last expression of a cell is displayed, so the
# bare `sp500` line that used to sit above this one has been dropped.
sp500.FK[:5]

Symbol
MMM     26.668
ABT     15.573
ABBV     2.954
ACN      8.326
ACE     86.897
Name: FK, dtype: float64

In [9]:
# Add a column with [] assignment.
# Work on a copy so that sp500 itself is left unchanged.
sp500_copy = sp500.copy()
# The new 'RoundedPrice' column holds the Price column rounded to whole numbers.
sp500_copy['RoundedPrice'] = sp500['Price'].round()
sp500_copy.head(2)

Unnamed: 0_level_0,Sector,Price,FK,RoundedPrice
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,Industrials,141.14,26.668,141.0
ABT,Health Care,39.6,15.573,40.0


In [14]:
# Add a column with insert(), which modifies the frame in place.
# 'RoundedPrice' is placed at column position 1, i.e. right after 'Sector'.
copy = sp500.copy()
copy.insert(loc=1, column='RoundedPrice', value=sp500['Price'].round())
copy.head(2)

Unnamed: 0_level_0,Sector,RoundedPrice,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,Industrials,141.0,141.14,26.668
ABT,Health Care,40.0,39.6,15.573


In [15]:
# Add a column through .loc[], which also modifies the frame in place.
ss = sp500.head(3).copy()
# Selecting every row (:) and a column label that does not exist yet creates
# the 'PER' column, with each row initialised to 0.
ss.loc[:, 'PER'] = 0
# Show the result.
ss

Unnamed: 0_level_0,Sector,Price,FK,PER
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,Industrials,141.14,26.668,0
ABT,Health Care,39.6,15.573,0
ABBV,Health Care,53.95,2.954,0


# Adding columns using concatenation

In [28]:
# Build a one-column DataFrame holding the rounded prices.
# The column is named 'Price' (the same label as the source column in sp500).
rounded_price = sp500['Price'].round().to_frame(name='Price')
rounded_price

Unnamed: 0_level_0,Price
Symbol,Unnamed: 1_level_1
MMM,141.0
ABT,40.0
ABBV,54.0
ACN,80.0
ACE,103.0
...,...
YHOO,35.0
YUM,75.0
ZMH,102.0
ZION,28.0


In [20]:
# Concatenate along the column axis (axis=0 stacks rows, axis=1 adds columns).
# rounded_price's single column is labelled 'Price', a label sp500 already has,
# so it is renamed to 'RoundedPrice' first; otherwise the result would carry
# two columns both called 'Price'. rename() ignores labels that are absent,
# so this also works if rounded_price already uses 'RoundedPrice'.
concatenated = pd.concat(
    [sp500, rounded_price.rename(columns={'Price': 'RoundedPrice'})],
    axis=1,
)
concatenated

Unnamed: 0_level_0,Sector,Price,FK,RoundedPrice
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,Industrials,141.14,26.668,141.0
ABT,Health Care,39.60,15.573,40.0
ABBV,Health Care,53.95,2.954,54.0
ACN,Information Technology,79.79,8.326,80.0
ACE,Financials,102.91,86.897,103.0
...,...,...,...,...
YHOO,Information Technology,35.02,12.768,35.0
YUM,Consumer Discretionary,74.77,5.147,75.0
ZMH,Health Care,101.84,37.181,102.0
ZION,Financials,28.43,30.191,28.0


In [21]:
# Reordering columns: take sp500's column labels in reverse order.
# The [::-1] slice walks the column Index from last label to first.
reversed_columns_names = sp500.columns[::-1]
reversed_columns_names

Index(['FK', 'Price', 'Sector'], dtype='object')

In [22]:
# Selecting with the reversed label list yields the columns in that order;
# show the first five rows.
sp500[reversed_columns_names].head(5)

Unnamed: 0_level_0,FK,Price,Sector
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,26.668,141.14,Industrials
ABT,15.573,39.6,Health Care
ABBV,2.954,53.95,Health Care
ACN,8.326,79.79,Information Technology
ACE,86.897,102.91,Financials


In [29]:
# Replace the contents of an existing column: on a copy of sp500, overwrite
# 'Price' with the rounded values held in rounded_price.
copy = sp500.copy()
copy['Price'] = rounded_price['Price']

In [27]:
# Display rounded_price.
# NOTE(review): the saved output of this cell shows a 'RoundedPrice' column,
# whereas the cell that builds rounded_price labels the column 'Price' -- the
# output looks stale from an earlier run; re-run to confirm.
rounded_price

Unnamed: 0_level_0,RoundedPrice
Symbol,Unnamed: 1_level_1
MMM,141.0
ABT,40.0
ABBV,54.0
ACN,80.0
ACE,103.0
...,...
YHOO,35.0
YUM,75.0
ZMH,102.0
ZION,28.0


# Deleting columns: Using del or pop(); drop()

In [31]:
    #del delete in-place
# del removes the 'FK' column from the copy in place.
copy = sp500.copy()
del copy['FK']
copy.head(3)

Unnamed: 0_level_0,Sector,Price
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,Industrials,141.14
ABT,Health Care,39.6
ABBV,Health Care,53.95


In [32]:
# pop() removes a column in place and returns the removed column as a Series.
copy = sp500.copy()
poped = copy.pop('Sector')
copy.head(2)

Unnamed: 0_level_0,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,141.14,26.668
ABT,39.6,15.573


In [33]:
 #drop(labels, axis=1) return a new dataframe with columns removed (the original DataFrame object is not modified)
# drop() returns a new DataFrame without 'Sector'; `copy` itself is unchanged.
copy = sp500.copy()
afterdrop = copy.drop(columns=['Sector'])
afterdrop.head(5)

Unnamed: 0_level_0,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,141.14,26.668
ABT,39.6,15.573
ABBV,53.95,2.954
ACN,79.79,8.326
ACE,102.91,86.897


<Header>Deleting columns: Using del or pop(); drop()</Header>

In [69]:
    #del delete in-place
# Same in-place deletion as before: remove 'FK' from a fresh copy of sp500.
copy = sp500.copy()
del copy['FK']
copy.iloc[:2]

Unnamed: 0_level_0,Sector,Price
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,Industrials,141.14
ABT,Health Care,39.6


In [70]:
# pop() deletes the column in place and hands the removed Series back.
copy = sp500.copy()
poped = copy.pop('Sector')
copy.iloc[:2]

Unnamed: 0_level_0,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,141.14,26.668
ABT,39.6,15.573


In [71]:
# drop(labels, axis=1) returns a new DataFrame with the listed columns removed;
# the DataFrame it is called on is not modified.
copy = sp500.copy()
afterdrop = copy.drop(labels=['Sector'], axis='columns')
afterdrop.iloc[:5]

Unnamed: 0_level_0,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,141.14,26.668
ABT,39.6,15.573
ABBV,53.95,2.954
ACN,79.79,8.326
ACE,102.91,86.897


# Appending rows from other DataFrame objects with .append()

In [73]:
# Independent copy of the first three rows of sp500.
df1 = sp500.iloc[0:3].copy()
# Rows at integer positions 10, 11 and 2. Position 2 is also part of df1, so
# its label shows up twice in the combined result.
df2 = sp500.iloc[[10, 11, 2]]
# Append df2's rows after df1's. DataFrame.append() was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat() on a list of frames stacks the
# rows the same way and works on old and new pandas alike.
appended = pd.concat([df1, df2])
# The result is the rows of the first frame followed by those of the second;
# index labels are kept as they are, duplicates included.
appended

Unnamed: 0_level_0,Sector,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
A,Health Care,56.18,16.928
GAS,Utilities,52.98,32.462
ABBV,Health Care,53.95,2.954


In [75]:
# A frame that shares df1's index but has a single 'PER' column.
# Passing a scalar as the data fills every row of that column with 0.0.
df3 = pd.DataFrame(data=0.0, columns=['PER'], index=df1.index)
df3

Unnamed: 0_level_0,PER
Symbol,Unnamed: 1_level_1
MMM,0.0
ABT,0.0
ABBV,0.0


In [76]:
# Stack df3's rows under df1's. Each frame has three rows, so the result has six.
# df1 has no 'PER' column, so its rows get NaN there.
# df3 has no 'Sector', 'Price' or 'FK' column, so its rows get NaN in those.
# pd.concat() is used because DataFrame.append() was deprecated in pandas 1.4
# and removed in 2.0; for two frames it produces the same row-wise result.
pd.concat([df1, df3])

Unnamed: 0_level_0,Sector,Price,FK,PER
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,Industrials,141.14,26.668,
ABT,Health Care,39.6,15.573,
ABBV,Health Care,53.95,2.954,
MMM,,,,0.0
ABT,,,,0.0
ABBV,,,,0.0


# Concatenating rows (same as concatenating columns, but with axis=0)

In [None]:
"""The rows from multiple DataFrame objects can be concatenated to each other using the
pd.concat() function and by specifying axis=0. The default operation of pd.concat()
on two DataFrame objects along the row axis operates in the same way as the .append() method."""

In [55]:
# Independent copy of the first three rows of sp500.
df1 = sp500.head(3).copy()
# Rows at integer positions 10, 11 and 2.
df2 = sp500.iloc[[10, 11, 2]]
# Hand both frames to concat as a list; axis=0 stacks them row-wise.
pd.concat(objs=[df1, df2], axis=0)

Unnamed: 0_level_0,Sector,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
A,Health Care,56.18,16.928
GAS,Utilities,52.98,32.462
ABBV,Health Care,53.95,2.954


In [56]:
# Start from an independent copy of df2.
df2_2 = df2.copy()
# Give it a 'Foo' column (all zeros, at column position 3) that df1 lacks.
df2_2.insert(loc=3, column='Foo', value=pd.Series(0, index=df2.index))
# Inspect the result.
df2_2

Unnamed: 0_level_0,Sector,Price,FK,Foo
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,Health Care,56.18,16.928,0
GAS,Utilities,52.98,32.462,0
ABBV,Health Care,53.95,2.954,0


In [42]:
# Working frame for the row examples: an independent copy of the first
# three rows of sp500.
ss = sp500.iloc[:3].copy()
ss

Unnamed: 0_level_0,Sector,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954


In [57]:
# Stack df1 and df2_2 row-wise. 'Foo' exists only in df2_2, so df1's rows
# show NaN in that column.
pd.concat([df1, df2_2], axis=0)

Unnamed: 0_level_0,Sector,Price,FK,Foo
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,Industrials,141.14,26.668,
ABT,Health Care,39.6,15.573,
ABBV,Health Care,53.95,2.954,
A,Health Care,56.18,16.928,0.0
GAS,Utilities,52.98,32.462,0.0
ABBV,Health Care,53.95,2.954,0.0


In [58]:
# Supplying keys adds an outer index level that labels which source frame
# each row came from.
r = pd.concat(objs=[df1, df2_2], axis=0, keys=['df1', 'df2'])
r

Unnamed: 0_level_0,Unnamed: 1_level_0,Sector,Price,FK,Foo
Unnamed: 0_level_1,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df1,MMM,Industrials,141.14,26.668,
df1,ABT,Health Care,39.6,15.573,
df1,ABBV,Health Care,53.95,2.954,
df2,A,Health Care,56.18,16.928,0.0
df2,GAS,Utilities,52.98,32.462,0.0
df2,ABBV,Health Care,53.95,2.954,0.0


# Adding and replacing rows via setting with enlargement

In [59]:
"""Rows can also be added to a DataFrame using the loc property. The parameter for loc
specifies the index label where the row is to be placed. If the label does not exist, the values
are appended to the data frame using the given index label. If the label does exist, the
values in the specified row are replaced"""

# Assign a whole row through .loc using a list with one value per column
# (Sector, Price, FK). In the saved output 'FOO' appears as a new last row,
# i.e. the label was not present and the row was appended.
ss.loc['FOO'] = ['the sector', 100, 110] # Assign some values to the columns via a list
# Display the enlarged frame.
ss

Unnamed: 0_level_0,Sector,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
FOO,the sector,100.0,110.0


<Header>Removing rows using .drop()</Header>

In [61]:
# Get a copy of the first 5 rows of sp500.
# .copy() makes ss independent of sp500, which is what the comment promises
# and what the other slicing cells in this notebook do; a bare slice is not
# guaranteed to be a separate object.
ss = sp500[:5].copy()
ss

Unnamed: 0_level_0,Sector,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
ACN,Information Technology,79.79,8.326
ACE,Financials,102.91,86.897


In [62]:
# Drop the rows labelled 'ABT' and 'ACN'. Rows are axis 0, which is drop()'s
# default; removing columns is the case that needs axis=1.
afterdrop = ss.drop(labels=['ABT', 'ACN'], axis=0)
afterdrop.head(5)

Unnamed: 0_level_0,Sector,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABBV,Health Care,53.95,2.954
ACE,Financials,102.91,86.897


# Removing rows using Boolean selection

In [77]:
# Boolean mask: True for every row whose Price is above 300.
selection = sp500['Price'].gt(300)
# Total number of rows, and how many of them the mask selects (True counts
# as 1 in the sum). The saved output is (500, 10): 10 of the 500 rows have
# a Price above 300.
(len(selection), selection.sum())

(500, 10)

In [64]:
# Split sp500 with the mask: the rows it selects, and (via ~, the logical
# complement of the mask) all the remaining rows.
price_higher_than_300 = sp500.loc[selection]
price_less_than_300 = sp500.loc[~selection]

<Header>Removing rows using a slice</Header>

In [67]:
# Keep just the first three rows by slicing positionally.
only_first_three = sp500.iloc[:3]
only_first_three

Unnamed: 0_level_0,Sector,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954


In [68]:
# Same three rows, but as an independent copy rather than a slice of sp500.
only_first_three = sp500.iloc[:3].copy()
only_first_three

Unnamed: 0_level_0,Sector,Price,FK
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
