# Pandas: More data wrangling functions

In [1]:
import pandas as pd

# may need to do: pip install yfinance
import yfinance as yf

In [2]:
aapl = yf.download(tickers='AAPL', period='5d', interval='1d')  # Apple
cvx  = yf.download(tickers='CVX',  period='5d', interval='1d')  # Chevron
ggl  = yf.download(tickers='GOOGL',  period='5d', interval='1d')  # Google
wmt  = yf.download(tickers='WMT',  period='5d', interval='1d')  # Walmart

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [3]:
aapl

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200
2023-04-17,165.089996,165.389999,164.029999,165.229996,165.229996,41516200
2023-04-18,166.100006,167.410004,165.649994,166.470001,166.470001,49923000
2023-04-19,165.800003,168.160004,165.539993,167.630005,167.630005,47601874


## reset_index()

Move the values of the index to be a normal column. A default numeric index is created

In [4]:
aapl_ri = aapl.reset_index()
aapl_ri

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
1,2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200
2,2023-04-17,165.089996,165.389999,164.029999,165.229996,165.229996,41516200
3,2023-04-18,166.100006,167.410004,165.649994,166.470001,166.470001,49923000
4,2023-04-19,165.800003,168.160004,165.539993,167.630005,167.630005,47601874


## melt()

Go from a wide to a long (narrow) format. 

Turns each row to an "object-atribute-value" (triple) format, i.e., row-index, column-name, value

In [5]:
aapl_ri.melt()

Unnamed: 0,variable,value
0,Date,2023-04-13 00:00:00
1,Date,2023-04-14 00:00:00
2,Date,2023-04-17 00:00:00
3,Date,2023-04-18 00:00:00
4,Date,2023-04-19 00:00:00
5,Open,161.630005
6,Open,164.589996
7,Open,165.089996
8,Open,166.100006
9,Open,165.800003


In [6]:
# original dataframe not modified
aapl_ri

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
1,2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200
2,2023-04-17,165.089996,165.389999,164.029999,165.229996,165.229996,41516200
3,2023-04-18,166.100006,167.410004,165.649994,166.470001,166.470001,49923000
4,2023-04-19,165.800003,168.160004,165.539993,167.630005,167.630005,47601874


In [7]:
# You can select a column to be the id or the triple: date in this case

aapl_ri.melt(id_vars=['Date'])

Unnamed: 0,Date,variable,value
0,2023-04-13,Open,161.63
1,2023-04-14,Open,164.59
2,2023-04-17,Open,165.09
3,2023-04-18,Open,166.1
4,2023-04-19,Open,165.8
5,2023-04-13,High,165.8
6,2023-04-14,High,166.32
7,2023-04-17,High,165.39
8,2023-04-18,High,167.41
9,2023-04-19,High,168.16


In [8]:
# You can also create triples for only a subset of the "predicate" columns 

aapl_ri.melt(id_vars=['Date'], value_vars=['Close', 'Volume'])

Unnamed: 0,Date,variable,value
0,2023-04-13,Close,165.56
1,2023-04-14,Close,165.21
2,2023-04-17,Close,165.23
3,2023-04-18,Close,166.47
4,2023-04-19,Close,167.63
5,2023-04-13,Volume,68445600.0
6,2023-04-14,Volume,49337200.0
7,2023-04-17,Volume,41516200.0
8,2023-04-18,Volume,49923000.0
9,2023-04-19,Volume,47601870.0


In [9]:
# You can set nice names for the columns of the long ("triples") dataframe

aapl_triples = aapl_ri.melt(id_vars=['Date'], value_vars=['Close', 'Volume'], var_name='Property', value_name='Value')
aapl_triples

Unnamed: 0,Date,Property,Value
0,2023-04-13,Close,165.56
1,2023-04-14,Close,165.21
2,2023-04-17,Close,165.23
3,2023-04-18,Close,166.47
4,2023-04-19,Close,167.63
5,2023-04-13,Volume,68445600.0
6,2023-04-14,Volume,49337200.0
7,2023-04-17,Volume,41516200.0
8,2023-04-18,Volume,49923000.0
9,2023-04-19,Volume,47601870.0


In [10]:
# you can set the index to be the date again
aapl_triples.set_index('Date')

Unnamed: 0_level_0,Property,Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-13,Close,165.56
2023-04-14,Close,165.21
2023-04-17,Close,165.23
2023-04-18,Close,166.47
2023-04-19,Close,167.63
2023-04-13,Volume,68445600.0
2023-04-14,Volume,49337200.0
2023-04-17,Volume,41516200.0
2023-04-18,Volume,49923000.0
2023-04-19,Volume,47601870.0


In [11]:
# All these methods create copies. They don't moidfy the original dataframe, unless you use the inplace=True argument
aapl_triples

Unnamed: 0,Date,Property,Value
0,2023-04-13,Close,165.56
1,2023-04-14,Close,165.21
2,2023-04-17,Close,165.23
3,2023-04-18,Close,166.47
4,2023-04-19,Close,167.63
5,2023-04-13,Volume,68445600.0
6,2023-04-14,Volume,49337200.0
7,2023-04-17,Volume,41516200.0
8,2023-04-18,Volume,49923000.0
9,2023-04-19,Volume,47601870.0


## pivot()

Reshape data (produce a “pivot” table) based on column values. Uses unique values from specified index / columns to form axes (columns) of the resulting DataFrame. 

You can use it to "unmelt"

In [12]:
aapl_triples

Unnamed: 0,Date,Property,Value
0,2023-04-13,Close,165.56
1,2023-04-14,Close,165.21
2,2023-04-17,Close,165.23
3,2023-04-18,Close,166.47
4,2023-04-19,Close,167.63
5,2023-04-13,Volume,68445600.0
6,2023-04-14,Volume,49337200.0
7,2023-04-17,Volume,41516200.0
8,2023-04-18,Volume,49923000.0
9,2023-04-19,Volume,47601870.0


In [13]:
aapl_cv = aapl_triples.pivot(index='Date', columns='Property', values='Value')
aapl_cv

Property,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-13,165.559998,68445600.0
2023-04-14,165.210007,49337200.0
2023-04-17,165.229996,41516200.0
2023-04-18,166.470001,49923000.0
2023-04-19,167.630005,47601874.0


In [14]:
# Selecting rows with .loc

aapl_cv.loc['20230414':'20230418']

Property,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-14,165.210007,49337200.0
2023-04-17,165.229996,41516200.0
2023-04-18,166.470001,49923000.0


## concat

Append rows (or columns) of a list of DataFrames

In [15]:
aapl.head(2)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200


In [16]:
ggl.head(2)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-13,105.839996,107.489998,105.839996,107.43,107.43,24843600
2023-04-14,106.889999,108.940002,106.839996,108.870003,108.870003,26547800


In [17]:
cvx.head(2)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-13,170.25,172.470001,169.820007,172.089996,172.089996,6856600
2023-04-14,172.5,172.880005,171.220001,172.440002,172.440002,4930100


In [18]:
# By default it concatenates along axis0, i.e., rows
pd.concat([aapl.head(2),ggl.head(2),cvx.head(2)])

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200
2023-04-13,105.839996,107.489998,105.839996,107.43,107.43,24843600
2023-04-14,106.889999,108.940002,106.839996,108.870003,108.870003,26547800
2023-04-13,170.25,172.470001,169.820007,172.089996,172.089996,6856600
2023-04-14,172.5,172.880005,171.220001,172.440002,172.440002,4930100


In [19]:
# By default it concatenates along axis0, i.e., rows
pd.concat([aapl,ggl,cvx])

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200
2023-04-17,165.089996,165.389999,164.029999,165.229996,165.229996,41516200
2023-04-18,166.100006,167.410004,165.649994,166.470001,166.470001,49923000
2023-04-19,165.800003,168.160004,165.539993,167.630005,167.630005,47601874
2023-04-13,105.839996,107.489998,105.839996,107.43,107.43,24843600
2023-04-14,106.889999,108.940002,106.839996,108.870003,108.870003,26547800
2023-04-17,104.660004,106.160004,104.519997,105.970001,105.970001,37571200
2023-04-18,106.489998,106.540001,104.07,104.5,104.5,26596400
2023-04-19,103.580002,104.980003,103.07,104.18,104.18,20858252


In [20]:
aapl['Close']

Date
2023-04-13    165.559998
2023-04-14    165.210007
2023-04-17    165.229996
2023-04-18    166.470001
2023-04-19    167.630005
Name: Close, dtype: float64

In [21]:
aapl['Close'].rename('APPL Close')

Date
2023-04-13    165.559998
2023-04-14    165.210007
2023-04-17    165.229996
2023-04-18    166.470001
2023-04-19    167.630005
Name: APPL Close, dtype: float64

In [22]:
# Append columns
pd.concat([aapl['Close'].rename('APPL_close'), ggl['Close'].rename('GOOGL_Close')], axis=1)

Unnamed: 0_level_0,APPL_close,GOOGL_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-13,165.559998,107.43
2023-04-14,165.210007,108.870003
2023-04-17,165.229996,105.970001
2023-04-18,166.470001,104.5
2023-04-19,167.630005,104.18


In [23]:
pd.concat([aapl['Close'].rename('APPL'), ggl['Close'].rename('GOOGL'), cvx['Close'].rename('CVX'), wmt['Close'].rename('WMT') ], axis=1)

Unnamed: 0_level_0,APPL,GOOGL,CVX,WMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-04-13,165.559998,107.43,172.089996,149.490005
2023-04-14,165.210007,108.870003,172.440002,148.479996
2023-04-17,165.229996,105.970001,170.919998,149.520004
2023-04-18,166.470001,104.5,170.520004,149.850006
2023-04-19,167.630005,104.18,170.679993,150.009995


## drop_duplicates()

In [24]:
aapl_dup = pd.concat([aapl.head(2),aapl.head(2)])
aapl_dup

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200
2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200


In [25]:
aapl_dup.drop_duplicates()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200


## sample()

Sample rows from a dataframe.

Useful for statistics and machine learning, e.g, to select a random subset of a large dataframe

In [26]:
aapl_1000 = yf.download(tickers='AAPL', period='1000d', interval='1d')  # Apple

[*********************100%***********************]  1 of 1 completed


In [27]:
# randomly select a fraction of the rows

aapl_1000.sample(frac=0.01) # 1000 * 0.01 = 10 rows

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-06-22,87.834999,89.864998,87.787498,89.717499,88.206627,135445200
2020-03-17,61.877499,64.402496,59.599998,63.215,61.982658,324056000
2021-02-03,135.759995,135.770004,133.610001,133.940002,132.14946,89880900
2020-06-26,91.102501,91.330002,88.254997,88.407501,86.918686,205256800
2021-06-28,133.410004,135.25,133.350006,134.779999,133.403152,62111300
2021-12-01,167.479996,170.300003,164.529999,164.770004,163.569489,152052500
2019-11-08,64.672501,65.110001,64.212502,65.035004,63.616207,69986400
2022-09-14,154.789993,157.100006,153.610001,155.309998,154.816406,87965400
2021-07-23,147.550003,148.720001,146.919998,148.559998,147.042358,71447400
2021-08-17,150.229996,151.679993,149.089996,150.190002,148.878418,92229700


In [28]:
# randomly select a number of rows

aapl_1000.sample(n=7) 

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-11-01,155.080002,155.449997,149.130005,150.649994,150.171204,80379300
2021-02-01,133.75,135.380005,130.929993,134.139999,132.346771,106239800
2020-12-17,128.899994,129.580002,128.039993,128.699997,126.979477,94359800
2021-08-17,150.229996,151.679993,149.089996,150.190002,148.878418,92229700
2020-07-24,90.987503,92.970001,89.144997,92.614998,91.055336,185438800
2020-03-31,63.900002,65.622498,63.0,63.572498,62.333191,197002000
2021-09-21,143.929993,144.600006,142.779999,143.429993,142.17746,75834000


## Method Chaining

Most operations on dataframes return a dataframe, so you can chain methods using the dot (.) notation

In [29]:
aapl_1000.head(7)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-05-01,52.470001,53.827499,52.307499,52.630001,50.937286,259309200
2019-05-02,52.459999,53.162498,52.032501,52.287498,50.605801,127985200
2019-05-03,52.7225,52.959999,52.557499,52.9375,51.23489,83569600
2019-05-06,51.072498,52.209999,50.875,52.119999,50.443687,129772400
2019-05-07,51.470001,51.855,50.2075,50.715,49.083878,155054800
2019-05-08,50.474998,51.334999,50.4375,50.724998,49.093555,105358000
2019-05-09,50.099998,50.419998,49.165001,50.18,48.566086,139634400


In [30]:
aapl_1000.head(7).reset_index()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-05-01,52.470001,53.827499,52.307499,52.630001,50.937286,259309200
1,2019-05-02,52.459999,53.162498,52.032501,52.287498,50.605801,127985200
2,2019-05-03,52.7225,52.959999,52.557499,52.9375,51.23489,83569600
3,2019-05-06,51.072498,52.209999,50.875,52.119999,50.443687,129772400
4,2019-05-07,51.470001,51.855,50.2075,50.715,49.083878,155054800
5,2019-05-08,50.474998,51.334999,50.4375,50.724998,49.093555,105358000
6,2019-05-09,50.099998,50.419998,49.165001,50.18,48.566086,139634400


In [31]:
aapl_1000.head(7).reset_index().melt(id_vars=['Date'], value_vars=['Close', 'Volume'])

Unnamed: 0,Date,variable,value
0,2019-05-01,Close,52.63
1,2019-05-02,Close,52.2875
2,2019-05-03,Close,52.9375
3,2019-05-06,Close,52.12
4,2019-05-07,Close,50.715
5,2019-05-08,Close,50.725
6,2019-05-09,Close,50.18
7,2019-05-01,Volume,259309200.0
8,2019-05-02,Volume,127985200.0
9,2019-05-03,Volume,83569600.0


In [32]:
aapl_1000.head(7).reset_index().melt(id_vars=['Date'], value_vars=['Close', 'Volume'], var_name='Property', value_name='Value')

Unnamed: 0,Date,Property,Value
0,2019-05-01,Close,52.63
1,2019-05-02,Close,52.2875
2,2019-05-03,Close,52.9375
3,2019-05-06,Close,52.12
4,2019-05-07,Close,50.715
5,2019-05-08,Close,50.725
6,2019-05-09,Close,50.18
7,2019-05-01,Volume,259309200.0
8,2019-05-02,Volume,127985200.0
9,2019-05-03,Volume,83569600.0


In [34]:
# For clarity you can break a long line into multiple lines using backslash (\)
aapl_1000.head(7)\
.reset_index()\
.melt(id_vars=['Date'], value_vars=['Close', 'Volume'], var_name='Property', value_name='Value')\
.query("Property=='Close'")

Unnamed: 0,Date,Property,Value
0,2019-05-01,Close,52.630001
1,2019-05-02,Close,52.287498
2,2019-05-03,Close,52.9375
3,2019-05-06,Close,52.119999
4,2019-05-07,Close,50.715
5,2019-05-08,Close,50.724998
6,2019-05-09,Close,50.18


In [35]:
# For clarity you can break a long line into multiple lines using backslash (\)
aapl_1000\
.reset_index()\
.melt(id_vars=['Date'], value_vars=['Close', 'Volume'], var_name='Property', value_name='Value')\
.query("Property=='Close'")\
.max()

Date        2023-04-19 00:00:00
Property                  Close
Value                182.009995
dtype: object

In [36]:
# For clarity you can break a long line into multiple lines using backslash (\)
aapl_1000\
.reset_index()\
.melt(id_vars=['Date'], value_vars=['Close', 'Volume'], var_name='Property', value_name='Value')\
.query("Property=='Close'")\
.min()

Date        2019-05-01 00:00:00
Property                  Close
Value                 43.325001
dtype: object

In [37]:
aapl_1000.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600
2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200
2023-04-17,165.089996,165.389999,164.029999,165.229996,165.229996,41516200
2023-04-18,166.100006,167.410004,165.649994,166.470001,166.470001,49923000
2023-04-19,165.800003,168.160004,165.539993,167.630005,167.630005,47601874
