# Introduction to pandas
* The premier data science library
* The name pandas derives from panel data 
* Has two main objects : DataFrame, Series
* Is a wrapper around, among other libraries, NumPy and matplotlib
* Series is a vector of data with an index
* DataFrame is a collection of Series with a single index (columns are also an index)
* Was developed by a quant working for a hedge fund, long since semi-retired by the age of about 27


In [1]:
import matplotlib as plt
import numpy as np
import numpy.random as npr
import pandas as pd

In [3]:
print(pd.__version__)
print(np.__version__)
print(plt.__version__)

2.2.2
1.26.4
3.9.2


### 1. make a Series

In [27]:
norms = pd.Series(npr.standard_normal(10))
rands = pd.Series(npr.rand(10))
rands

0    0.450390
1    0.900122
2    0.048221
3    0.644890
4    0.984342
5    0.176996
6    0.822800
7    0.692292
8    0.601607
9    0.671082
dtype: float64

In [29]:
norms.mean()

0.11990127792513527

### 2. make a DataFrame

In [35]:
df = pd.DataFrame([rands, norms])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.45039,0.900122,0.048221,0.64489,0.984342,0.176996,0.8228,0.692292,0.601607,0.671082
1,-0.343378,0.64258,0.13878,-1.922269,1.393787,-0.508849,0.927443,0.466294,1.506709,-1.102084


In [59]:
df = df.T

In [61]:
df

Unnamed: 0,0,1
0,0.45039,-0.343378
1,0.900122,0.64258
2,0.048221,0.13878
3,0.64489,-1.922269
4,0.984342,1.393787
5,0.176996,-0.508849
6,0.8228,0.927443
7,0.692292,0.466294
8,0.601607,1.506709
9,0.671082,-1.102084


In [65]:
df.mean(axis = 1)
df.mean(axis = 0)

0    0.599274
1    0.119901
dtype: float64

In [67]:
df.describe()

Unnamed: 0,0,1
count,10.0,10.0
mean,0.599274,0.119901
std,0.299776,1.098754
min,0.048221,-1.922269
25%,0.488194,-0.467482
50%,0.657986,0.302537
75%,0.790173,0.856228
max,0.984342,1.506709


In [81]:
data = pd.read_csv('GBTC.csv')

In [83]:
data

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2022-12-15,8.210000,8.353000,8.140000,8.300000,8.300000,2934400
1,2022-12-16,8.055000,8.130000,7.900000,7.930000,7.930000,3795900
2,2022-12-19,7.935000,8.400000,7.930000,8.130000,8.130000,3558400
3,2022-12-20,8.125000,8.300000,7.840000,8.080000,8.080000,3390300
4,2022-12-21,8.000000,8.070000,7.920000,8.050000,8.050000,2538200
...,...,...,...,...,...,...,...
247,2023-12-11,33.520000,33.750000,31.930000,32.459999,32.459999,7478900
248,2023-12-12,32.939999,33.590000,32.810001,33.419998,33.419998,4411500
249,2023-12-13,33.470001,35.619999,33.400002,35.590000,35.590000,6212200
250,2023-12-14,35.209999,35.330002,34.417999,34.900002,34.900002,5263500


In [87]:
data.tail(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Change
247,2023-12-11,33.52,33.75,31.93,32.459999,32.459999,7478900,-0.085891
248,2023-12-12,32.939999,33.59,32.810001,33.419998,33.419998,4411500,0.029575
249,2023-12-13,33.470001,35.619999,33.400002,35.59,35.59,6212200,0.064931
250,2023-12-14,35.209999,35.330002,34.417999,34.900002,34.900002,5263500,-0.019387
251,2023-12-15,34.349998,34.400002,33.599998,33.619999,33.619999,1876793,-0.036676


In [85]:
data['Change'] = data['Close'].pct_change()
data['Range'] = (data['Close'] - data.open).abs()
data.dropna(inplace=True)

AttributeError: 'DataFrame' object has no attribute 'open'

In [95]:
data[['Open', 'Close']]

Unnamed: 0,Open,Close
0,8.210000,8.300000
1,8.055000,7.930000
2,7.935000,8.130000
3,8.125000,8.080000
4,8.000000,8.050000
...,...,...
247,33.520000,32.459999
248,32.939999,33.419998
249,33.470001,35.590000
250,35.209999,34.900002


In [107]:
data.iloc[11:30]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Change
11,2023-01-03,8.22,8.24,8.01,8.2,8.2,2821600,-0.010856
12,2023-01-04,8.28,8.45,8.13,8.38,8.38,2291400,0.021951
13,2023-01-05,8.37,8.49,8.21,8.45,8.45,2434500,0.008353
14,2023-01-06,8.31,8.69,8.25,8.65,8.65,2183800,0.023669
15,2023-01-09,8.825,9.83,8.82,9.65,9.65,5852900,0.115607
16,2023-01-10,9.705,9.95,9.45,9.88,9.88,2622300,0.023834
17,2023-01-11,9.635,9.96,9.26,9.8,9.8,2589500,-0.008097
18,2023-01-12,10.205,11.24,9.9,10.49,10.49,7541300,0.070408
19,2023-01-13,10.305,11.43,10.22,11.32,11.32,5758000,0.079123
20,2023-01-17,12.02,12.45,11.3,11.72,11.72,7197600,0.035336


### 3. Dates and Indexes

In [113]:
data.index = data['Date']

data.index

Index(['2022-12-15', '2022-12-16', '2022-12-19', '2022-12-20', '2022-12-21',
       '2022-12-22', '2022-12-23', '2022-12-27', '2022-12-28', '2022-12-29',
       ...
       '2023-12-04', '2023-12-05', '2023-12-06', '2023-12-07', '2023-12-08',
       '2023-12-11', '2023-12-12', '2023-12-13', '2023-12-14', '2023-12-15'],
      dtype='object', name='Date', length=252)

In [123]:
data.drop(columns=['Date'], inplace = True)


In [125]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-12-15,8.21,8.353,8.14,8.3,8.3,2934400,
2022-12-16,8.055,8.13,7.9,7.93,7.93,3795900,-0.044578
2022-12-19,7.935,8.4,7.93,8.13,8.13,3558400,0.025221
2022-12-20,8.125,8.3,7.84,8.08,8.08,3390300,-0.00615
2022-12-21,8.0,8.07,7.92,8.05,8.05,2538200,-0.003713


In [127]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252 entries, 2022-12-15 to 2023-12-15
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       252 non-null    float64
 1   High       252 non-null    float64
 2   Low        252 non-null    float64
 3   Close      252 non-null    float64
 4   Adj Close  252 non-null    float64
 5   Volume     252 non-null    int64  
 6   Change     251 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 15.8+ KB


In [131]:
data.loc['2022-12-20']

Open         8.125000e+00
High         8.300000e+00
Low          7.840000e+00
Close        8.080000e+00
Adj Close    8.080000e+00
Volume       3.390300e+06
Change      -6.150062e-03
Name: 2022-12-20, dtype: float64

In [165]:
#data.index = pd.DatetimeIndex(data.index)
data.loc['2023-01-03':'2023-01-31']

AttributeError: 'DatetimeIndex' object has no attribute 'loc'

In [163]:
# need to give pandas an aggregate function, i.e. mean, std, first, last
data['Close'].resample('ME').last

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [80]:
# M, BM, MS, BMS


In [81]:
# be careful with resampling


### 4. Slicing index loc vs. iloc

### 5. Slicing columns (columns a second index in DataFrames)

In [82]:
# or data['Close']


In [83]:
# reorders columns alphabetically


### 6. Renaming columns

In [167]:
data.rename(columns={'Open':'FirstTrade'}, inplace=True)

TypeError: Index.rename() got an unexpected keyword argument 'columns'

In [84]:
# what gives? 


### 7. Reordering columns

In [171]:
reordered = data[['Volume','Close','Change']]

IndexError: arrays used as indices must be of integer or boolean type

### 8. Removing columns or rows

In [85]:
# set inplace=True to drop the column from original dataset


In [None]:
data.drop(columns = [])

### 9. Sorting

In [173]:
# sort in descending order - sorts on index
data.sort_index(ascending=False)

AttributeError: 'DatetimeIndex' object has no attribute 'sort_index'

In [5]:
# reorders columns alphabetically
data.sort_values(by='Close')

In [6]:
# value_counts - useful summarization method for aggregating categorical data
