# pandas Refresher Part I

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Display the version string of the imported pandas package.
pd.__version__

'0.24.2'

### Comparing a NumPy array to a pandas DataFrame

In [3]:
# Seed NumPy's global generator so the 5x5 sample below is identical on every
# Restart & Run All; unseeded, the array (and every frame derived from it)
# would change on each execution.
np.random.seed(42)
arr = np.random.standard_normal((5, 5))
arr

array([[-0.26778015,  0.22772269,  1.35590743,  0.17916716, -0.11394779],
       [-2.2975288 ,  0.13037513,  0.79212126, -0.84978141,  0.13620768],
       [-0.78356928,  0.64671396, -0.73981546,  0.27048954,  0.19918343],
       [ 0.52835287, -0.41585217, -1.1820609 , -0.00522687, -1.82540673],
       [-0.29831601,  1.88401071,  0.30369638, -0.74970517,  0.36061248]])

In [4]:
# Wrap the array in a DataFrame (default integer row and column labels).
# copy=True gives the frame its own buffer: by default a 2-D ndarray is not
# copied, so the in-place edits made to `df` further down (e.g. zeroing the
# negative entries) could otherwise write through to `arr` as well.
df = pd.DataFrame(arr, copy=True)
df

Unnamed: 0,0,1,2,3,4
0,-0.26778,0.227723,1.355907,0.179167,-0.113948
1,-2.297529,0.130375,0.792121,-0.849781,0.136208
2,-0.783569,0.646714,-0.739815,0.27049,0.199183
3,0.528353,-0.415852,-1.182061,-0.005227,-1.825407
4,-0.298316,1.884011,0.303696,-0.749705,0.360612


In [8]:
# Limit how many rows pandas renders before truncating the display,
# then show the last three rows of the frame.
pd.set_option("display.max_rows", 10)
df.tail(n=3)

Unnamed: 0,0,1,2,3,4
2,-0.783569,0.646714,-0.739815,0.27049,0.199183
3,0.528353,-0.415852,-1.182061,-0.005227,-1.825407
4,-0.298316,1.884011,0.303696,-0.749705,0.360612


### Changing column names

In [9]:
# One single-letter label per column.
cols = list("ABCDE")
cols

['A', 'B', 'C', 'D', 'E']

In [10]:
# Replace the default integer column labels with the letter labels in `cols`.
# Assigning to `.columns` relabels the existing frame; the data is unchanged.
df.columns = cols
df

Unnamed: 0,A,B,C,D,E
0,-0.26778,0.227723,1.355907,0.179167,-0.113948
1,-2.297529,0.130375,0.792121,-0.849781,0.136208
2,-0.783569,0.646714,-0.739815,0.27049,0.199183
3,0.528353,-0.415852,-1.182061,-0.005227,-1.825407
4,-0.298316,1.884011,0.303696,-0.749705,0.360612


### Getting metadata

In [11]:
# Print a structural summary of df: index range, per-column non-null counts
# and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
A    5 non-null float64
B    5 non-null float64
C    5 non-null float64
D    5 non-null float64
E    5 non-null float64
dtypes: float64(5)
memory usage: 280.0 bytes


### Changing dataframe index

In [13]:
# Five consecutive calendar days starting on 1 January 2019.
index = pd.date_range(start="2019-01-01", periods=5, freq="D")
index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# Use the dates in `index` as the row labels. set_index returns a new frame,
# so rebind `df` to it rather than mutating the existing object with
# inplace=True (which has no performance benefit and prevents chaining).
df = df.set_index(index)
df

Unnamed: 0,A,B,C,D,E
2019-01-01,-0.26778,0.227723,1.355907,0.179167,-0.113948
2019-01-02,-2.297529,0.130375,0.792121,-0.849781,0.136208
2019-01-03,-0.783569,0.646714,-0.739815,0.27049,0.199183
2019-01-04,0.528353,-0.415852,-1.182061,-0.005227,-1.825407
2019-01-05,-0.298316,1.884011,0.303696,-0.749705,0.360612


### Subsetting data

In [17]:
# Column subset: every row, only the columns labelled "A" and "B".
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2019-01-01,-0.26778,0.227723
2019-01-02,-2.297529,0.130375
2019-01-03,-0.783569,0.646714
2019-01-04,0.528353,-0.415852
2019-01-05,-0.298316,1.884011


In [20]:
# Select the first row by integer position; the result is a Series whose
# labels are the column names.
df.iloc[0]

A   -0.267780
B    0.227723
C    1.355907
D    0.179167
E   -0.113948
Name: 2019-01-01 00:00:00, dtype: float64

In [25]:
# Select a row by label: the date string is matched against the DatetimeIndex
# and returns the 2019-01-01 row as a Series.
df.loc['2019-01-01']

A   -0.267780
B    0.227723
C    1.355907
D    0.179167
E   -0.113948
Name: 2019-01-01 00:00:00, dtype: float64

### Creating boolean masks

In [27]:
# Element-wise comparison yields a boolean frame shaped like df
# (True wherever the value is negative).
mask = df < 0
# Assigning through the mask overwrites only the True positions. This edits
# df itself, so frames displayed by earlier cells no longer reflect its contents.
df[mask] = 0
df

Unnamed: 0,A,B,C,D,E
2019-01-01,0.0,0.227723,1.355907,0.179167,0.0
2019-01-02,0.0,0.130375,0.792121,0.0,0.136208
2019-01-03,0.0,0.646714,0.0,0.27049,0.199183
2019-01-04,0.528353,0.0,0.0,0.0,0.0
2019-01-05,0.0,1.884011,0.303696,0.0,0.360612


### Creating dataframes from dicts

In [29]:
# Build a frame from a mapping of column label -> values:
# "A" counts 0..14 and "B" counts 15..29.
df2 = pd.DataFrame(dict(A=range(15), B=range(15, 30)))
df2

Unnamed: 0,A,B
0,0,15
1,1,16
2,2,17
3,3,18
4,4,19
...,...,...
10,10,25
11,11,26
12,12,27
13,13,28


### Adding columns

In [34]:
# Add column "C" holding the element-wise square of column "B".
df2["C"] = df2["B"].pow(2)
df2

Unnamed: 0,A,B,C
0,0,15,225
1,1,16,256
2,2,17,289
3,3,18,324
4,4,19,361
...,...,...,...
10,10,25,625
11,11,26,676
12,12,27,729
13,13,28,784


### Getting descriptive statistics

In [37]:
# Per-column summary statistics: count, mean, standard deviation, min,
# quartiles and max.
df2.describe()

Unnamed: 0,A,B,C
count,15.0,15.0,15.0
mean,7.0,22.0,502.666667
std,4.472136,4.472136,197.521307
min,0.0,15.0,225.0
25%,3.5,18.5,342.5
50%,7.0,22.0,484.0
75%,10.5,25.5,650.5
max,14.0,29.0,841.0


### Aggregation

In [39]:
# Seed the global generator so the synthetic sample is reproducible on re-run.
np.random.seed(2019)
# 100 random years in 2008..2019 (randint's upper bound is exclusive).
# NOTE: this rebinds `index`, which held the DatetimeIndex earlier in the notebook.
index = np.random.randint(2008, 2020, size=100)
# 100 draws from a normal distribution (mean 100, std 15), labelled by year.
# Passing index= to the constructor replaces the separate in-place set_index call.
df3 = pd.DataFrame(np.random.normal(100, 15, size=100), index=index)
df3

Unnamed: 0,0
2009,124.378870
2018,108.870704
2009,121.644833
2014,77.429134
2010,78.159405
...,...
2017,100.475277
2016,77.042840
2016,104.239844
2008,92.519500


In [42]:
# Mean value per year. Group on the frame's own index level instead of the
# global `index` array: that name is also bound to a DatetimeIndex earlier in
# the notebook, so relying on it makes the result depend on cell execution order.
df3.groupby(level=0).mean()

Unnamed: 0,0
2008,98.693762
2009,101.370750
2010,80.254261
2011,86.427983
2012,95.319531
...,...
2015,103.538378
2016,97.062157
2017,98.895844
2018,106.140560
