

<a href='https://huntsman.usu.edu/directory/jahangiry-pedram'> <img src="logo.jpg" /></a>

___
## Pedram Jahangiry 

# Introduction to Pandas

Topics to be covered:

1. Series
2. DataFrames
3. Missing Variables
4. Operations
5. Data import and export

Make sure you have access to the pandas cheatsheet provided in the course folder. 

## 1. Series

Series are very similar to NumPy arrays. The difference is that a Series can have axis labels, meaning it can be indexed by a label, instead of just a number location. We can convert a list,numpy array, or dictionary to a Series.

In [5]:
import numpy as np
import pandas as pd

In [7]:
my_list = [1,2,3]
my_list

[1, 2, 3]

In [8]:
pd.Series(data=my_list)

0    1
1    2
2    3
dtype: int64

In [9]:
labels = ['a','b','c']
pd.Series(data=my_list,index=labels)

a    1
b    2
c    3
dtype: int64

In [10]:
my_array = np.array([1,2,3])
pd.Series(my_array)

0    1
1    2
2    3
dtype: int32

In [11]:
pd.Series(my_array,labels)

a    1
b    2
c    3
dtype: int32

In [12]:
my_dict = {'a':1,'b':2,'c':3}
pd.Series(my_dict)

a    1
b    2
c    3
dtype: int64

In [13]:
my_series = pd.Series(my_dict)
my_series

a    1
b    2
c    3
dtype: int64

In [14]:
my_series[0]         # unlike dictionaries, we can extract info by index number and label. 

1

In [15]:
my_series['a']

1

## 2. DataFrames

DataFrames are directly inspired by the R programming language and are the workhorse of pandas.

In [16]:
np.random.seed(100)  # do this if you want to see the same results as mine

In [21]:
df = pd.DataFrame(data= np.random.randn(4,4),index='A B C D'.split(),columns='W X Y Z'.split())
df

Unnamed: 0,W,X,Y,Z
A,-1.613579,1.470714,-1.188018,-0.549746
B,-0.940046,-0.827932,0.108863,0.50781
C,-0.862227,1.24947,-0.079611,-0.889731
D,-0.881798,0.018639,0.237845,0.013549


In [30]:
df.describe()

Unnamed: 0,W,X,Y,Z
count,4.0,4.0,4.0,4.0
mean,-1.074413,0.477723,-0.23023,-0.22953
std,0.36096,1.079679,0.651696,0.616755
min,-1.613579,-0.827932,-1.188018,-0.889731
25%,-1.108429,-0.193004,-0.356713,-0.634743
50%,-0.910922,0.634054,0.014626,-0.268099
75%,-0.876906,1.304781,0.141109,0.137114
max,-0.862227,1.470714,0.237845,0.50781


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 4 columns):
W    4 non-null float64
X    4 non-null float64
Y    4 non-null float64
Z    4 non-null float64
dtypes: float64(4)
memory usage: 160.0+ bytes


In [31]:
df.describe().transpose()              # or equivalently, df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
W,4.0,-1.074413,0.36096,-1.613579,-1.108429,-0.910922,-0.876906,-0.862227
X,4.0,0.477723,1.079679,-0.827932,-0.193004,0.634054,1.304781,1.470714
Y,4.0,-0.23023,0.651696,-1.188018,-0.356713,0.014626,0.141109,0.237845
Z,4.0,-0.22953,0.616755,-0.889731,-0.634743,-0.268099,0.137114,0.50781


### Indexing and extraction

In [32]:
df

Unnamed: 0,W,X,Y,Z
A,-1.613579,1.470714,-1.188018,-0.549746
B,-0.940046,-0.827932,0.108863,0.50781
C,-0.862227,1.24947,-0.079611,-0.889731
D,-0.881798,0.018639,0.237845,0.013549


In [33]:
df['W']   

A   -1.613579
B   -0.940046
C   -0.862227
D   -0.881798
Name: W, dtype: float64

In [34]:
df[['W']]

Unnamed: 0,W
A,-1.613579
B,-0.940046
C,-0.862227
D,-0.881798


In [35]:
df[['W','Y']]

Unnamed: 0,W,Y
A,-1.613579,-1.188018
B,-0.940046,0.108863
C,-0.862227,-0.079611
D,-0.881798,0.237845


In [36]:
df['new'] = df['W'] + df['Y']

In [37]:
df

Unnamed: 0,W,X,Y,Z,new
A,-1.613579,1.470714,-1.188018,-0.549746,-2.801596
B,-0.940046,-0.827932,0.108863,0.50781,-0.831183
C,-0.862227,1.24947,-0.079611,-0.889731,-0.941839
D,-0.881798,0.018639,0.237845,0.013549,-0.643954


In [39]:
df.drop('A',axis=0)

Unnamed: 0,W,X,Y,Z,new
B,-0.940046,-0.827932,0.108863,0.50781,-0.831183
C,-0.862227,1.24947,-0.079611,-0.889731,-0.941839
D,-0.881798,0.018639,0.237845,0.013549,-0.643954


In [40]:
df.drop('new',axis=1)   

Unnamed: 0,W,X,Y,Z
A,-1.613579,1.470714,-1.188018,-0.549746
B,-0.940046,-0.827932,0.108863,0.50781
C,-0.862227,1.24947,-0.079611,-0.889731
D,-0.881798,0.018639,0.237845,0.013549


In [41]:
df

Unnamed: 0,W,X,Y,Z,new
A,-1.613579,1.470714,-1.188018,-0.549746,-2.801596
B,-0.940046,-0.827932,0.108863,0.50781,-0.831183
C,-0.862227,1.24947,-0.079611,-0.889731,-0.941839
D,-0.881798,0.018639,0.237845,0.013549,-0.643954


In [42]:
df.drop('new',axis=1,inplace=True)    
# or alternatively use: df = df.drop('new', 1)

In [43]:
df

Unnamed: 0,W,X,Y,Z
A,-1.613579,1.470714,-1.188018,-0.549746
B,-0.940046,-0.827932,0.108863,0.50781
C,-0.862227,1.24947,-0.079611,-0.889731
D,-0.881798,0.018639,0.237845,0.013549


In [48]:
# we can select a row by calling its label or by selecting based on its position instead of label 
df.loc['A']

W   -1.613579
X    1.470714
Y   -1.188018
Z   -0.549746
Name: A, dtype: float64

In [45]:
df.iloc[0]

W   -1.613579
X    1.470714
Y   -1.188018
Z   -0.549746
Name: A, dtype: float64

In [49]:
df.iloc[np.arange(3)]

Unnamed: 0,W,X,Y,Z
A,-1.613579,1.470714,-1.188018,-0.549746
B,-0.940046,-0.827932,0.108863,0.50781
C,-0.862227,1.24947,-0.079611,-0.889731


In [50]:
df.loc[['A','D'],['W','Z']]

Unnamed: 0,W,Z
A,-1.613579,-0.549746
D,-0.881798,0.013549


### Conditional extraction
This is very similar to numpy array conditional extraction

In [51]:
df

Unnamed: 0,W,X,Y,Z
A,-1.613579,1.470714,-1.188018,-0.549746
B,-0.940046,-0.827932,0.108863,0.50781
C,-0.862227,1.24947,-0.079611,-0.889731
D,-0.881798,0.018639,0.237845,0.013549


In [52]:
df>0

Unnamed: 0,W,X,Y,Z
A,False,True,False,False
B,False,False,True,True
C,False,True,False,False
D,False,True,True,True


In [54]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,,1.470714,,
B,,,0.108863,0.50781
C,,1.24947,,
D,,0.018639,0.237845,0.013549


In [55]:
df[df['Y']>0]

Unnamed: 0,W,X,Y,Z
B,-0.940046,-0.827932,0.108863,0.50781
D,-0.881798,0.018639,0.237845,0.013549


In [56]:
df[df['Y']>0]['X']

B   -0.827932
D    0.018639
Name: X, dtype: float64

In [57]:
df[df['Y']>0][['Y','Z']]

Unnamed: 0,Y,Z
B,0.108863,0.50781
D,0.237845,0.013549


In [60]:
df[(df['Y']>0) & (df['Z'] < 0.5)]

Unnamed: 0,W,X,Y,Z
D,-0.881798,0.018639,0.237845,0.013549


In [61]:
df

Unnamed: 0,W,X,Y,Z
A,-1.613579,1.470714,-1.188018,-0.549746
B,-0.940046,-0.827932,0.108863,0.50781
C,-0.862227,1.24947,-0.079611,-0.889731
D,-0.881798,0.018639,0.237845,0.013549


In [63]:
df.tail(2)

Unnamed: 0,W,X,Y,Z
C,-0.862227,1.24947,-0.079611,-0.889731
D,-0.881798,0.018639,0.237845,0.013549


## 3. Missing variables

In [64]:
df = pd.DataFrame({'A':[1,2,np.nan],
                  'B':[5,np.nan,np.nan],
                  'C':[1,2,3]})
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [67]:
df.isnull()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [68]:
df.dropna() # by default axis=0,   # this is similar to df[complete.cases(df),] in R

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [69]:
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [72]:
df.fillna('new value')

Unnamed: 0,A,B,C
0,1,5,1
1,2,new value,2
2,new value,new value,3


In [73]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [74]:
df['A'].mean()

1.5

In [75]:
df['A'].fillna(value=df['A'].mean()) # filling the value with the mean of a column

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64


## 4. Operations

In [76]:
df = pd.DataFrame({'names': 'PJ PJ TJ MJ'.split() ,
                   'GPA': [4,4,3.8,3.5]}, index='A B C D'.split())
df

Unnamed: 0,names,GPA
A,PJ,4.0
B,PJ,4.0
C,TJ,3.8
D,MJ,3.5


In [77]:
df.head(3)

Unnamed: 0,names,GPA
A,PJ,4.0
B,PJ,4.0
C,TJ,3.8


In [78]:
df.tail(1)

Unnamed: 0,names,GPA
D,MJ,3.5


In [79]:
# Unique Values
df['names'].unique()

array(['PJ', 'TJ', 'MJ'], dtype=object)

In [82]:
# number of unique values
df['GPA'].nunique()

3

In [83]:
df['names'].value_counts()     # this is table(df$names) in R

PJ    2
MJ    1
TJ    1
Name: names, dtype: int64

In [84]:
# Applying Functions
df['GPA'].mean()

3.825

In [85]:
round(df['GPA'].std(), 2)

0.24

In [86]:
df['GPA_100'] = df['GPA'].apply(lambda x: x*25)   # of ocurse we are looking for some special functions not just x*25
df

# or alternatively: df['GPA_100']=list(map(lambda x:x*25 , df['GPA']))

Unnamed: 0,names,GPA,GPA_100
A,PJ,4.0,100.0
B,PJ,4.0,100.0
C,TJ,3.8,95.0
D,MJ,3.5,87.5


In [87]:
df.columns            # names(df) in R

Index(['names', 'GPA', 'GPA_100'], dtype='object')

In [88]:
df.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [89]:
df.sort_values(by='GPA') #inplace=False by default (what does this mean?)

Unnamed: 0,names,GPA,GPA_100
D,MJ,3.5,87.5
C,TJ,3.8,95.0
A,PJ,4.0,100.0
B,PJ,4.0,100.0


In [90]:
df.reset_index(inplace=True)

In [91]:
df

Unnamed: 0,index,names,GPA,GPA_100
0,A,PJ,4.0,100.0
1,B,PJ,4.0,100.0
2,C,TJ,3.8,95.0
3,D,MJ,3.5,87.5


In [94]:
df.set_index('index')

Unnamed: 0_level_0,names,GPA,GPA_100
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,PJ,4.0,100.0
B,PJ,4.0,100.0
C,TJ,3.8,95.0
D,MJ,3.5,87.5


#### Renaming columns:

In [95]:
df.rename(columns={'GPA_100': 'Normalized GPA'}, inplace=True)

In [96]:
df.head()

Unnamed: 0,index,names,GPA,Normalized GPA
0,A,PJ,4.0,100.0
1,B,PJ,4.0,100.0
2,C,TJ,3.8,95.0
3,D,MJ,3.5,87.5


#### Conditional mutation (creating new variables)
Let's say we want to create a new variable named "is_pass" and is defined as "is_pass"=1 if "GPA">3.6 and 0 otherwise. There are multiple ways to do that:

In [97]:
# let's begin with the hard way!
df['GPA']>3.6

0     True
1     True
2     True
3    False
Name: GPA, dtype: bool

In [98]:
temp=[]
for x in df['GPA']:
    if x>3.6: temp.append('pass') 
    else: temp.append('fail')

df['is_pass']=temp
df

Unnamed: 0,index,names,GPA,Normalized GPA,is_pass
0,A,PJ,4.0,100.0,pass
1,B,PJ,4.0,100.0,pass
2,C,TJ,3.8,95.0,pass
3,D,MJ,3.5,87.5,fail


Can we do it in one line? How about using list comprehension? 

In [99]:
df['is_pass2'] = ['pass' if x > 3.6 else 'fail' for x in df['GPA']]    
# in R: df <- mutate(df,is_pass = ifelse(GPA>3.5, "pass", "fail"))
df

Unnamed: 0,index,names,GPA,Normalized GPA,is_pass,is_pass2
0,A,PJ,4.0,100.0,pass,pass
1,B,PJ,4.0,100.0,pass,pass
2,C,TJ,3.8,95.0,pass,pass
3,D,MJ,3.5,87.5,fail,fail


Is there any other way?

In [100]:
df.loc[df['GPA']> 3.6, 'is_pass3'] ='pass'
df.loc[df['GPA']<=3.6, 'is_pass3'] ='fail'
df.head()

Unnamed: 0,index,names,GPA,Normalized GPA,is_pass,is_pass2,is_pass3
0,A,PJ,4.0,100.0,pass,pass,pass
1,B,PJ,4.0,100.0,pass,pass,pass
2,C,TJ,3.8,95.0,pass,pass,pass
3,D,MJ,3.5,87.5,fail,fail,fail


## 5. Data import and export

In [None]:
pd.read_csv()

In [101]:
# reading from CSV file
df = pd.read_csv('GDP.csv')   # reading excel files: pd.read_excel('GDP.xlsx',sheetname='Sheet1')
df.tail(5)

Unnamed: 0,DATE,GDP
285,2018-04-01,20510.177
286,2018-07-01,20749.752
287,2018-10-01,20897.804
288,2019-01-01,21098.827
289,2019-04-01,21339.121


In [102]:
# Writing to CSV file
df.to_csv('GDP_new.csv',index=False)   # writing to excel files: df.to_excel('GDP.xlsx',sheet_name='raw data')

In [103]:
df.to_excel('GDP.xlsx',sheet_name='raw data', index=False)