# Introduction to Pandas

In this section of the course we will learn how to use pandas for data analysis. You should go through the notebooks in this order:

* Series
* DataFrames
* Missing Data
* GroupBy
* Merging, Joining, and Concatenating
* Operations
* Data Input and Output

# Series

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Sample data reused by the Series examples: three short labels and their
# string codes, held as a list, a NumPy array and a dict.
_labels_list = 'ank ist izm'.split()
_data = '06 34 35'.split()
_arr_np = np.array(_data)
# Pair each label with its code, in order.
_dictionary = dict(zip(_labels_list, _data))

In [4]:
# pass a Python list to pandas
pd.Series(data=_data)

0    06
1    34
2    35
dtype: object

In [5]:
# pass numpy array to the pandas
pd.Series(data= _arr_np)

0    06
1    34
2    35
dtype: object

In [6]:
# pass dictionary to the pandas
pd.Series(_dictionary)

ank    06
ist    34
izm    35
dtype: object

In [7]:
_pd = pd.Series(data=_arr_np, index=_labels_list)

In [8]:
_pd

ank    06
ist    34
izm    35
dtype: object

In [9]:
# you can access the data by its index label
_pd['ank']

'06'

In [10]:
# Integer Series labelled 'a' through 'e'.
_ser1 = pd.Series(
    data=[45, 2, 45, 76, 77],
    index=list('abcde'),
)

In [11]:
_ser1

a    45
b     2
c    45
d    76
e    77
dtype: int64

In [12]:
# Integer Series labelled 'a'-'d' and 'f' (note: no 'e' label).
_ser2 = pd.Series(
    data=[34, 45, 56, 78, 89],
    index=list('abcdf'),
)

In [13]:
_ser2

a    34
b    45
c    56
d    78
f    89
dtype: int64

In [14]:
_ser1 + _ser2

a     79.0
b     47.0
c    101.0
d    154.0
e      NaN
f      NaN
dtype: float64

# DataFrames

In [15]:
from numpy.random import randn

In [16]:
# 3x5 DataFrame of random draws from randn; rows are labelled A-C.
# NOTE: the label 'FF' is used for two columns, so dropping 'FF' removes both.
# No seed is set in the visible cells, so the values change on every run.
_df = pd.DataFrame(
    data=randn(3, 5),
    index=['A', 'B', 'C'],
    columns=['DD', 'EE', 'FF', 'GG', 'FF'],
)

In [17]:
_df

Unnamed: 0,DD,EE,FF,GG,FF.1
A,-1.755283,-0.693847,-0.003175,1.364844,0.154238
B,0.158059,1.327743,-0.224515,-0.306344,0.632991
C,-0.184397,-0.014375,1.02607,0.234434,0.195283


In [18]:
# you can access by columns
_df['GG']

A    1.364844
B   -0.306344
C    0.234434
Name: GG, dtype: float64

In [19]:
type(_df['GG'])

pandas.core.series.Series

In [20]:
type(_df)

pandas.core.frame.DataFrame

In [21]:
_df[['GG','DD']]

Unnamed: 0,GG,DD
A,1.364844,-1.755283
B,-0.306344,0.158059
C,0.234434,-0.184397


In [22]:
_df['new'] = _df['EE'] + _df['DD']

In [23]:
_df

Unnamed: 0,DD,EE,FF,GG,FF.1,new
A,-1.755283,-0.693847,-0.003175,1.364844,0.154238,-2.44913
B,0.158059,1.327743,-0.224515,-0.306344,0.632991,1.485802
C,-0.184397,-0.014375,1.02607,0.234434,0.195283,-0.198771


In [24]:
# axis=0 refers to the index (rows); axis=1 refers to the columns
_df.drop('FF',axis=1)

Unnamed: 0,DD,EE,GG,new
A,-1.755283,-0.693847,1.364844,-2.44913
B,0.158059,1.327743,-0.306344,1.485802
C,-0.184397,-0.014375,0.234434,-0.198771


In [25]:
_df

Unnamed: 0,DD,EE,FF,GG,FF.1,new
A,-1.755283,-0.693847,-0.003175,1.364844,0.154238,-2.44913
B,0.158059,1.327743,-0.224515,-0.306344,0.632991,1.485802
C,-0.184397,-0.014375,1.02607,0.234434,0.195283,-0.198771


In [26]:
# Remove the 'FF' column(s) from _df itself: inplace=True mutates the frame
# rather than returning a modified copy.
_df.drop(columns=['FF'], inplace=True)

In [27]:
_df

Unnamed: 0,DD,EE,GG,new
A,-1.755283,-0.693847,1.364844,-2.44913
B,0.158059,1.327743,-0.306344,1.485802
C,-0.184397,-0.014375,0.234434,-0.198771


In [28]:
# shape is (rows, columns); these are counts, so they do not start from zero
_df.shape

(3, 4)

In [29]:
# Selecting Rows
_df.loc['A']
# returns a Series

DD    -1.755283
EE    -0.693847
GG     1.364844
new   -2.449130
Name: A, dtype: float64

In [30]:
# Selecting rows with numeric indexes with iloc

In [31]:
_df.iloc[0]

DD    -1.755283
EE    -0.693847
GG     1.364844
new   -2.449130
Name: A, dtype: float64

In [32]:
_df > 0


Unnamed: 0,DD,EE,GG,new
A,False,False,True,False
B,True,True,False,True
C,False,False,True,False


In [33]:
_df.loc[['A','C'],['EE','new']]

Unnamed: 0,EE,new
A,-0.693847,-2.44913
C,-0.014375,-0.198771


In [34]:
_df1 = _df > 0


In [35]:
_df1

Unnamed: 0,DD,EE,GG,new
A,False,False,True,False
B,True,True,False,True
C,False,False,True,False


In [36]:
_df[_df1]

Unnamed: 0,DD,EE,GG,new
A,,,1.364844,
B,0.158059,1.327743,,1.485802
C,,,0.234434,


In [37]:
_df[_df['DD']>0]

Unnamed: 0,DD,EE,GG,new
B,0.158059,1.327743,-0.306344,1.485802


In [38]:
_df

Unnamed: 0,DD,EE,GG,new
A,-1.755283,-0.693847,1.364844,-2.44913
B,0.158059,1.327743,-0.306344,1.485802
C,-0.184397,-0.014375,0.234434,-0.198771


In [39]:
# Column GG for the rows where DD is positive, selected in a single .loc
# call (row mask, column label) instead of two chained [] lookups.
_df.loc[_df['DD'] > 0, 'GG']

B   -0.306344
Name: GG, dtype: float64

In [44]:
# Python's "and" keyword does not work on Series; combine conditions with & and wrap each one in parentheses
_df[(_df['GG'] > 0) & (_df['new'] > 1)]

Unnamed: 0,DD,EE,GG,new


In [45]:
_df[(_df['GG'] > 0) | (_df['new'] > 1)]

Unnamed: 0,DD,EE,GG,new
A,-1.755283,-0.693847,1.364844,-2.44913
B,0.158059,1.327743,-0.306344,1.485802
C,-0.184397,-0.014375,0.234434,-0.198771


In [46]:
_df.reset_index()

Unnamed: 0,index,DD,EE,GG,new
0,A,-1.755283,-0.693847,1.364844,-2.44913
1,B,0.158059,1.327743,-0.306344,1.485802
2,C,-0.184397,-0.014375,0.234434,-0.198771


In [47]:
# Three string values, one per row of _df, used below as a new column.
newCol = ['tab', 'teh', 'ist']

In [48]:
newCol

['tab', 'teh', 'ist']

In [49]:
_df['newCol'] = newCol

In [50]:
_df

Unnamed: 0,DD,EE,GG,new,newCol
A,-1.755283,-0.693847,1.364844,-2.44913,tab
B,0.158059,1.327743,-0.306344,1.485802,teh
C,-0.184397,-0.014375,0.234434,-0.198771,ist


In [54]:
_df.set_index('newCol')

Unnamed: 0_level_0,DD,EE,GG,new
newCol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tab,-1.755283,-0.693847,1.364844,-2.44913
teh,0.158059,1.327743,-0.306344,1.485802
ist,-0.184397,-0.014375,0.234434,-0.198771
