## DataFrame
DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. 

In [None]:
import numpy as np
import pandas as pd

* Getting Single Column
* Getting Multiple Columns
* Add New Column
* Remove Existing Column

In [2]:
# Column labels (W-Z) and row labels (A-E) used to build the example DataFrame
columns = list('WXYZ')
index = list('ABCDE')

In [3]:
# Seed NumPy's global random generator so the sample values are reproducible
np.random.seed(42)
# 5 rows x 4 columns of integers drawn from the half-open range [-100, 100)
data = np.random.randint(low=-100, high=100, size=(5, 4))
data

array([[  2,  79,  -8, -86],
       [  6, -29,  88, -80],
       [  2,  21, -26, -13],
       [ 16,  -1,   3,  51],
       [ 30,  49, -48, -99]])

In [4]:
# Build the DataFrame from the random matrix, labelling rows with `index`
# and columns with `columns`
df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [5]:
# Select a single column by its label; the result is a Series
df['W']

A     2
B     6
C     2
D    16
E    30
Name: W, dtype: int32

In [6]:
# A single column is a Series, while the whole table is a DataFrame
# (a DataFrame is a collection of Series sharing the same index)
for obj in (df['W'], df):
    print(type(obj))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [7]:
# Select multiple columns by passing a list of labels; the result is a DataFrame
df[['W','X']]

Unnamed: 0,W,X
A,2,79
B,6,-29
C,2,21
D,16,-1
E,30,49


### Feature Engineering

In [8]:
# Add a new column by assigning a list with one value per row
df['new'] = [10,20,30,40,50]

In [9]:
# Display df to confirm the 'new' column was appended
df

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,10
B,6,-29,88,-80,20
C,2,21,-26,-13,30
D,16,-1,3,51,40
E,30,49,-48,-99,50


In [25]:
# Derive new columns from existing ones (element-wise sum of 'X' and 'Y').
# Both spellings are created on purpose: 'SumX+Y' is removed later by re-assigning
# the result of drop(), and 'sumX+Y' by drop(..., inplace=True). Without the
# lower-case column the later in-place drop raises KeyError on a fresh kernel.
df['sumX+Y'] = df['X'] + df['Y']
df['SumX+Y'] = df['X'] + df['Y']

In [26]:
# Display df to inspect the newly derived sum column
df

Unnamed: 0,W,X,Y,Z,new,sumX+Y,SumX+Y
A,2,79,-8,-86,10,71,71
B,6,-29,88,-80,20,59,59
C,2,21,-26,-13,30,-5,-5
D,16,-1,3,51,40,2,2
E,30,49,-48,-99,50,1,1


In [28]:
# Remove a column from df (axis=1 means "columns");
# drop() returns a new DataFrame and leaves df itself untouched
df.drop('SumX+Y', axis=1)

Unnamed: 0,W,X,Y,Z,new,sumX+Y
A,2,79,-8,-86,10,71
B,6,-29,88,-80,20,59
C,2,21,-26,-13,30,-5
D,16,-1,3,51,40,2
E,30,49,-48,-99,50,1


In [32]:
# df.drop() does not modify df in place: the column is still there
df

Unnamed: 0,W,X,Y,Z,new,sumX+Y,SumX+Y
A,2,79,-8,-86,10,71,71
B,6,-29,88,-80,20,59,59
C,2,21,-26,-13,30,-5,-5
D,16,-1,3,51,40,2,2
E,30,49,-48,-99,50,1,1


In [37]:
# To remove the column permanently, re-assign the result of drop() to df
df = df.drop('SumX+Y', axis=1)

In [40]:
# The column 'SumX+Y' is no longer part of df
df

Unnamed: 0,W,X,Y,Z,new,sumX+Y
A,2,79,-8,-86,10,71
B,6,-29,88,-80,20,59
C,2,21,-26,-13,30,-5
D,16,-1,3,51,40,2
E,30,49,-48,-99,50,1


In [41]:
# Alternative: pass inplace=True so that df itself is modified (no re-assignment needed)
df.drop('sumX+Y', axis=1, inplace=True)

In [43]:
# The column 'sumX+Y' is no longer part of df
df

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,10
B,6,-29,88,-80,20
C,2,21,-26,-13,30
D,16,-1,3,51,40
E,30,49,-48,-99,50


### DataFrame
* Getting Single Row
* Getting Multiple Rows
* Add New Row
* Remove Existing Row

In [50]:
# loc selects rows by index label
# A single label returns that row as a Series
# The DataFrame's column names become the index of that Series
df.loc['A']

W       2
X      79
Y      -8
Z     -86
new    10
Name: A, dtype: int64

In [51]:
# Multiple rows: pass a list of index labels
# The result is a DataFrame and the columns are unchanged
df.loc[['A','E']]

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,10
E,30,49,-48,-99,50


In [52]:
# iloc selects rows by integer position (0-based) instead of by label
# A single position returns that row as a Series
# The DataFrame's column names become the index of that Series
df.iloc[0]

W       2
X      79
Y      -8
Z     -86
new    10
Name: A, dtype: int64

In [56]:
# Multiple rows by position: the slice 0:3 takes positions 0, 1 and 2
# (rows 'A', 'B', 'C' here, i.e. the same result as df.loc['A':'C'])
# The result is a DataFrame and the columns are unchanged
df.iloc[0:3]

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,10
B,6,-29,88,-80,20
C,2,21,-26,-13,30


In [57]:
# Remove a row by its index label
# This is not in place: drop() returns a new DataFrame and df keeps the row
# Re-assign the result or pass inplace=True to make the removal permanent
df.drop('C')

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,10
B,6,-29,88,-80,20
D,16,-1,3,51,40
E,30,49,-48,-99,50


In [58]:
# df is unchanged: row 'C' is still present because drop() returned a copy
df

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,10
B,6,-29,88,-80,20
C,2,21,-26,-13,30
D,16,-1,3,51,40
E,30,49,-48,-99,50


In [59]:
# inplace=True removes the row from df itself
df.drop('C', inplace=True)

In [63]:
# Row 'C' is no longer part of df
df

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,10
B,6,-29,88,-80,20
D,16,-1,3,51,40
E,30,49,-48,-99,50


### Selection of Subset of DataFrame 

In [67]:
# Select rows 'A' and 'D' and columns 'X' and 'Z' in a single .loc call
# (row labels first, column labels second) instead of chaining two lookups,
# which builds an intermediate DataFrame and is unsafe for later assignment.
df.loc[['A', 'D'], ['X', 'Z']]

Unnamed: 0,X,Z
A,79,-86
D,-1,51


In [68]:
# Look up one cell (row 'E', column 'new') with a single .loc call
# rather than chained indexing through an intermediate row Series.
df.loc['E', 'new']

50

## Conditional Selection

In [10]:
# Rebuild the full 5-row frame for this section: row 'C' was dropped in place
# earlier, but the examples below (and the later 5-value 'state' column) need
# rows A-E. Re-creating df here keeps the notebook reproducible top to bottom.
df = pd.DataFrame(data, index=index, columns=columns)
df['new'] = [10, 20, 30, 40, 50]
df

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,10
B,6,-29,88,-80,20
C,2,21,-26,-13,30
D,16,-1,3,51,40
E,30,49,-48,-99,50


In [16]:
# Keep only the positive values; every other cell becomes NaN
# (shown blank in the rendered table), and the shape of df is preserved
df[df > 0]

Unnamed: 0,W,X,Y,Z,new
A,2,79.0,,,10
B,6,,88.0,,20
C,2,21.0,,,30
D,16,,3.0,51.0,40
E,30,49.0,,,50


In [22]:
# Keep only the rows whose value in column 'X' is positive
# (df['X'] > 0 is a boolean Series used as a row mask)
df[df['X'] >0]

Unnamed: 0,W,X,Y,Z,new
A,2,79,-8,-86,10
C,2,21,-26,-13,30
E,30,49,-48,-99,50


In [24]:
# Rows where 'X' is positive, restricted to column 'X'.
# The result is a Series (a single column is always a Series).
# One .loc[row_mask, column] call replaces the chained df[mask]['X'] lookup.
df.loc[df['X'] > 0, 'X']

A    79
C    21
E    49
Name: X, dtype: int32

### Selection by multiple condition

In [28]:
# Rows that satisfy BOTH conditions: 'W' > 0 and 'Y' > 1.
# Boolean Series are combined element-wise with & (not the `and` keyword).
w_positive = df['W'] > 0
y_above_one = df['Y'] > 1
df[w_positive & y_above_one]

Unnamed: 0,W,X,Y,Z,new
B,6,-29,88,-80,20
D,16,-1,3,51,40


In [32]:
# Rows that satisfy AT LEAST ONE condition: 'W' < 0 or 'Y' > 1.
# Boolean Series are combined element-wise with | (not the `or` keyword).
w_negative = df['W'] < 0
y_above_one = df['Y'] > 1
df[w_negative | y_above_one]

Unnamed: 0,W,X,Y,Z,new
B,6,-29,88,-80,20
D,16,-1,3,51,40


### Index as a column
* df.reset_index()

In [40]:
# Move the current index into a regular column named 'index'
# The rows then get the default integer index 0, 1, 2, ...
# reset_index() returns a new DataFrame; df itself is not modified
df_reset = df.reset_index()
df_reset

Unnamed: 0,index,W,X,Y,Z,new
0,A,2,79,-8,-86,10
1,B,6,-29,88,-80,20
2,C,2,21,-26,-13,30
3,D,16,-1,3,51,40
4,E,30,49,-48,-99,50


In [41]:
# The former row labels are now an ordinary column that can be selected by name
df_reset['index']

0    A
1    B
2    C
3    D
4    E
Name: index, dtype: object

### Column as index
* df.set_index('existing column name')

In [49]:
# Drop the helper column 'new' so that only W-Z remain before 'state' is added
# (the outputs below show no 'new' column). errors='ignore' keeps this cell
# re-runnable once the column is already gone.
df = df.drop(columns='new', errors='ignore')
# One state code per row; used below first as a column and then as the index
new_index = ['CA', 'NY', 'WY', 'OR', 'CO']

In [51]:
# Add the state codes as a new column (the list needs one value per row)
df['state'] = new_index
df

Unnamed: 0,W,X,Y,Z,state
A,2,79,-8,-86,CA
B,6,-29,88,-80,NY
C,2,21,-26,-13,WY
D,16,-1,3,51,OR
E,30,49,-48,-99,CO


In [53]:
# Use the 'state' column as the index; set_index() returns a new DataFrame,
# so the result is assigned back to df
df = df.set_index('state')
df

Unnamed: 0_level_0,W,X,Y,Z
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2,79,-8,-86
NY,6,-29,88,-80
WY,2,21,-26,-13
OR,16,-1,3,51
CO,30,49,-48,-99


In [56]:
# 'state' is now the index only, so it is not listed among the columns
df.columns

Index(['W', 'X', 'Y', 'Z'], dtype='object')

### Some useful functions and attributes of a DataFrame
* df.describe()
* df.info()
* df.dtypes

In [58]:
# Summary statistics per numeric column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,11.2,23.8,1.8,-45.4
std,11.96662,42.109381,51.915316,63.366395
min,2.0,-29.0,-48.0,-99.0
25%,2.0,-1.0,-26.0,-86.0
50%,6.0,21.0,-8.0,-80.0
75%,16.0,49.0,3.0,-13.0
max,30.0,79.0,88.0,51.0


In [60]:
# Concise overview: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, CA to CO
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   W       5 non-null      int32
 1   X       5 non-null      int32
 2   Y       5 non-null      int32
 3   Z       5 non-null      int32
dtypes: int32(4)
memory usage: 120.0+ bytes


In [61]:
# Data type of each column, returned as a Series indexed by column name
df.dtypes

W    int32
X    int32
Y    int32
Z    int32
dtype: object