# DataFrame
## Construct a DataFrame

In [5]:
import pandas as pd

# Column-oriented input: every key becomes a column, every list holds that column's values.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


### Select the first five rows

In [6]:
df.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


### Specify a sequence of columns

In [7]:
pd.DataFrame(df,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


### Retrieve a column as a Series

In [8]:
df['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [14]:
df.state

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

### Rows can be retrieved by label with the special `loc` attribute (use `iloc` for integer position)

In [16]:
df.loc[3]

state    Nevada
year       2001
pop         2.4
Name: 3, dtype: object

### Columns can be modified by assignment

In [17]:
import numpy as np

In [20]:
# np.arange(6.) yields one float per row (0.0 through 5.0), so the new 'debt' column covers all six rows.
df['debt']=np.arange(6.)
df

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0.0
1,Ohio,2001,1.7,1.0
2,Ohio,2002,3.6,2.0
3,Nevada,2001,2.4,3.0
4,Nevada,2002,2.9,4.0
5,Nevada,2003,3.2,5.0


In [22]:
df.index=['one','two','three','four','five','six']
df

Unnamed: 0,state,year,pop,debt
one,Ohio,2000,1.5,0.0
two,Ohio,2001,1.7,1.0
three,Ohio,2002,3.6,2.0
four,Nevada,2001,2.4,3.0
five,Nevada,2002,2.9,4.0
six,Nevada,2003,3.2,5.0


In [24]:
# Assigning a Series aligns on index labels: only 'two', 'four' and 'five' receive values,
# and every other row of 'debt' ends up missing.
val=pd.Series([-1,-2,-3],index=['two','four','five'])
df['debt']=val
df

Unnamed: 0,state,year,pop,debt
one,Ohio,2000,1.5,
two,Ohio,2001,1.7,-1.0
three,Ohio,2002,3.6,
four,Nevada,2001,2.4,-2.0
five,Nevada,2002,2.9,-3.0
six,Nevada,2003,3.2,


### Delete a column with `del`, as with a dict

In [36]:
# State names are capitalised in the data ('Ohio'); comparing against 'ohio' would flag no row at all.
df['eastern']=df.state=='Ohio'
# del removes the column from df in place, the same way a key is deleted from a dict.
del df['eastern']
df.columns

Index(['state', 'year', 'pop', 'debt'], dtype='object')

### Nested dict of dicts

In [38]:
# Outer keys are state names, inner keys are years mapped to population figures.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

### Outer Key as columns and inner keys as rows

In [42]:
df3=pd.DataFrame(pop)

df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


### Transpose the DataFrame

In [41]:
df3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


### Dict of Series

In [43]:
# Each dict value is a Series slice: 'Ohio' drops its last row and 'Nevada' keeps its first two,
# so the resulting frame only carries the labels 2001 and 2002.
pdata={'Ohio':df3['Ohio'][:-1],'Nevada':df3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


### Set index and columns name attributes

In [45]:
# Name the row axis and the column axis; both names show up in the header when the frame is displayed.
df3.index.name='year'
df3.columns.name='state'
df3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


### The `values` attribute returns the data contained in the DataFrame as a two-dimensional ndarray:

In [46]:
df3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [50]:
# Keep a handle on the Series' Index object so it can be inspected on its own.
obj = pd.Series(range(3), index=list('abc'))
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [51]:
index[1:]

Index(['b', 'c'], dtype='object')

In [52]:
labels=pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [53]:
df3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [55]:
df3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [57]:
'Ohio' in df3.columns

True

In [61]:
2002 in df3.index

True

## 5.2 Essential Functionality 

## Reindexing

In [62]:
obj=pd.Series([3.4,4.3,-5.3,3.5],index=['a','b','c','d'])

In [65]:
obj1=obj.reindex(['c','d','a','b','e'])
obj1

c   -5.3
d    3.5
a    3.4
b    4.3
e    NaN
dtype: float64

## Forward Fills(Interpolation, filling of values)

In [67]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
# method='ffill' carries the last seen value forward into the newly introduced labels 1, 3 and 5.
obj4 = obj3.reindex(index=range(6), method='ffill')
obj4

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

## Reindexing in Dataframe

In [68]:
# 3x3 integer grid; the row labels deliberately skip 'b' so it can be introduced by reindexing.
df4 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
df4

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


_When passed only a sequence, it reindexes the rows in the result_

In [69]:
df5=df4.reindex(['a','b','c','d'])
df5

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


_Columns can be reindexed with the `columns` keyword_

In [70]:
df6=df5.reindex(columns=['California','Ohio','Texas'])
df6

Unnamed: 0,California,Ohio,Texas
a,2.0,0.0,1.0
b,,,
c,5.0,3.0,4.0
d,8.0,6.0,7.0


### Dropping Entries from an Axis
_Drop rows_

In [72]:
df7=df6.drop('b')
df7

Unnamed: 0,California,Ohio,Texas
a,2.0,0.0,1.0
c,5.0,3.0,4.0
d,8.0,6.0,7.0


_Drop columns_

In [73]:
df8=df6.drop(['Texas'],axis=1)
df8

Unnamed: 0,California,Ohio
a,2.0,0.0
b,,
c,5.0,3.0
d,8.0,6.0


_Drop without returning a new object_

In [78]:
# inplace=True modifies df8 directly instead of producing a new frame; afterwards only 'California' remains.
df8.drop('Ohio',axis=1,inplace=True)
df8

Unnamed: 0,California
a,2.0
b,
c,5.0
d,8.0


In [76]:
df7

Unnamed: 0,California,Ohio
a,2.0,0.0
c,5.0,3.0
d,8.0,6.0


### Indexing, Selection, and Filtering

In [79]:
obj=pd.Series(np.arange(4.),index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

### Indexing
**Indexing in Series**

In [88]:
obj=pd.Series(np.arange(4.0),index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

_Access using  index or integers_

In [89]:
obj['c']

2.0

In [90]:
# Positional access goes through .iloc; a plain obj[2] on a string-labelled Series relies on a
# deprecated integer fallback.
obj.iloc[2]

2.0

In [91]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

### Slicing in Series
**Slicing with labels behaves differently from normal Python slicing (endpoints are inclusive)**

In [92]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

**Setting**

In [93]:
obj['b':'c']=5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

**Indexing into a DataFrame retrieves one or more columns, either with a single value or a sequence**

In [94]:
# 4x4 integer grid labelled by state (rows) and ordinal names (columns).
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=['One', 'Two', 'Three', 'Four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)

In [95]:
data

Unnamed: 0,One,Two,Three,Four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [99]:
data[['Two','Four']]

Unnamed: 0,Two,Four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


**Passing a single element or a list to the square brackets selects columns; a slice such as `data[:2]` selects rows instead**

In [102]:
data[:2]

Unnamed: 0,One,Two,Three,Four
Ohio,0,1,2,3
Colorado,4,5,6,7


**Slicing or selecting data with a boolean array**

In [103]:
data[data['Three']>5]

Unnamed: 0,One,Two,Three,Four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


**Indexing with a boolean DataFrame (here produced by a scalar comparison)**

In [104]:
# data<5 yields a same-shaped boolean mask; every cell where it is True is overwritten with 0 in place.
data[data<5]=0
data

Unnamed: 0,One,Two,Three,Four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### Selection with loc and iloc
**Selection with loc**

In [105]:
data.loc['Colorado',['Two','Three']]

Two      5
Three    6
Name: Colorado, dtype: int64

**Selection with iloc**

In [106]:
data.iloc[2,[3,0,1]]

Four    11
One      8
Two      9
Name: Utah, dtype: int64

**Both indexers work with slices in addition to single labels or lists of labels**

In [108]:
data.loc[:'Utah','Two']

Ohio        0
Colorado    5
Utah        9
Name: Two, dtype: int64

In [109]:
data.iloc[:,:3][data['Three']>5]

Unnamed: 0,One,Two,Three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [110]:
data.iloc[:,3]

Ohio         0
Colorado     7
Utah        11
New York    15
Name: Four, dtype: int64

In [111]:
# A bare `data.iloc` only evaluates to the indexer object; index it to select the fourth column by position.
data.iloc[:,3]

Ohio         0
Colorado     7
Utah        11
New York    15
Name: Four, dtype: int64

## Arithmetic and Data Alignment
### Alignment in Series
**If any index pairs are not the same, the respective index in the result will be the union of the index pairs**

In [113]:
# Two Series whose labels only partly overlap: 'b' exists only in s1 and 'e' only in s2.
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=list('abcd'))
s2 = pd.Series([-2.1, 3.4, 4.2, 9], index=list('adce'))

In [117]:
s1+s2

a    5.2
b    NaN
c    7.6
d    4.9
e    NaN
dtype: float64

### Alignment in DataFrame
**Performed on both the rows and the columns**

In [118]:
# Two float frames whose row labels and column labels only partly overlap.
df1 = pd.DataFrame(
    np.arange(9.).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [120]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [121]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [119]:
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


### Arithmetic methods with fill values

In [123]:
df1.add(df2,fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


## Operations between DataFrame and Series
**Arithmetic between DataFrame and Series matches the index
of the Series on the DataFrame's columns, broadcasting down 
the rows**

In [126]:
# 4x3 integer frame used for the DataFrame-versus-Series arithmetic below.
df3 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [127]:
df3

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [131]:
series=df3.iloc[0]

In [132]:
df3-series

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


**If an index value is not found in either the DataFrame's columns or the Series' index, the objects are reindexed to form the union**

In [133]:
# 'f' exists only in the Series and 'd' only in the frame, so both of those result columns are entirely missing.
series2=pd.Series(range(3),index=['b','e','f'])
df3+series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


## Broadcast over columns
**Use arithmetic methods**

In [138]:
series3=df3['d']

In [139]:
# axis='index' matches the Series against the row labels, so each row has its own 'd' value subtracted.
df3.sub(series3,axis='index')

Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1


## Function Application and Mapping

In [141]:
# 4x3 frame of uniform random draws; no seed is set here, so the values differ on every run.
frame = pd.DataFrame(
    np.random.rand(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [142]:
frame

Unnamed: 0,b,d,e
Utah,0.834096,0.112537,0.395777
Ohio,0.183482,0.788294,0.314072
Texas,0.452546,0.783952,0.4118
Oregon,0.722348,0.174958,0.425795


In [143]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.834096,0.112537,0.395777
Ohio,0.183482,0.788294,0.314072
Texas,0.452546,0.783952,0.4118
Oregon,0.722348,0.174958,0.425795


#### Apply a function on one-dimensional arrays to each column or row

In [144]:
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest value."""
    return x.max() - x.min()

**Apply to each column (default, `axis='index'`)**

In [145]:
frame.apply(f)

b    0.650614
d    0.675757
e    0.111723
dtype: float64

**Apply to each row (`axis='columns'`)**

In [146]:
frame.apply(f,axis='columns')

Utah      0.721559
Ohio      0.604812
Texas     0.372152
Oregon    0.547389
dtype: float64

**Element-wise python functions**

In [147]:
def format(x):
    """Render ``x`` as a string with exactly two decimal places.

    Note: this name shadows the built-in ``format`` for the rest of the notebook.
    """
    return '%.2f' % x

In [148]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.83,0.11,0.4
Ohio,0.18,0.79,0.31
Texas,0.45,0.78,0.41
Oregon,0.72,0.17,0.43
