### Dataframes

In [3]:
import pandas as pd 
import numpy as np

#### How can we create dataframes?

In [4]:
# Column-oriented input: every key becomes a column label and every list
# supplies the values of that column.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)

In [5]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [6]:
# `columns` fixes the order in which the columns appear in the resulting dataframe.
pd.DataFrame(data, columns= ['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [7]:
# A custom index can be supplied as well; here the rows are labelled 1..6
# instead of the default 0..5.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop'],
    index=list(range(1, 7)),
)

In [8]:
frame2

Unnamed: 0,year,state,pop
1,2000,Ohio,1.5
2,2001,Ohio,1.7
3,2002,Ohio,3.6
4,2001,Nevada,2.4
5,2002,Nevada,2.9
6,2003,Nevada,3.2


In [9]:
frame2.year  # attribute (dot) access returns the 'year' column as a Series

1    2000
2    2001
3    2002
4    2001
5    2002
6    2003
Name: year, dtype: int64

In [10]:
frame2.index

Int64Index([1, 2, 3, 4, 5, 6], dtype='int64')

In [11]:
frame2.columns

Index(['year', 'state', 'pop'], dtype='object')

In [12]:
frame2['year']  # bracket syntax retrieves the same column as frame2.year; either form can be used to read a column

1    2000
2    2001
3    2002
4    2001
5    2002
6    2003
Name: year, dtype: int64

In [13]:
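#to find particular elements within a dataframe there are two indexers we can use:
#loc (label-based) and iloc (integer position-based); the older ix indexer was deprecated and has been removed from pandas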

In [14]:
frame2.loc[1]  # loc is label-based: this is the row labelled 1 (the first row here), returned as a Series

year     2000
state    Ohio
pop       1.5
Name: 1, dtype: object

In [15]:
# Fill the new 'debt' column with the floats 0.0 .. 5.0 (one value per row).
frame2['debt'] = np.arange(6, dtype=float)

In [16]:
# A Series can be inserted as a column of a dataframe; its index labels (2, 4, 5)
# decide which rows receive a value.
val2 = pd.Series({2: -1.2, 4: -1.5, 5: -1.7})

In [17]:
frame2['debt'] = val2  # assignment aligns on the index: each value goes to the row with the matching label

In [18]:
frame2  # rows whose index label has no entry in val2 (1, 3 and 6) hold NaN in 'debt'

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,
2,2001,Ohio,1.7,-1.2
3,2002,Ohio,3.6,
4,2001,Nevada,2.4,-1.5
5,2002,Nevada,2.9,-1.7
6,2003,Nevada,3.2,


In [19]:
# Boolean values can be stored as a column too. A new column has to be created
# with the bracket syntax; assigning to frame2.eastern would not add a column.
frame2['eastern'] = frame2['state'].eq('Ohio')
frame2

Unnamed: 0,year,state,pop,debt,eastern
1,2000,Ohio,1.5,,True
2,2001,Ohio,1.7,-1.2,True
3,2002,Ohio,3.6,,True
4,2001,Nevada,2.4,-1.5,False
5,2002,Nevada,2.9,-1.7,False
6,2003,Nevada,3.2,,False


In [20]:
del frame2['eastern']  # del removes the column from frame2 in place

In [21]:
# Nested dictionaries can be used as well: the outer keys act as columns and
# the inner keys as index labels.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)

In [22]:
# Outer keys become the columns and the inner keys are combined into the row index;
# a (year, state) pair that is missing from the input shows up as NaN.
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [23]:
# A dict of Series is accepted as well: each Series becomes a column. Here the
# columns are positional slices of frame3 (Ohio without its last row and the
# first two rows of Nevada).
pdata = {
    'Ohio': frame3['Ohio'].iloc[:-1],
    'Nevada': frame3['Nevada'].iloc[:2],
}

In [24]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [25]:
# Both axes can carry a name, which is shown when the frame is displayed.
frame3.index.name = "year"
frame3.columns.name = "state"
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [26]:
frame3.values  # the underlying data as a two-dimensional NumPy array

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [27]:
#Index Objects: these are responsible for holding the axis labels and other metadata (e.g. axis names)
#Note that indices are immutable objects

In [28]:
labels = pd.Index(np.arange(3))

In [29]:
obj2 = pd.Series([-1.2, -1.5, -1.7], index = labels)

In [30]:
obj2.index is labels  # True: the Series reuses the very same Index object instead of copying it (separately, note that an Index may contain duplicate labels)

True

In [31]:
# A small 3x3 frame with non-contiguous row labels ('a', 'c', 'd'); the
# original note attached to this cell is that a dataframe can be reindexed.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)

In [32]:
states = ['Utah', 'Ohio', 'Texas']
# Reindex the columns to `states`: existing columns are kept in the new order
# and a label that is not present in frame (here 'Utah') yields a column of NaN.
# reindex returns a new dataframe; frame itself is left unchanged.
frame.reindex(columns=states)

In [33]:
#we can 'drop' data by using the drop() method: name_of_dataframe.drop([list of labels], axis = 0|1, inplace = True|False)

In [34]:
#The standard way to find elements in a dataframe is via loc (labels) and iloc (integer positions); let's have a look at some examples


In [35]:
# 4x4 frame of the integers 0..15, labelled by state (rows) and by ordinal
# names (columns); it is the running example for the loc / iloc cells below.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [36]:
data.loc['Ohio', ['one', 'two']]

one    0
two    1
Name: Ohio, dtype: int32

In [37]:
data.loc[['Ohio', 'Colorado'], ['one','two']]

Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5


In [38]:
data.iloc[0, [0,1]]

one    0
two    1
Name: Ohio, dtype: int32

In [39]:
data.loc['Ohio':'Colorado', 'one':]  # slicing works with both loc and iloc; label slices with loc include both end points

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [40]:
data.iloc[0:2, 0:]  # integer-position slices with iloc exclude the stop position (rows 0 and 1 only)

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [41]:
# iloc[0:, 0:] keeps every row and column; the boolean mask then filters the rows
# to those where column 'two' is positive (all four rows in this data).
data.iloc[0:, 0:] [data.two >0]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [42]:
# Indexing, slicing and filtering in Dataframes and Series

In [43]:
data[['one', 'two', 'three']]

Unnamed: 0,one,two,three
Ohio,0,1,2
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [44]:
#Indexing like the above comes with special cases

In [45]:
data[0:2]  # a slice inside [] selects rows by position (the first two here); a single integer such as data[0] would instead be looked up as a column label and raise KeyError

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [46]:
data[data['one'] >5]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [47]:
data < 5  # element-wise comparison; the result is a DataFrame of booleans with the same shape

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [48]:
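#for Series, label-based slicing (e.g. ser['a':'c'] or loc) includes the end point, unlike integer-position slicing
#also remember that in NumPy basic slicing returns a view while fancy (list/boolean) indexing returns a copy; pandas does not guarantee either, so copy explicitly when it matters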

#### Integer Indexes 

In [49]:
ser = pd.Series(np.arange(3.))
# With an integer index, ser[-1] is looked up as the *label* -1 rather than as
# "the last position". No such label exists, so a KeyError is raised. The error
# is the point of this demonstration, so it is caught and printed to keep the
# notebook runnable from top to bottom.
try:
    ser[-1]
except KeyError as err:
    print("KeyError:", err)

KeyError: -1

In [50]:
#to be consistent when the index holds integer values, it is preferable to use loc (labels) or iloc (positions)

#### Apply and mapping 

In [51]:
# Random demo data drawn from a standard normal distribution; the values
# change from run to run unless a seed is set beforehand.
frame = pd.DataFrame(
    np.random.randn(4, 3),
    index=['a', 'b', 'c', 'd'],
    columns=['Utah', 'Ohio', 'Texas'],
)
frame

Unnamed: 0,Utah,Ohio,Texas
a,1.472283,-0.122449,-0.486621
b,0.398447,-0.998915,-2.408186
c,0.303132,-0.112025,0.547417
d,0.378413,0.480952,1.116706


In [52]:
def f(x):
    """Return the spread (maximum minus minimum) of ``x``.

    ``x`` must offer ``max()`` and ``min()`` methods, e.g. a pandas Series
    such as the ones DataFrame.apply passes in.
    """
    return x.max() - x.min()

In [53]:
frame.apply(f) # applies f per column 

Utah     1.169151
Ohio     1.479867
Texas    3.524892
dtype: float64

In [54]:
frame.apply(f, axis = 1)  # axis=1 (equivalently axis="columns") calls f once per row, i.e. across the columns

a    1.958904
b    2.806634
c    0.659442
d    0.738293
dtype: float64

In [55]:
#besides a scalar value, the function passed to apply() can also return a one-dimensional object, i.e. a Series

In [56]:
def f(x):
    """Summarise ``x`` as a Series holding its max, min and sum (in that order)."""
    stats = {'max': x.max(), 'min': x.min(), 'sum': x.sum()}
    return pd.Series(stats)

In [57]:
frame.apply(f)

Unnamed: 0,Utah,Ohio,Texas
max,1.472283,0.480952,1.116706
min,0.303132,-0.998915,-2.408186
sum,2.552275,-0.752437,-1.230685


In [58]:
#we can apply a function to every single element of a dataframe by using the applymap() method (renamed to DataFrame.map() in pandas 2.1, where applymap() is deprecated)

In [59]:
def round_to_two(x):
    """Format ``x`` with exactly two decimal places and return it as a string.

    Note that the result is text (e.g. "1.47"), not a rounded number.
    """
    return "%.2f" % x

In [60]:
frame.applymap(round_to_two)

Unnamed: 0,Utah,Ohio,Texas
a,1.47,-0.12,-0.49
b,0.4,-1.0,-2.41
c,0.3,-0.11,0.55
d,0.38,0.48,1.12


In [61]:
#for performing the same operation on a Series we use the map() method

In [62]:
frame["Utah"].map(round_to_two)

a    1.47
b    0.40
c    0.30
d    0.38
Name: Utah, dtype: object

#### Sorting and Ranking

In [63]:
# Values 0..3 with a deliberately unsorted index, so that sorting has a visible effect.
obj = pd.Series(range(4), index=list('bacd'))
obj

b    0
a    1
c    2
d    3
dtype: int64

In [64]:
obj.sort_index()  # sorts by index label; use sort_values() to sort by the values instead

a    1
b    0
c    2
d    3
dtype: int64

In [65]:
# 2x4 frame of the integers 0..7 whose column labels are not in alphabetical order.
frame = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    index=list('ab'),
    columns=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
frame

Unnamed: 0,Utah,Ohio,Texas,Oregon
a,0,1,2,3
b,4,5,6,7


In [66]:
frame.sort_index(axis=1)  # axis=1 sorts the columns by their labels

Unnamed: 0,Ohio,Oregon,Texas,Utah
a,1,3,2,0
b,5,7,6,4


In [67]:
frame.sort_index(axis = 1, ascending  = False)

Unnamed: 0,Utah,Texas,Oregon,Ohio
a,0,2,3,1
b,4,6,7,5


In [68]:
# Two integer columns; 'a' contains repeated values, which makes it useful
# for demonstrating sorting by more than one column.
frame = pd.DataFrame(dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1]))
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [69]:
frame.sort_values(by= ['b','a'])  # sort by the values of one or several columns: rows are ordered by 'b' first, with 'a' breaking ties

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


#### Descriptive statistics 

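options for reduction methods (sum, mean, etc.): 
  - axis (0, 1)
  - skipna (True by default; False lets NaN propagate into the result)
  - level (only for a MultiIndex; deprecated in pandas 1.3 and removed in 2.0 in favour of groupby(level=...))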

idxmax(), idxmin() can be used to find the index labels at which the maximum and minimum values occur in each column of the dataframe 

cumsum() computes the cumulative sum down each of the columns 


describe() gives us a general overview of the data within our dataframe (count, mean, standard deviation, min/max, percentiles)

corr() and cov() can be used in Series as well as a Dataframe