# Pandas DataFrame overview

- A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). The DataFrame has both a row index and a column index.

# DataFrame

In [5]:
import pandas as pd

# Two integer Series; with no index supplied each one gets the default
# positional labels 0..3.
apples = pd.Series(data=[3, 2, 0, 1])
oranges = pd.Series(data=[3, 4, 7, 8])

# A single print() call joins the two reprs with one space, which is why the
# second Series starts on the same line as the first one's dtype footer.
print(apples, oranges)


0    3
1    2
2    0
3    1
dtype: int64 0    3
1    4
2    7
3    8
dtype: int64


In [7]:
import pandas as pd

# Same two Series as before, both on the default 0..3 index.
apples = pd.Series(data=[3, 2, 0, 1])
oranges = pd.Series(data=[3, 4, 7, 8])

# A dict of Series becomes a DataFrame: every key is a column label and the
# rows are lined up by index label.
data = dict(apples=apples, oranges=oranges)
fruits_df = pd.DataFrame(data)
print(fruits_df)

   apples  oranges
0       3        3
1       2        4
2       0        7
3       1        8


# Keep in mind: index alignment

In [10]:
import pandas as pd

# The two Series deliberately use completely different index labels.
apples = pd.Series([3, 2, 0, 1], index=["a", "b", "c", "d"])
oranges = pd.Series([3, 2, 0, 1], index=["mon", "tue", "wed", "thr"])

# Building a DataFrame aligns the columns on index labels. No label is shared,
# so the result has one row per label from either Series (8 rows) and NaN
# wherever a column has no value for that label.
data = dict(apples=apples, oranges=oranges)
fruits_df = pd.DataFrame(data)
print(fruits_df)

     apples  oranges
a       3.0      NaN
b       2.0      NaN
c       0.0      NaN
d       1.0      NaN
mon     NaN      3.0
thr     NaN      1.0
tue     NaN      2.0
wed     NaN      0.0


In [11]:
import pandas as pd

# Both Series now share exactly the same index labels.
weekdays = ["mon", "tue", "wed", "thr"]
apples = pd.Series([3, 2, 0, 1], index=weekdays)
oranges = pd.Series([3, 2, 0, 1], index=weekdays)

# With identical labels every row lines up, so there are no NaN cells and the
# values stay integers.
data = dict(apples=apples, oranges=oranges)
fruits_df = pd.DataFrame(data)
print(fruits_df)

     apples  oranges
mon       3        3
tue       2        2
wed       0        0
thr       1        1


In [12]:
# Column data as plain Python lists of equal length (six rows).
state = ['Ohio'] * 3 + ['Nevada'] * 3
data = {
    'state': state,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

# `index=` supplies the row labels; it must have one label per row.
row_labels = ['1st', '2nd', '3rd', '4th', '5th', '6th']
state_pop_df = pd.DataFrame(data, index=row_labels)
print(state_pop_df)

      state  year  pop
1st    Ohio  2000  1.5
2nd    Ohio  2001  1.7
3rd    Ohio  2002  3.6
4th  Nevada  2001  2.4
5th  Nevada  2002  2.9
6th  Nevada  2003  3.2


In [13]:
# `columns=` picks which dict keys to use and fixes their left-to-right order.
# No `index=` is given here, so the rows get the default labels 0..5.
column_order = ['year', 'state', 'pop']
state_pop_df = pd.DataFrame(data, columns=column_order)
print(state_pop_df)

   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2


In [15]:
# 'debt' is not a key of `data`, so that column is created with missing values.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)

frame2.head()  # head() shows the first 5 rows by default

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


Displaying index and column names
-

In [16]:
# Column labels first, then row labels, one per line.
print(frame2.columns, frame2.index, sep="\n")

Index(['year', 'state', 'pop', 'debt'], dtype='object')
Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')


In [17]:
# Column labels of frame2. As the last expression in the cell, the value is
# shown as the cell's output without needing print().
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute:
-

Dictionary-like notation to access or extract a column
-

In [19]:
# Dictionary-like notation: indexing the DataFrame with a column label
# returns that column as a Series (here named 'state').

frame2["state"]

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

There is another method: attribute-style access (`frame2.state`)
-

In [20]:
# Attribute-style access returns the same Series as frame2["state"].
# It only works when the column name is a valid Python identifier that does not
# clash with an existing DataFrame attribute or method: frame2.pop, for
# example, is the DataFrame.pop method, not the 'pop' column.
frame2.state


one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [22]:
# .loc selects by index label; a single row label returns that row as a Series
# whose index is the frame's column labels.
row_two = frame2.loc['two']
print(row_two, end="\n\n")  # the extra newline leaves a blank line before the table

frame2.head()

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: two, dtype: object



Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


Columns can be modified by assignment. For example, the empty 'debt' column could be assigned a scalar value or an array of values:
-

In [23]:
# Assigning a scalar to a column broadcasts that value to every row.
frame2['debt'] = 16.5
frame2.head()

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [25]:
# Only four of frame2's six row labels appear in this Series.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7, 'six': 2.6})

# Assigning a Series aligns on index labels: rows 'one' and 'three' have no
# matching label in `val`, so their 'debt' ends up missing (NaN).
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,2.6


# Deleting data (row or column from dataframe)


In [33]:
import numpy as np
import pandas as pd
# np.arange(16) yields the consecutive integers 0..15 (these are not random
# values); reshape((4, 4)) lays them out row by row as a 4x4 grid.
grid = np.arange(16).reshape((4, 4))
data_df = pd.DataFrame(
    grid,
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data_df)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


Deleting columns from dataframe
-

In [34]:

# drop() removes labels along one axis. `columns=` is the keyword spelling of
# axis=1 / axis='columns', so this removes the COLUMN labelled 'two' (no rows
# are deleted). drop() returns a new DataFrame, which is assigned back to
# data_df.
data_df = data_df.drop(columns='two')
print(data_df)


          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15


In [30]:
# Remove the column labelled 'three'. The result of drop() is assigned back to
# the name instead of using inplace=True, matching how column 'two' was dropped
# above; inplace=True gives no performance benefit and prevents chaining.
data_df = data_df.drop(columns='three')
print(data_df)

          one  four
Ohio        0     3
Colorado    4     7
Utah        8    11
New York   12    15


Deleting specific rows from DataFrame
-

In [31]:
# Display the current frame (a bare last expression is rendered as the cell output).
data_df

Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11
New York,12,15


In [35]:
# Remove the ROW labelled 'Utah' (`index=` is the keyword spelling of axis=0).
# The result is assigned back rather than mutated with inplace=True, consistent
# with the column drops earlier in the notebook.
data_df = data_df.drop(index='Utah')

In [36]:
# Print the frame after the 'Utah' row has been dropped.
print(data_df)

          one  four
Ohio        0     3
Colorado    4     7
New York   12    15


# Selection with loc and iloc

In [37]:
import numpy as np
import pandas as pd

# Rebuild the 4x4 frame of the integers 0..15 so the selection examples start
# from all four rows and all four columns.
state_labels = ['Ohio', 'Colorado', 'Utah', 'New York']
number_labels = ['one', 'two', 'three', 'four']
data_df = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=state_labels,
    columns=number_labels,
)
print(data_df)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [38]:
# .loc selects by label: row labels first, then column labels.
# Several labels are passed as a list, and the result follows the order of
# that list ('Colorado' comes before 'Ohio' in the output).
row_labels = ['Colorado', 'Ohio']
column_labels = ['two', 'three']

print(data_df.loc[row_labels, column_labels])

          two  three
Colorado    5      6
Ohio        1      2


In [39]:
# .iloc selects by integer position instead of by label, for rows and columns
# alike: rows from position 2 to the end, then the columns at positions 3, 0
# and 1 in exactly that order.
print( data_df.iloc[2:, [3, 0, 1] ] )


          four  one  two
Utah        11    8    9
New York    15   12   13


In [40]:
# A bare ':' slice keeps every row; with no column indexer all columns are kept too.
print( data_df.iloc[:])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [41]:
# First three rows and first three columns (positions 0-2; the stop position 3
# is excluded, as in ordinary Python slicing).
print ( data_df.iloc[ :3 , :3 ] )


          one  two  three
Ohio        0    1      2
Colorado    4    5      6
Utah        8    9     10


In [42]:
# All rows, first three columns.
print( data_df.iloc[:, :3])

          one  two  three
Ohio        0    1      2
Colorado    4    5      6
Utah        8    9     10
New York   12   13     14


# Arithmetic methods with fill values


In [45]:
# A float argument to np.arange gives float values, so df1 holds 0.0..11.0
# (3 rows x 4 columns) and df2 holds 0.0..19.0 (4 rows x 5 columns).
# df2 therefore has one extra row (label 3) and one extra column ('e').
df1 = pd.DataFrame(
    np.arange(12.0).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(20.0).reshape(4, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)
print(df1)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0


In [46]:
# df2 has one more row (label 3) and one more column ('e') than df1.
print(df2)

      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [48]:
# Blank out a single cell of df2 before adding. The outputs recorded for this
# cell and for the add(fill_value=0) cell (NaN, respectively 5.0, at row 1 /
# column 'b') are only reproducible when this value is missing, but no cell in
# the notebook set it, so a fresh "Restart & Run All" produced 11.0 instead.
df2.loc[1, 'b'] = np.nan

# `+` aligns on both row and column labels. Any cell that is missing in either
# operand comes out as NaN: row 3 and column 'e' exist only in df2, and
# [1, 'b'] is NaN in df2.
df3 = df1 + df2
print(df3)

      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


In [49]:
# DataFrame.add() is the method form of `+` and accepts fill_value.
# With fill_value=0, a cell that is missing in only ONE of the two frames is
# treated as 0 before the addition, so the other frame's value comes through
# (for example column 'e' and row 3, which exist only in df2).
# Per the pandas documentation, a cell missing in BOTH frames stays NaN.
print("addition using a method with replacing Nan with 0")
df3 = df1.add(df2, fill_value=0)
print(df3)

addition using a method with replacing Nan with 0
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0
