In [2]:
import numpy as np
import pandas as pd

## The DataFrame 
It is a tabular data structure very similar to a spreadsheet. This data structure is designed to extend series to multiple dimensions. It consists of an ordered collection of columns, each of which can contain a value of a different type.
<br><i>A dataframe may also be understood as a dict of series, where the keys are the
column names and the values are the series that will form the columns of the dataframe.</i></br>

## Defining a DataFrame

In [3]:
data = {'color' : ['blue','green','yellow','red','white'], 'object' : ['ball','pen','pencil','paper','mug'], 'price' : [1.2,1.0,0.6,0.9,1.7]}

In [4]:
frame = pd.DataFrame(data)
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [5]:
# Selecting custom columns from the dict object into the dataframe.
frame2 = pd.DataFrame(data,columns=['object','price'])
frame2

Unnamed: 0,object,price
0,ball,1.2
1,pen,1.0
2,pencil,0.6
3,paper,0.9
4,mug,1.7


In [6]:
# Custom index
frame2 = pd.DataFrame(data, index=['one','two','three','four','five'])
frame2

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


## Selecting Elements

In [7]:
# To know the names of the columns
frame.columns

Index(['color', 'object', 'price'], dtype='object')

In [8]:
frame.index

RangeIndex(start=0, stop=5, step=1)

In [9]:
frame.values

array([['blue', 'ball', 1.2],
       ['green', 'pen', 1.0],
       ['yellow', 'pencil', 0.6],
       ['red', 'paper', 0.9],
       ['white', 'mug', 1.7]], dtype=object)

In [10]:
# If only interested in selecting contents of a column:
frame['price']

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [11]:
# Column name can be passed as an attribute as well
frame.price

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [12]:
# For rows within a dataframe, it is possible to use loc attribute with the index value of the row that you want to extract
frame.loc[3]

color       red
object    paper
price       0.9
Name: 3, dtype: object

In [15]:
# Selecting multiple rows:
frame.loc[[2,4]]

Unnamed: 0,color,object,price
2,yellow,pencil,0.6
4,white,mug,1.7


In [17]:
# Extracting a portion of a DataFrame (slicing)
frame[0:1]

Unnamed: 0,color,object,price
0,blue,ball,1.2


In [18]:
frame[1:4]

Unnamed: 0,color,object,price
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9


In [19]:
# If a single value is to be achieved from the DataFrame then first the name of the column is to be given and then the 
# index or label of the row
frame['price'][3]

0.9

In [20]:
frame['object'][1]

'pen'

## Assigning Values

In [21]:
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [22]:
# The name attribute here is used to assign a label to the substructures('index' and 'columns' namely) within the DataFrame
frame.index.name = 'id'
frame.columns.name = 'item'
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [23]:
# Adding a new column to an existing DataFrame. Simply assign a value to the instance of the dataframe and specifying 
# a new column name.
frame['new'] = 12
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,12
1,green,pen,1.0,12
2,yellow,pencil,0.6,12
3,red,paper,0.9,12
4,white,mug,1.7,12


In [26]:
# To update an entire column, you have to use an array
frame['new'] = np.random.random(5)
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0.546939
1,green,pen,1.0,0.252947
2,yellow,pencil,0.6,0.6625
3,red,paper,0.9,0.441759
4,white,mug,1.7,0.793642


## Creating a new column using Series data structure

In [27]:
ser = pd.Series(np.arange(5))

In [28]:
ser

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [29]:
frame['new'] = ser
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,1.0,1
2,yellow,pencil,0.6,2
3,red,paper,0.9,3
4,white,mug,1.7,4


In [32]:
# To change a single value
frame['price'][2] = 0.5
frame

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame['price'][2] = 0.5


item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,1.0,1
2,yellow,pencil,0.5,2
3,red,paper,0.9,3
4,white,mug,1.7,4


## Membership of a Value

In [33]:
frame.isin(['pencil',1.7])

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,False,False,False
1,False,False,False,False
2,False,True,False,False
3,False,False,False,False
4,False,False,True,False


In [36]:
# If you pass the value returned as a condition, then you will get a new DataFrame containing only the values that satisfy 
# the condition 
frame[frame.isin([1,'yellow'])]

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,,
1,,,1.0,1.0
2,yellow,,,
3,,,,
4,,,,


## Deleting a Column

In [37]:
# To delete an entire column and all of its contents use the 'del' command
del frame['new']

In [38]:
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.5
3,red,paper,0.9
4,white,mug,1.7


## Filtering
You can apply the filtering through the application of certain conditions.

In [43]:
# For example, to get all the values smaller than a certain number
frame[frame['price'] < 1.2]

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,green,pen,1.0
2,yellow,pencil,0.5
3,red,paper,0.9


## DataFrame from Nested dict
This data structure, when it is passed directly as an argument to the DataFrame() constructor, will be interpreted by pandas to treat external keys as column names and internal keys as labels for the indexes.<p>During the interpretation of the nested structure, it is possible that not all fields will find a successful match. pandas compensates for this inconsistency by adding the NaN value to missing values.</p>

In [44]:
nestdict = { 'red': { 2012: 22, 2013: 33 }, 
            'white': { 2011: 13, 2012: 22, 2013: 16},
            'blue': {2011: 17, 2012: 27, 2013: 18}}

In [45]:
frame2 = pd.DataFrame(nestdict)
frame2

Unnamed: 0,red,white,blue
2012,22.0,22,27
2013,33.0,16,18
2011,,13,17


## Transposition of a DataFrame
To get the transposition of the DataFrame, just add the T attribute to its application.

In [46]:
frame2.T

Unnamed: 0,2012,2013,2011
red,22.0,33.0,
white,22.0,16.0,13.0
blue,27.0,18.0,17.0
