# DataFrames

Pandas DataFrame is two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). A Data frame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns. Pandas DataFrame consists of three principal components, the data, rows, and columns.

In [1]:
import pandas as pd
import numpy as np

In [2]:
record1 = pd.Series({"Name":"Alice",
                    "Class" : "Physics",
                    "Score" : 85})

record2 = pd.Series({"Name":"Jack",
                    "Class" : "Chemistry",
                    "Score" : 82})

record3 = pd.Series({"Name":"Helen",
                    "Class" : "Biology",
                    "Score" : 90})

In [3]:
# Creating a dataframe

df = pd.DataFrame([record1,record2,record3],
                 index = ["School1","School2","School1"])
df

Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School2,Jack,Chemistry,82
School1,Helen,Biology,90


In [4]:
# Looking at top 2 Lines (default is 5)

df.head(2)

Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School2,Jack,Chemistry,82


In [5]:
# Negative value of n removes those specific rows i.e is equivalent to df[:n,:] or simply df[:n]

df.head(-1)

Unnamed: 0,Name,Class,Score
School1,Alice,Physics,85
School2,Jack,Chemistry,82


### Querying a DataFrame

In [6]:
# Similar to Series, iloc and loc can be used to query dataframes


# Passing a single parameter assumes it for row
print("Original df: \n\n",df,"\n\n")

print("Using df.iloc[0] i.e just first row: \n\n",df.iloc[0],"\n\n")

print("Using df.loc['School2'] i.e just second row: \n\n",df.loc["School2"],"\n\n")

print("Using df.loc['School1'] i.e First and Third rows: \n\n",df.loc["School1"],"\n\n")

Original df: 

           Name      Class  Score
School1  Alice    Physics     85
School2   Jack  Chemistry     82
School1  Helen    Biology     90 


Using df.iloc[0] i.e just first row: 

 Name       Alice
Class    Physics
Score         85
Name: School1, dtype: object 


Using df.loc['School2'] i.e just second row: 

 Name          Jack
Class    Chemistry
Score           82
Name: School2, dtype: object 


Using df.loc['School1'] i.e First and Third rows: 

           Name    Class  Score
School1  Alice  Physics     85
School1  Helen  Biology     90 




In [7]:
# If passing two parameters to iloc or loc, the first one is the row and the second one is the column


print("Original df: \n\n",df,"\n\n")

print("Using df.iloc[0,0] i.e singular element: \n\n",df.iloc[0],"\n\n")

print("Using df.iloc[:,0] i.e. only the entire first column: \n\n",df.iloc[:,0],"\n\n")

print("Using df.loc['School2','Name'] i.e just a single name: \n\n",df.loc["School2","Name"],"\n\n")

print("Using df.loc['School1',:] i.e all columns related to School1: \n\n",df.loc["School1",:],"\n\n")

print("Using df.loc[:,['Name','Score']] i.e all columns related to School1: \n\n",df.loc[:,["Name","Score"]],"\n\n")

Original df: 

           Name      Class  Score
School1  Alice    Physics     85
School2   Jack  Chemistry     82
School1  Helen    Biology     90 


Using df.iloc[0,0] i.e singular element: 

 Name       Alice
Class    Physics
Score         85
Name: School1, dtype: object 


Using df.iloc[:,0] i.e. only the entire first column: 

 School1    Alice
School2     Jack
School1    Helen
Name: Name, dtype: object 


Using df.loc['School2','Name'] i.e just a single name: 

 Jack 


Using df.loc['School1',:] i.e all columns related to School1: 

           Name    Class  Score
School1  Alice  Physics     85
School1  Helen  Biology     90 


Using df.loc[:,['Name','Score']] i.e all columns related to School1: 

           Name  Score
School1  Alice     85
School2   Jack     82
School1  Helen     90 




In [8]:
# If not using iloc/loc, then pandas indexing operator works on column names


print("Original df: \n\n",df,"\n\n")

print("Using df['Name'] i.e entire Name column: \n\n",df["Name"],"\n\n")

Original df: 

           Name      Class  Score
School1  Alice    Physics     85
School2   Jack  Chemistry     82
School1  Helen    Biology     90 


Using df['Name'] i.e entire Name column: 

 School1    Alice
School2     Jack
School1    Helen
Name: Name, dtype: object 




In [9]:
# The combination of iloc/loc and the pandas indexing operator is also possible


print("Original df: \n\n",df,"\n\n")

print("Using df.iloc[0]['Name'] i.e Name for first row: \n\n",df.iloc[0]["Name"],"\n\n")

print("Using df.loc['School1']['Name'] i.e Name for School1 students: \n\n",df.loc["School1"]["Name"],"\n\n")

Original df: 

           Name      Class  Score
School1  Alice    Physics     85
School2   Jack  Chemistry     82
School1  Helen    Biology     90 


Using df.iloc[0]['Name'] i.e Name for first row: 

 Alice 


Using df.loc['School1']['Name'] i.e Name for School1 students: 

 School1    Alice
School1    Helen
Name: Name, dtype: object 




### Dropping Data

Remove rows or columns by specifying label names and corresponding axis, or by specifying directly index or column names. 
<br><br>
Syntax : ***DataFrame.drop(labels=None, *, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')***
<br>
<br>
This only gives a copy of the data and does not update the original dataframe. In order to update the actual dataframe, set ***inplace = True***

In [10]:
midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
                             ['speed', 'weight', 'length']],
                     codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
                            [0, 1, 2, 0, 1, 2, 0, 1, 2]])
df = pd.DataFrame(index=midx, columns=['big', 'small'],
                  data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
                        [250, 150], [1.5, 0.8], [320, 250],
                        [1, 0.8], [0.3, 0.2]])
df

Unnamed: 0,Unnamed: 1,big,small
lama,speed,45.0,30.0
lama,weight,200.0,100.0
lama,length,1.5,1.0
cow,speed,30.0,20.0
cow,weight,250.0,150.0
cow,length,1.5,0.8
falcon,speed,320.0,250.0
falcon,weight,1.0,0.8
falcon,length,0.3,0.2


In [11]:
# By default, parameters are for rows only

df.drop("lama")

Unnamed: 0,Unnamed: 1,big,small
cow,speed,30.0,20.0
cow,weight,250.0,150.0
cow,length,1.5,0.8
falcon,speed,320.0,250.0
falcon,weight,1.0,0.8
falcon,length,0.3,0.2


In [12]:
# In order to drop columns, the axis needs to be changed

df.drop("big",axis=1)

# or df.drop(columns=["big"])
# df.drop("big") wouldnt work

Unnamed: 0,Unnamed: 1,small
lama,speed,30.0
lama,weight,100.0
lama,length,1.0
cow,speed,20.0
cow,weight,150.0
cow,length,0.8
falcon,speed,250.0
falcon,weight,0.8
falcon,length,0.2


In [13]:
df.drop(index='cow', columns='small')

Unnamed: 0,Unnamed: 1,big
lama,speed,45.0
lama,weight,200.0
lama,length,1.5
falcon,speed,320.0
falcon,weight,1.0
falcon,length,0.3


In [14]:
df.drop(index='lama', level=0)

Unnamed: 0,Unnamed: 1,big,small
cow,speed,30.0,20.0
cow,weight,250.0,150.0
cow,length,1.5,0.8
falcon,speed,320.0,250.0
falcon,weight,1.0,0.8
falcon,length,0.3,0.2


In [15]:
df.drop(index='length', level=1)

Unnamed: 0,Unnamed: 1,big,small
lama,speed,45.0,30.0
lama,weight,200.0,100.0
cow,speed,30.0,20.0
cow,weight,250.0,150.0
falcon,speed,320.0,250.0
falcon,weight,1.0,0.8
