### Pandas - To explore datasets

- DataFrames are similar to SQL tables or Excel spreadsheets.
- A Series is a single row or a single column of a DataFrame.
- Operations in Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a small demo DataFrame to explore.
#   data    : the integers 0..19 arranged as a 5 x 4 grid
#   index   : labels for the 5 rows    ('Row1' .. 'Row5')
#   columns : labels for the 4 columns ('Column1' .. 'Column4')
dataFrame = pd.DataFrame(
    data=np.arange(20).reshape(5, 4),
    index=[f'Row{n}' for n in range(1, 6)],
    columns=[f'Column{n}' for n in range(1, 5)],
)

In [3]:
dataFrame.head() # Returns the first rows of the frame; with no argument, n defaults to 5

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [4]:
dataFrame.head(2) # Returns only the first 2 rows

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7


In [6]:
dataFrame.head(-3) # Negative n: returns every row except the last 3 (here, the first 2 of 5)

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7


In [6]:
# Write the DataFrame to a CSV file.
# 'Test.csv' is a relative path, so the file is created in the current working directory.
# By default the row labels (index) are written as the first column.
dataFrame.to_csv('Test.csv')

In [8]:
# Retrieve elements
# .loc selects by label (row label & column label), not by integer position

# Retrieve the first row by its label
# Selecting a single row returns a Series
dataFrame.loc['Row1']

Column1    0
Column2    1
Column3    2
Column4    3
Name: Row1, dtype: int32

In [9]:
# Check the type of a single-row selection
type(dataFrame.loc['Row1']) # A single row (or a single column) is a Series

pandas.core.series.Series

In [23]:
# Retrieve several rows at once
# Passing a list of labels (note the double brackets [[...]]) returns a DataFrame

dataFrame.loc[['Row1','Row3']]

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row3,8,9,10,11


In [14]:
# Confirm that selecting with a list of row labels yields a DataFrame, not a Series
type(dataFrame.loc[['Row1','Row3']])

pandas.core.frame.DataFrame

In [24]:
# Retrieve a single value by its row label and column label

dataFrame.loc['Row1','Column3']

2

In [25]:
# Retrieve one column's values for a range of rows
# Label slices with .loc include BOTH endpoints, so 'Row2' is part of the result

dataFrame.loc['Row1':'Row2','Column3']

Row1    2
Row2    6
Name: Column3, dtype: int32

In [45]:
# Retrieve rows based on a condition
# The comparison builds a boolean mask; .loc keeps the rows where Column1 > 5

dataFrame.loc[dataFrame['Column1']>5]

Unnamed: 0,Column1,Column2,Column3,Column4
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [9]:
# .iloc selects by integer position; [:, :] means all rows and all columns
dataFrame.iloc[:,:]

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [46]:
# Retrieve data by integer position using iloc slices
# Rows 0-1 and column 0 only: the end of an iloc slice is exclusive

dataFrame.iloc[0:2,0:1]

Unnamed: 0,Column1
Row1,0
Row2,4


In [11]:
type(dataFrame.iloc[0:2,0:1]) # Slicing both axes returns a DataFrame, even though only one column is selected here

pandas.core.frame.DataFrame

In [14]:
# Retrieve all rows, from the second column (position 1) onwards

dataFrame.iloc[:,1:]

Unnamed: 0,Column2,Column3,Column4
Row1,1,2,3
Row2,5,6,7
Row3,9,10,11
Row4,13,14,15
Row5,17,18,19


In [15]:
# Convert the selection into a NumPy array: .values returns the data without row/column labels
dataFrame.iloc[:,1:].values

array([[ 1,  2,  3],
       [ 5,  6,  7],
       [ 9, 10, 11],
       [13, 14, 15],
       [17, 18, 19]])

In [16]:
# Check the shape of that array as (rows, columns)
dataFrame.iloc[:,1:].values.shape

(5, 3)

In [17]:
# Check for null values in a DataFrame: returns a same-shaped frame of True/False flags
dataFrame.isnull()

Unnamed: 0,Column1,Column2,Column3,Column4
Row1,False,False,False,False
Row2,False,False,False,False
Row3,False,False,False,False
Row4,False,False,False,False
Row5,False,False,False,False


In [19]:
dataFrame.isnull().sum() # Number of null values in each column

Column1    0
Column2    0
Column3    0
Column4    0
dtype: int64

In [22]:
dataFrame['Column2'].value_counts() # How many times each distinct value occurs in Column2

13    1
5     1
17    1
9     1
1     1
Name: Column2, dtype: int64

In [23]:
# unique() returns the distinct values of the column as an array
dataFrame['Column2'].unique()

array([ 1,  5,  9, 13, 17])

In [24]:
# Select a single column by name; the result is a Series
dataFrame['Column4']

Row1     3
Row2     7
Row3    11
Row4    15
Row5    19
Name: Column4, dtype: int32

In [25]:
# Select several columns with a list of names; the result is a DataFrame
dataFrame[['Column3','Column4']]

Unnamed: 0,Column3,Column4
Row1,2,3
Row2,6,7
Row3,10,11
Row4,14,15
Row5,18,19
