In [1]:
import numpy as np
import pandas as pd  

In [2]:
# Create an Empty DataFrame
dt=pd.DataFrame()
dt

## Create a DataFrame from Lists
The DataFrame can be created using a single list or a list of lists

In [97]:
data=[4,5,6,7,8]
dt=pd.DataFrame(data)
dt

Unnamed: 0,0
0,4
1,5
2,6
3,7
4,8


In [99]:
# Example 2
data=[['anjali',19],['anuj',20],['alex',20]]
dt=pd.DataFrame(data,columns=['name','age'])
dt

Unnamed: 0,name,age
0,anjali,19
1,anuj,20
2,alex,20


In [6]:
# example 3
data=[['anjali',19],['anuj',20],['alex',20]]
dt=pd.DataFrame(data,columns=['name','age'],dtype=float)
dt

Unnamed: 0,name,age
0,anjali,19.0
1,anuj,20.0
2,alex,20.0


In [7]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    3 non-null      object 
 1   age     3 non-null      float64
dtypes: float64(1), object(1)
memory usage: 176.0+ bytes


## Create a DataFrame from Dict of ndarrays / Lists
All the ndarrays must be of same length. If index is passed, then the length of the index should equal to the length of the arrays.
If no index is passed, then by default, index will be range(n), where n is the array length.

In [101]:
data={'Name':['tom','alan','jack','rose'],'Age':[28,32,50,45]}
dt=pd.DataFrame(data)
dt

Unnamed: 0,Name,Age
0,tom,28
1,alan,32
2,jack,50
3,rose,45


In [102]:
# Example
# Let us now create an indexed DataFrame using arrays.
data={'Name':['tom','alan','jack','rose'],'Age':[28,32,50,45]}
dt=pd.DataFrame(data,index=['rank1','rank2','rank3','rank4'])
dt

Unnamed: 0,Name,Age
rank1,tom,28
rank2,alan,32
rank3,jack,50
rank4,rose,45


## Create a DataFrame from List of Dicts
List of Dictionaries can be passed as input data to create a DataFrame. The dictionary keys are by default taken as column names.

In [104]:
data=[{'a':2,'b':8},{'a':6,'b':7,'c':20}]
dt=pd.DataFrame(data)
dt

Unnamed: 0,a,b,c
0,2,8,
1,6,7,20.0


In [112]:
# Example2
# The following example shows how to create a DataFrame by passing a list of dictionaries and the row indices.
data=[{'a':2,'b':8,'c':9},{'a':6,'b':7,'c':20}]
dt=pd.DataFrame(data,index=['first','second'])
dt

Unnamed: 0,a,b,c
first,2,8,9
second,6,7,20


In [2]:
# First we create a DataFrame using all defaults, and then another where we pass index and column labels

df=pd.DataFrame(np.arange(30,50).reshape(5,4))
df

Unnamed: 0,0,1,2,3
0,30,31,32,33
1,34,35,36,37
2,38,39,40,41
3,42,43,44,45
4,46,47,48,49


In [3]:
df1=pd.DataFrame(data=np.arange(20,50).reshape(5,6),columns=list('abcdef'),index=list('vwxyz'))
df1

Unnamed: 0,a,b,c,d,e,f
v,20,21,22,23,24,25
w,26,27,28,29,30,31
x,32,33,34,35,36,37
y,38,39,40,41,42,43
z,44,45,46,47,48,49


## Using a Dict of equal length Lists 
The keys of the dictionary will be used as column names, the values will form the data in the table. We can optionally provide a list of strings to be used as the index (or row labels.)


In [10]:
x={'ints':np.arange(5),
   'floats':np.arange(0.1,0.6,0.1),
   'strings':list('abcde')}
x

{'ints': array([0, 1, 2, 3, 4]),
 'floats': array([0.1, 0.2, 0.3, 0.4, 0.5]),
 'strings': ['a', 'b', 'c', 'd', 'e']}

In [11]:
df2=pd.DataFrame(x,index=list('abcde'))
df2

Unnamed: 0,ints,floats,strings
a,0,0.1,a
b,1,0.2,b
c,2,0.3,c
d,3,0.4,d
e,4,0.5,e


## DataFrame Attributes 
Some of the most commonly used ones are – index, columns, dtypes, shape, info


In [6]:
# Get row labels
df2.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [7]:
# DataFrame Attributes
df2.columns

Index(['ints', 'floats', 'strings'], dtype='object')

In [8]:
# Get data types for each column 
df2.dtypes

ints         int32
floats     float64
strings     object
dtype: object

In [9]:
# Get number of rows, columns 
df2.shape

(5, 3)

In [10]:
#  Get overview of the dataset 
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 3 columns):
ints       5 non-null int32
floats     5 non-null float64
strings    5 non-null object
dtypes: float64(1), int32(1), object(1)
memory usage: 140.0+ bytes


## Subsetting DataFrames 
Pandas allows us to subset DataFrames in a variety of ways to extract from a given dataset a value a row or  a column (returns a Series) multiple rows/columns (returns a subset of the dataframe )
Let's look at a few examples of these


## Selecting a single Row/Column 
For a DataFrame, basic indexing selects the columns.  An individual column can be retrieved as a Series using (a) the square bracket accessor (b) the dot accessor, or (c) one of the accessors - loc, iloc, ix etc.


In [11]:
# the dataframe
df2

Unnamed: 0,ints,floats,strings
a,0,0.1,a
b,1,0.2,b
c,2,0.3,c
d,3,0.4,d
e,4,0.5,e


In [12]:
# Using a column label with the square bracket accessor
df2['floats']

a    0.1
b    0.2
c    0.3
d    0.4
e    0.5
Name: floats, dtype: float64

In [13]:
df2['strings']

a    a
b    b
c    c
d    d
e    e
Name: strings, dtype: object

In [14]:
# Using a column label with the dot accessor 
df2.ints

a    0
b    1
c    2
d    3
e    4
Name: ints, dtype: int32

In [15]:
#  Using .loc and a column label 
df2.loc[:,'strings']

a    a
b    b
c    c
d    d
e    e
Name: strings, dtype: object

In [16]:
df2.loc[:,'floats']

a    0.1
b    0.2
c    0.3
d    0.4
e    0.5
Name: floats, dtype: float64

In [17]:
# Using .iloc and a column position 
df2.iloc[1]

ints         1
floats     0.2
strings      b
Name: b, dtype: object

In [18]:
df2.iloc[2]

ints         2
floats     0.3
strings      c
Name: c, dtype: object

In [19]:
df2.iloc[:,2]

a    a
b    b
c    c
d    d
e    e
Name: strings, dtype: object

In [20]:
#  Using .loc and a row label
df2.loc['b']

ints         1
floats     0.2
strings      b
Name: b, dtype: object

In [21]:
df2.iloc[4]

ints         4
floats     0.5
strings      e
Name: e, dtype: object

## Selecting 2 or more rows/columns 
This can be accomplished by passing a list of column labels to the double square bracket accessor like [[list-of-columns]] passing a list or slice of row/column labels/positions to loc, iloc, ix passing a boolean series to loc, iloc, ix for selecting particular rows/columns
Each time we subset 2 or more rows/columns from a DataFrame, the result will be a DataFrame.

In [8]:
#  Create a new DataFrame
df3=pd.DataFrame(np.random.randint(0,50,20).reshape(5,4),index=list('abcde'),columns=list('pqrs'))
df3

Unnamed: 0,p,q,r,s
a,6,2,31,40
b,26,24,27,43
c,32,7,13,29
d,19,35,36,15
e,36,1,24,21


In [23]:
#  Subset multiple columns using [[]] 
df3[['p','r']]

Unnamed: 0,p,r
a,16,17
b,31,42
c,1,43
d,31,24
e,4,15


In [24]:
df3.loc[:,['r','s']]

Unnamed: 0,r,s
a,17,44
b,42,7
c,43,44
d,24,16
e,15,24


In [94]:
df3.loc[:,['q','s']]   # df4.loc[:, ['R':'T']] 

Unnamed: 0,q,s
a,,
b,15.0,7.0
c,,
d,32.0,16.0
e,,


In [26]:
#  Subset multiple columns using .iloc 
df3.iloc[:,2]

a    17
b    42
c    43
d    24
e    15
Name: r, dtype: int32

In [27]:
df3.iloc[2]   # row values

p     1
q    23
r    43
s    44
Name: c, dtype: int32

In [28]:
df3.iloc[:,1:]

Unnamed: 0,q,r,s
a,10,17,44
b,15,42,7
c,23,43,44
d,32,24,16
e,16,15,24


In [29]:
# Select multiple rows using [] 
df3['a':'c']

Unnamed: 0,p,q,r,s
a,16,10,17,44
b,31,15,42,7
c,1,23,43,44


In [30]:
#  Subset multiple rows using .loc 
df3.loc['b':'d',:]

Unnamed: 0,p,q,r,s
b,31,15,42,7
c,1,23,43,44
d,31,32,24,16


In [31]:
# or df3.loc[['B', 'C', 'D'], :] 
df3.loc[['a','d'],:]

Unnamed: 0,p,q,r,s
a,16,10,17,44
d,31,32,24,16


In [32]:
df3

Unnamed: 0,p,q,r,s
a,16,10,17,44
b,31,15,42,7
c,1,23,43,44
d,31,32,24,16
e,4,16,15,24


In [33]:
#  Subset multiple rows using .iloc 
df3.iloc[:,3]

a    44
b     7
c    44
d    16
e    24
Name: s, dtype: int32

In [34]:
df3.iloc[4,:]

p     4
q    16
r    15
s    24
Name: e, dtype: int32

In [35]:
#  Mixed subsetting with ix
df3.ix[0:2,'p':'q']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,p,q
a,16,10
b,31,15


In [36]:
df3.ix[3,['r','s']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


r    24
s    16
Name: d, dtype: int32

In [37]:
df3.ix[3:,['q','r']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,q,r
d,32,24
e,16,15


For a beginner, so many ways of subsetting data may seem intimidating at first and confusing at worst. The prudent thing to do here would be to pick your favorite method of subsetting data from a DataFrame, and stick to it. If you find your more comfortable using ix over loc, iloc , by all means, use that method. Remember that these are just tools and your focus should be more on the analysis and less on the selection of the right tool for the job. 
## Adding/Removing/Renaming Columns or Rows
 
The DataFrame methods assign, drop and rename come in handy for these tasks. New variables may  be created by simply using a column label that doesn't exist in our data with the square bracket(or the loc ) accessor. This method permanently adds a new column to the data. assign is for creating new variables on the fly, or for deriving new columns from existing ones. This method returns a copy of the DataFrame, so you should overwrite the original if you want to retain the created column in subsequent operations. Note the peculiar syntax in the code box below (lambda functions are required)

In [38]:
# method for creating permanent new columns
df3['t']=df3['p']+df3['q']

In [39]:
df3

Unnamed: 0,p,q,r,s,t
a,16,10,17,44,26
b,31,15,42,7,46
c,1,23,43,44,24
d,31,32,24,16,63
e,4,16,15,24,20


In [40]:
# assign for creating columns on the fly 
df3.assign(u=lambda x:x['p']+x['q'])


Unnamed: 0,p,q,r,s,t,u
a,16,10,17,44,26,26
b,31,15,42,7,46,46
c,1,23,43,44,24,24
d,31,32,24,16,63,63
e,4,16,15,24,20,20


#
drop returns a copy of the DataFrame after deleting the rows and columns specified (as a list of index or column labels). The axis= parameter controls which axis (row or column) we want to drop the Series from.  The inplace= parameter decides whether the change must be made permanent. Also see dropna , which helps us get rid of rows with missing data.


In [41]:
# The DataFrame
df3

Unnamed: 0,p,q,r,s,t
a,16,10,17,44,26
b,31,15,42,7,46
c,1,23,43,44,24
d,31,32,24,16,63
e,4,16,15,24,20


In [42]:
#  Dropping a single row (axis=0 is set by default) 
df3.drop('e')

Unnamed: 0,p,q,r,s,t
a,16,10,17,44,26
b,31,15,42,7,46
c,1,23,43,44,24
d,31,32,24,16,63


In [43]:
#  We can drop multiple columns by providing a list of labels
df3.drop(['a','c'])

Unnamed: 0,p,q,r,s,t
b,31,15,42,7,46
d,31,32,24,16,63
e,4,16,15,24,20


In [44]:
#  This has the same effect; axis=0 is default (can be omitted) 
df3.drop(['b','e'],axis=0)

Unnamed: 0,p,q,r,s,t
a,16,10,17,44,26
c,1,23,43,44,24
d,31,32,24,16,63


In [45]:
df3.drop(['p','r'],axis=1)   # Pass axis=1 to drop columns 

Unnamed: 0,q,s,t
a,10,44,26
b,15,7,46
c,23,44,24
d,32,16,63
e,16,24,20


# 
rename takes a DataFrame as input and a dictionary that maps old names to new names for columns. This method is particularly useful right in the beginning of data analysis, as sometimes when we get data we find that the column names are all messed up (have spaces or unwanted characters in them.) [Pro Tip] Use dictionary comprehensions to create the substitution dict.
 

In [46]:
df3.rename(columns={'r':'r_new','s':'s_new'})

Unnamed: 0,p,q,r_new,s_new,t
a,16,10,17,44,26
b,31,15,42,7,46
c,1,23,43,44,24
d,31,32,24,16,63
e,4,16,15,24,20


## Math/Stats Operations 
These sets of methods is one of the major reasons why so many people love pandas for data wrangling. By default, when you call a mathematical operation (like sum )  or a statistical operation (like std ) on a DataFrame the results are produced for all numeric columns. Other languages like R required you to either use an apply function or to use one of their specialized functions like colMeans, rowMeans . Pandas, however, requires that you only pass the axis= parameter to control whether math/stat summaries should be produced for rows or columns. These methods also take a skipna= parameter that signals whether to exclude missing data ( True by default.)

In [47]:
# Column sums
df3.sum(axis=0)

p     83
q     96
r    141
s    135
t    179
dtype: int64

In [48]:
# Row sums
df3.sum(axis=1)

a    113
b    141
c    135
d    166
e     79
dtype: int64

#
Try the above with other methods like mean, std, var to produce statistical summaries of your data. Below is a list of all math/stat methods available to objects of the DataFrame class.

## The describe() method 
One method, however, stands apart from the rest in its usefulness. The .describe() method applied to a DataFrame wil produce summary statistics for all numeric variables in the data and return the result in a neat DataFrame. Note that here too, NAs are excluded by default.
 

In [49]:
df3.describe()

Unnamed: 0,p,q,r,s,t
count,5.0,5.0,5.0,5.0,5.0
mean,16.6,19.2,28.2,27.0,35.8
std,14.293355,8.526429,13.479614,16.643317,18.226355
min,1.0,10.0,15.0,7.0,20.0
25%,4.0,15.0,17.0,16.0,24.0
50%,16.0,16.0,24.0,24.0,26.0
75%,31.0,23.0,42.0,44.0,46.0
max,31.0,32.0,43.0,44.0,63.0


#
We can specift exactly which percentiles to evaluate (but the median will always be printed by default.)


In [9]:
df3.describe(percentiles=[.01,.02,.09,.99]).round(3)

Unnamed: 0,p,q,r,s
count,5.0,5.0,5.0,5.0
mean,23.8,13.8,26.2,29.6
std,11.841,15.023,8.643,11.992
min,6.0,1.0,13.0,15.0
1%,6.52,1.04,13.44,15.24
2%,7.04,1.08,13.88,15.48
9%,10.68,1.36,16.96,17.16
50%,26.0,7.0,27.0,29.0
99%,35.84,34.56,35.8,42.88
max,36.0,35.0,36.0,43.0


# 
For categorical data, describe() will give a simple summary of the number of unique values and most frequently occurring values
We can pass the include= parameter to describe to control whether the summaries are printed for numeric or categorical variables by default.
 

In [54]:
# Our DataFrame with mixed types
df2

Unnamed: 0,ints,floats,strings
a,0,0.1,a
b,1,0.2,b
c,2,0.3,c
d,3,0.4,d
e,4,0.5,e


In [56]:
# Default behavior 
# same as: df3.describe(include=['number']) 
df2.describe()

Unnamed: 0,ints,floats
count,5.0,5.0
mean,2.0,0.3
std,1.581139,0.158114
min,0.0,0.1
25%,1.0,0.2
50%,2.0,0.3
75%,3.0,0.4
max,4.0,0.5


In [12]:
#  To get summaries for categorical variables only 
df2.describe(include=['object'])

Unnamed: 0,strings
count,5
unique,5
top,c
freq,1


In [13]:
df2

Unnamed: 0,ints,floats,strings
a,0,0.1,a
b,1,0.2,b
c,2,0.3,c
d,3,0.4,d
e,4,0.5,e


In [58]:
# Get summaries for all variables 
df2.describe(include='all')

Unnamed: 0,ints,floats,strings
count,5.0,5.0,5
unique,,,5
top,,,a
freq,,,1
mean,2.0,0.3,
std,1.581139,0.158114,
min,0.0,0.1,
25%,1.0,0.2,
50%,2.0,0.3,
75%,3.0,0.4,


## Handling Missing Values 
 By missing data we simply mean NULL or not present for whatever reason. Many phenomena could give rise to missing data but mostly it is just a matter of  either the data existed and was not collected or it never existed. 
Pandas treats the NumPy np.nan and the Python None as missing values. The approach for handling missing values in DataFrames is the same as that for Series (a Dataframe is afterall just a list of Series objects.)

#
.These can be detected in a Series or DataFrame using isnull, notnull which return booleans. 
.To filter out missing data from a Series/DataFrame, or to remove rows (default action) or columns with missing data in a DataFrame, we use dropna with the axis= and inplace= parameters
.Missing Value imputation is done using the fillna method (along with options like  ffill, bfill )

In [59]:
# Create some missing data
df3

Unnamed: 0,p,q,r,s,t
a,16,10,17,44,26
b,31,15,42,7,46
c,1,23,43,44,24
d,31,32,24,16,63
e,4,16,15,24,20


In [73]:
df3.ix[::2]=np.nan
df3

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,p,q,r,s,t
a,,,,,
b,31.0,15.0,42.0,7.0,46.0
c,,,,,
d,31.0,32.0,24.0,16.0,63.0
e,,,,,


In [74]:
#  Drop rows with missing data 
df3.dropna()

Unnamed: 0,p,q,r,s,t
b,31.0,15.0,42.0,7.0,46.0
d,31.0,32.0,24.0,16.0,63.0


In [69]:
# Detect Missing data 
df3.isnull()

Unnamed: 0,p,q,r,s,t
a,True,True,True,True,True
b,False,False,False,False,False
c,True,True,True,True,True
d,False,False,False,False,False
e,True,True,True,True,True


In [70]:
#  Replace missings with 0 (manually)
df3[df3.isnull()]=10
df3

Unnamed: 0,p,q,r,s,t
a,10.0,10.0,10.0,10.0,10.0
b,31.0,15.0,42.0,7.0,46.0
c,10.0,10.0,10.0,10.0,10.0
d,31.0,32.0,24.0,16.0,63.0
e,10.0,10.0,10.0,10.0,10.0


In [71]:
#  Replace missings with 0 (using fillna) 
df3.fillna(10)

Unnamed: 0,p,q,r,s,t
a,10.0,10.0,10.0,10.0,10.0
b,31.0,15.0,42.0,7.0,46.0
c,10.0,10.0,10.0,10.0,10.0
d,31.0,32.0,24.0,16.0,63.0
e,10.0,10.0,10.0,10.0,10.0


## Sorting Data 
Sorting data is a basic task that allows us to figure out if a given variable has outliers by looking at the values at its extremes. Both sort functions in Pandas take an ascending= parameter to control the nature of the sort. By default, it takes True so if you want to get the Series/DataFrame sorted in descending order, pass ascending=False For Reordering rows or columns we use sort_index() For Sorting on column values use  sort_values() which takes a by= parameter through which we may specify the column(s) on which we want to sort the data.

In [81]:
df4=pd.DataFrame(np.random.randn(15).reshape(5,3),index=list('bcdea'),columns=list('gef'))
df4

Unnamed: 0,g,e,f
b,-0.211668,0.656578,0.330311
c,-0.040383,-0.051212,0.548691
d,0.477832,-0.272694,1.572767
e,0.373196,0.354922,2.097383
a,0.492184,-0.669103,-0.683471


In [82]:
#  without arguments, sort_index() will sort the index (rows) of the DataFrame 
df4.sort_index()

Unnamed: 0,g,e,f
a,0.492184,-0.669103,-0.683471
b,-0.211668,0.656578,0.330311
c,-0.040383,-0.051212,0.548691
d,0.477832,-0.272694,1.572767
e,0.373196,0.354922,2.097383


In [83]:
# To sort column names
df4.sort_index(axis=1)

Unnamed: 0,e,f,g
b,0.656578,0.330311,-0.211668
c,-0.051212,0.548691,-0.040383
d,-0.272694,1.572767,0.477832
e,0.354922,2.097383,0.373196
a,-0.669103,-0.683471,0.492184


In [85]:
#  Sort the data by the values of a column 
df4.sort_values(by=['e','g'])

Unnamed: 0,g,e,f
a,0.492184,-0.669103,-0.683471
d,0.477832,-0.272694,1.572767
c,-0.040383,-0.051212,0.548691
e,0.373196,0.354922,2.097383
b,-0.211668,0.656578,0.330311


In [86]:
#  Sort the data by the values of 2 columns
df4.sort_values(by=['f','g'],ascending=False)

Unnamed: 0,g,e,f
e,0.373196,0.354922,2.097383
d,0.477832,-0.272694,1.572767
c,-0.040383,-0.051212,0.548691
b,-0.211668,0.656578,0.330311
a,0.492184,-0.669103,-0.683471


## Handling Duplicates 
The methods duplicated, drop_duplicates help us in identifying rows that are duplicates of other rows, and to ignore those rows from the data.
 

In [87]:
df4

Unnamed: 0,g,e,f
b,-0.211668,0.656578,0.330311
c,-0.040383,-0.051212,0.548691
d,0.477832,-0.272694,1.572767
e,0.373196,0.354922,2.097383
a,0.492184,-0.669103,-0.683471


In [89]:
# Create a duplicate row in the data 
df4.loc['x',:]=df4.loc['d',:]
df4

Unnamed: 0,g,e,f
b,-0.211668,0.656578,0.330311
c,-0.040383,-0.051212,0.548691
d,0.477832,-0.272694,1.572767
e,0.373196,0.354922,2.097383
a,0.492184,-0.669103,-0.683471
x,0.477832,-0.272694,1.572767


In [90]:
# Detect duplicates 
df4.duplicated()

b    False
c    False
d    False
e    False
a    False
x     True
dtype: bool

In [91]:
# Ignore duplicates
df4.drop_duplicates()

Unnamed: 0,g,e,f
b,-0.211668,0.656578,0.330311
c,-0.040383,-0.051212,0.548691
d,0.477832,-0.272694,1.572767
e,0.373196,0.354922,2.097383
a,0.492184,-0.669103,-0.683471
