In [1]:
import pandas as pd
import numpy as np

# Clean Data tips

## Create Dataset

In [2]:
df = pd.DataFrame({'Column one':[10,20,30,40,50,2000],'Column two':[1000,0,30,40,50,60]})
df

Unnamed: 0,Column one,Column two
0,10,1000
1,20,0
2,30,30
3,40,40
4,50,50
5,2000,60


## Fix column name

In [3]:
df.columns = df.columns.str.replace(" ", "_")
df.columns = df.columns.str.lower()
df

Unnamed: 0,column_one,column_two
0,10,1000
1,20,0
2,30,30
3,40,40
4,50,50
5,2000,60


## Change Column datatypes


In [4]:
typechng = df
typechng.dtypes

column_one    int64
column_two    int64
dtype: object

In [5]:
typechng = typechng.astype({'column_one': 'str'}) # change one column 
typechng.dtypes

column_one    object
column_two     int64
dtype: object

## Replace column value 

In [6]:
replace1 = df
replace1.column_two.replace({1000:10}, inplace = True) ## change all values 1000 to 10 in column 2 
replace1


Unnamed: 0,column_one,column_two
0,10,10
1,20,0
2,30,30
3,40,40
4,50,50
5,2000,60


In [7]:
replace2 = df
replace2.replace({1000:10,2000:60}, inplace = True) ## change all values 1000 to 10 in and 2000 to 60 in all columns
replace2

Unnamed: 0,column_one,column_two
0,10,10
1,20,0
2,30,30
3,40,40
4,50,50
5,60,60


# Working with NaN

### Create Data Set

In [8]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df
oneremovena1 = df

### Remove all rows with null in column 'one'

In [9]:
oneremovena = df[df.one.notna()]
oneremovena

Unnamed: 0,one,two,three
a,-1.463,-0.160588,-0.416928
c,1.578594,-1.018931,0.095473
e,-0.695207,-1.262941,1.116766
f,0.70504,1.462496,0.964011
h,-0.815406,0.307333,0.475999


### Propogate values forward 

In [10]:
onepropna = df 
onepropna.one.fillna(method= 'pad', inplace = True) # propogate column "one" to next row if empty 
onepropna

Unnamed: 0,one,two,three
a,-1.463,-0.160588,-0.416928
b,-1.463,,
c,1.578594,-1.018931,0.095473
d,1.578594,,
e,-0.695207,-1.262941,1.116766
f,0.70504,1.462496,0.964011
g,0.70504,,
h,-0.815406,0.307333,0.475999


# Group By Example
* max and count 

In [11]:
dfgrpby = pd.DataFrame({'one':[10,10,10,20,30,30,50,50,50],'two':[100,100,150,150,400,400,500,500,500], 'three':[1120,100,150,150,450,400,500,5000,400]})

maxdate = dfgrpby.groupby(['one', 'two']).max() # Get max date dataset 
maxcount = dfgrpby.groupby(['one', 'two']).count() # get count of dataset
display(maxdate)
display(maxcount)

Unnamed: 0_level_0,Unnamed: 1_level_0,three
one,two,Unnamed: 2_level_1
10,100,1120
10,150,150
20,150,150
30,400,450
50,500,5000


Unnamed: 0_level_0,Unnamed: 1_level_0,three
one,two,Unnamed: 2_level_1
10,100,2
10,150,1
20,150,1
30,400,2
50,500,3


# Numpy Logic Tests
* can be used for filtering 

In [12]:
test1 = [True, True, False, False]
test2 = [True, False, True, False]


print ('logical_and: {}'.format(np.logical_and(test1,test2)))
print ('logical_or: {}'.format(np.logical_or(test1,test2)))
print ('logical_xor: {}'.format(np.logical_xor(test1,test2)))

logical_and: [ True False False False]
logical_or: [ True  True  True False]
logical_xor: [False  True  True False]


# Filter results

All filters used create a boolean series. This series can be use in a vectorized operation to filter results in a dataframe by using boolean indexing. 

* Example 1 - Filter results with one logical oprator. 
* Example 2 - Filter results with multiple logical operators
* Example 4 - Between

and = & 
or = |
not = ~
xor = np.logical_xor 


In [13]:
fds = pd.DataFrame({'one':[10,10,10,20,30,30,50,50,50],'two':[100,100,150,150,400,400,500,500,500], 'three':[1120,100,150,150,450,400,500,5000,400]})
fds

Unnamed: 0,one,two,three
0,10,100,1120
1,10,100,100
2,10,150,150
3,20,150,150
4,30,400,450
5,30,400,400
6,50,500,500
7,50,500,5000
8,50,500,400


In [14]:
fds_one = fds[fds.one > 10]
fds_notone = fds[~(fds.one > 10)]
display(fds_one)
display(fds_notone)

Unnamed: 0,one,two,three
3,20,150,150
4,30,400,450
5,30,400,400
6,50,500,500
7,50,500,5000
8,50,500,400


Unnamed: 0,one,two,three
0,10,100,1120
1,10,100,100
2,10,150,150


In [15]:
fds_and = fds[(fds.one == 10) & (fds.three > 110)]
fds_or = fds[(fds.one == 30) | (fds.three < 500)]
fds_xor = fds[np.logical_xor(fds.one == 30, fds.three < 500)] #need to use np logical or function 
print('Logical and')
display(fds_and)
print('Logical or')
display(fds_or)
print('Logical xor')
display(fds_xor)

Logical and


Unnamed: 0,one,two,three
0,10,100,1120
2,10,150,150


Logical or


Unnamed: 0,one,two,three
1,10,100,100
2,10,150,150
3,20,150,150
4,30,400,450
5,30,400,400
8,50,500,400


Logical xor


Unnamed: 0,one,two,three
1,10,100,100
2,10,150,150
3,20,150,150
8,50,500,400


In [16]:
fds_between = fds[fds.three.between(200, 500)]
display(fds_between)

Unnamed: 0,one,two,three
4,30,400,450
5,30,400,400
6,50,500,500
8,50,500,400


### More efficient way to calculate is it use Numpy logical functions
- Numpy operations work directly on memory and are more efficent 
- Numpy logical operations are faster in very large datasets 
- In most cases normal logical operators can be used

In [17]:
fds_and = fds[np.logical_and(fds.one == 10, fds.three > 110)]
fds_or = fds[np.logical_or(fds.one == 30, fds.three < 500)]
fds_between = fds[np.logical_and(fds.three > 200, fds.three <= 500)]                  
print('Numpy Logical and')
display(fds_and)
print('Numpy Logical or')
display(fds_or)
print('Numpy between')
display(fds_or)

Numpy Logical and


Unnamed: 0,one,two,three
0,10,100,1120
2,10,150,150


Numpy Logical or


Unnamed: 0,one,two,three
1,10,100,100
2,10,150,150
3,20,150,150
4,30,400,450
5,30,400,400
8,50,500,400


Numpy between


Unnamed: 0,one,two,three
1,10,100,100
2,10,150,150
3,20,150,150
4,30,400,450
5,30,400,400
8,50,500,400


# Reshape Dataframes

In [18]:
onecoldf = pd.DataFrame({'one':[1,2,3,4,5,6,7,8]})
onecoldf

Unnamed: 0,one
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8


In [19]:
fourcoldf = pd.DataFrame(onecoldf.values.reshape(-1,4))
fourcoldf

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,5,6,7,8


In [20]:
twocoldf = pd.DataFrame(fourcoldf.values.reshape(-1, 2))
twocoldf

Unnamed: 0,0,1
0,1,2
1,3,4
2,5,6
3,7,8


# Pivot/Unpivot a data frame 

## Pivot DataFrame

In [21]:
dfPivot = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
                         "bar", "bar", "bar", "bar"],
                   "B": ["one", "one", "one", "two", "two",
                         "one", "one", "two", "two"],
                   "C": ["small", "large", "large", "small",
                         "small", "large", "small", "small",
                         "large"],
                   "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
                   "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})

dfPivot

Unnamed: 0,A,B,C,D,E
0,foo,one,small,1,2
1,foo,one,large,2,4
2,foo,one,large,2,5
3,foo,two,small,3,5
4,foo,two,small,3,6
5,bar,one,large,4,6
6,bar,one,small,5,8
7,bar,two,small,6,9
8,bar,two,large,7,9


In [22]:
#group by A and B with columns c and D as the data summed with missing values set to 0
table1 = pd.pivot_table(dfPivot, values='D', index=['A', 'B'],
                    columns=['C'], aggfunc=np.sum, fill_value=0)
table1

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4,5
bar,two,7,6
foo,one,4,1
foo,two,0,6


In [23]:
# groupby by A and B set columns to D and E, aggrigate D by the Mean and E by the min, max and mean 
table2 = pd.pivot_table(dfPivot, values=['D', 'E'], index=['A', 'C'],
                    aggfunc={'D': np.mean,
                             'E': [min, max, np.mean]})
table2

Unnamed: 0_level_0,Unnamed: 1_level_0,D,E,E,E
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,min
A,C,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,large,5.5,9.0,7.5,6.0
bar,small,5.5,9.0,8.5,8.0
foo,large,2.0,5.0,4.5,4.0
foo,small,2.333333,6.0,4.333333,2.0


## Unpivot DataFrame

In [24]:
dfunpivot = pd.DataFrame({'Heading': {0: 'a', 1: 'b', 2: 'c'},
                   'cat1': {0: 1, 1: 3, 2: 5},
                   'cat2': {0: 2, 1: 4, 2: 6},
                   'cat3': {0: 3, 1: 2, 2: 7}})
dfunpivot

Unnamed: 0,Heading,cat1,cat2,cat3
0,a,1,2,3
1,b,3,4,2
2,c,5,6,7


In [25]:
# unpivot single catagory
pd.melt(dfunpivot, id_vars=['Heading'], value_vars=['cat1'])

Unnamed: 0,Heading,variable,value
0,a,cat1,1
1,b,cat1,3
2,c,cat1,5


In [26]:
# unpivot multiple catagory
pd.melt(dfunpivot, id_vars=['Heading'], value_vars=['cat1','cat2'])

Unnamed: 0,Heading,variable,value
0,a,cat1,1
1,b,cat1,3
2,c,cat1,5
3,a,cat2,2
4,b,cat2,4
5,c,cat2,6


In [27]:
# unpivot all catagories
pd.melt(dfunpivot, id_vars='Heading', var_name ="Cat", value_name ="values")

Unnamed: 0,Heading,Cat,values
0,a,cat1,1
1,b,cat1,3
2,c,cat1,5
3,a,cat2,2
4,b,cat2,4
5,c,cat2,6
6,a,cat3,3
7,b,cat3,2
8,c,cat3,7
