In [1]:
import pandas as pd
import numpy as np

# Clean Data tips

## Create Dataset

In [2]:
# Toy frame: both headers contain a space and a capital letter, and each
# column carries one value (2000 / 1000) far larger than the rest.
df = pd.DataFrame(
    {
        'Column one': [10, 20, 30, 40, 50, 2000],
        'Column two': [1000, 0, 30, 40, 50, 60],
    }
)
df

Unnamed: 0,Column one,Column two
0,10,1000
1,20,0
2,30,30
3,40,40
4,50,50
5,2000,60


## Fix column names

In [3]:
# Normalise the headers in a single pass: spaces become underscores,
# then every character is lower-cased.
df.columns = df.columns.str.replace(" ", "_").str.lower()
df

Unnamed: 0,column_one,column_two
0,10,1000
1,20,0
2,30,30
3,40,40
4,50,50
5,2000,60


## Change Column datatypes


In [4]:
# Take an independent copy so the dtype experiments below cannot leak back
# into df (a plain `typechng = df` would only bind a second name to the
# same object).
typechng = df.copy()
typechng.dtypes

column_one    int64
column_two    int64
dtype: object

In [5]:
# Cast a single column to string; columns that are not named in the
# mapping keep their current dtype.
typechng = typechng.astype(
    dtype={'column_one': 'str'},
)
typechng.dtypes

column_one    object
column_two     int64
dtype: object

## Replace column value 

In [6]:
# Change every 1000 to 10 in column_two only.
# The nested mapping {column: {old: new}} restricts the replacement to that
# column, and replace() returns a new frame, so df itself is left untouched.
# (The previous chained `frame.column.replace(..., inplace=True)` mutated df
# through an alias and is not reliable under pandas Copy-on-Write.)
replace1 = df.replace({'column_two': {1000: 10}})
replace1


Unnamed: 0,column_one,column_two
0,10,10
1,20,0
2,30,30
3,40,40
4,50,50
5,2000,60


In [7]:
# Change every 1000 to 10 and every 2000 to 60, in all columns.
# replace() returns a new frame, so df is not modified and this cell gives
# the same result no matter how often it is re-run.
replace2 = df.replace({1000: 10, 2000: 60})
replace2

Unnamed: 0,column_one,column_two
0,10,10
1,20,0
2,30,30
3,40,40
4,50,50
5,60,60


# Working with NaN

### Create Data Set

In [8]:
# Five rows of standard-normal draws from a seeded generator, so that
# Restart & Run All reproduces the same numbers every time.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((5, 3)),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
# Reindexing onto labels that do not exist yet ('b', 'd', 'g') inserts
# rows filled with NaN for them.
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
# Second name bound to the same object as df (an alias, not a copy).
oneremovena1 = df
# Only the final expression of a cell is rendered, so the frame goes last.
df

### Remove all rows with null in column 'one'

In [9]:
# Boolean mask: True where column 'one' holds a value, False where it is
# NaN; .loc keeps only the True rows.
oneremovena = df.loc[df['one'].notna()]
oneremovena

Unnamed: 0,one,two,three
a,1.355036,1.509586,-0.687637
c,0.882154,0.269834,0.136956
e,-1.200436,0.767644,-0.164924
f,-0.028138,0.670236,0.932181
h,-0.744708,-2.039686,1.424826


### Propagate values forward

In [10]:
# Work on a copy so df keeps its NaN rows and this cell can be re-run safely.
onepropna = df.copy()
# Forward-fill column "one": each missing value takes the last non-missing
# value above it. ffill() replaces the deprecated fillna(method='pad'), and
# assigning the result back avoids a chained in-place call on a column.
onepropna['one'] = onepropna['one'].ffill()
onepropna

Unnamed: 0,one,two,three
a,1.355036,1.509586,-0.687637
b,1.355036,,
c,0.882154,0.269834,0.136956
d,0.882154,,
e,-1.200436,0.767644,-0.164924
f,-0.028138,0.670236,0.932181
g,-0.028138,,
h,-0.744708,-2.039686,1.424826


# Group By Example
* max and count 

In [11]:
dfgrpby = pd.DataFrame(
    {
        'one': [10, 10, 10, 20, 30, 30, 50, 50, 50],
        'two': [100, 100, 150, 150, 400, 400, 500, 500, 500],
        'three': [1120, 100, 150, 150, 450, 400, 500, 5000, 400],
    }
)

# For every (one, two) pair: the largest 'three' value in the group ...
maxdate = dfgrpby.groupby(by=['one', 'two']).max()
# ... and the number of non-null 'three' values in the group.
maxcount = dfgrpby.groupby(by=['one', 'two']).count()
display(maxdate, maxcount)

Unnamed: 0_level_0,Unnamed: 1_level_0,three
one,two,Unnamed: 2_level_1
10,100,1120
10,150,150
20,150,150
30,400,450
50,500,5000


Unnamed: 0_level_0,Unnamed: 1_level_0,three
one,two,Unnamed: 2_level_1
10,100,2
10,150,1
20,150,1
30,400,2
50,500,3


# Numpy Logic Tests
* can be used for filtering 

In [12]:
# Two boolean vectors that together cover all four True/False pairings.
test1 = [True, True, False, False]
test2 = [True, False, True, False]

# Print the element-wise result of each NumPy logical ufunc in turn.
for template, combine in (
    ('logical_and: {}', np.logical_and),
    ('logical_or: {}', np.logical_or),
    ('logical_xor: {}', np.logical_xor),
):
    print(template.format(combine(test1, test2)))

logical_and: [ True False False False]
logical_or: [ True  True  True False]
logical_xor: [False  True  True False]


In [13]:
# Sample frame used by the filtering examples that follow.
fds = pd.DataFrame(
    {
        'one': [10, 10, 10, 20, 30, 30, 50, 50, 50],
        'two': [100, 100, 150, 150, 400, 400, 500, 500, 500],
        'three': [1120, 100, 150, 150, 450, 400, 500, 5000, 400],
    }
)
fds

Unnamed: 0,one,two,three
0,10,100,1120
1,10,100,100
2,10,150,150
3,20,150,150
4,30,400,450
5,30,400,400
6,50,500,500
7,50,500,5000
8,50,500,400


# Filter results
* Use `np.logical_and` (or `np.logical_or` / `np.logical_xor`) to combine two conditions into one boolean Series
* Use `DataFrame.loc` with that Series to keep only the rows where it is True

In [14]:
# Each NumPy ufunc combines two per-row conditions into one boolean mask;
# .loc then keeps only the rows where that mask is True.
fds_and = fds.loc[np.logical_and(fds['one'] == 10, fds['three'] > 110)]
fds_or = fds.loc[np.logical_or(fds['one'] == 30, fds['three'] < 500)]
fds_xor = fds.loc[np.logical_xor(fds['one'] == 30, fds['three'] < 500)]

# Show every filtered frame under its own heading.
for heading, subset in (
    ('Logical and', fds_and),
    ('Logical or', fds_or),
    ('Logical xor', fds_xor),
):
    print(heading)
    display(subset)

Logical and


Unnamed: 0,one,two,three
0,10,100,1120
2,10,150,150


Logical or


Unnamed: 0,one,two,three
1,10,100,100
2,10,150,150
3,20,150,150
4,30,400,450
5,30,400,400
8,50,500,400


Logical xor


Unnamed: 0,one,two,three
1,10,100,100
2,10,150,150
3,20,150,150
8,50,500,400


In [15]:
# Keep the rows whose 'three' value lies in the range 200..500; with the
# default settings both end points are included.
fds_between = fds.loc[
    fds['three'].between(left=200, right=500)
]
display(fds_between)

Unnamed: 0,one,two,three
4,30,400,450
5,30,400,400
6,50,500,500
8,50,500,400


# Reshape Dataframes

In [16]:
# Single-column frame holding the integers 1 through 8.
onecoldf = pd.DataFrame({'one': list(range(1, 9))})
onecoldf

Unnamed: 0,one
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8


In [17]:
# Re-flow the underlying array into rows of four values each; the -1 lets
# NumPy work out the row count from the number of elements.
fourcoldf = pd.DataFrame(
    data=onecoldf.values.reshape((-1, 4)),
)
fourcoldf

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,5,6,7,8


In [18]:
# Two-row, four-column frame used as input for the next reshape.
fourcoldf = pd.DataFrame(
    {
        'one': [1, 2],
        'two': [3, 4],
        'three': [5, 6],
        'four': [7, 8],
    }
)
fourcoldf

Unnamed: 0,one,two,three,four
0,1,3,5,7
1,2,4,6,8


In [19]:
# Re-flow the values into rows of two. The array is read row by row, so
# each original row of four is split into two consecutive pairs.
twocoldf = pd.DataFrame(
    data=fourcoldf.values.reshape((-1, 2)),
)
twocoldf

Unnamed: 0,0,1
0,1,3
1,5,7
2,2,4
3,6,8
