# Data Cleaning and Preparation
## DAT540 Introduction to Data Science
## University of Stavanger
### L10
#### Antorweep Chakravorty (antorweep.chakravorty@uis.no)

In [2]:
import numpy as np
import pandas as pd

- **Data Transformation**
- *Removing Duplicates*
- Duplicate rows may be found in a DataFrame for various reasons
- *duplicated* instance method returns a boolean Series indicating whether each row is duplicate or has been observed before
- *drop_duplicates* return a DataFrame where the duplicated array is False
- The *subset* argument can be provided to either of the methods for detecting duplicates based on the filtered columns / rows
- *keep* argument in drop_duplicates is used to retain the "first" (default) or "last" observed value

In [3]:
# Get the adult dataset from UCI Machine Learning repo
adulturl =  'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
adult = pd.read_csv(adulturl, header=None)
adult.shape

(32561, 15)

In [4]:
adult.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [52]:
# The names of column indices 
namesurl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names'
names = pd.read_table(namesurl, header = None)
print("shape", names.shape)
names.head(5)

shape (107, 1)


Unnamed: 0,0
0,| This data was extracted from the census bure...
1,| http://www.census.gov/ftp/pub/DES/www/welcom...
2,"| Donor: Ronny Kohavi and Barry Becker,"
3,| Data Mining and Visualization
4,| Silicon Graphics.


In [53]:
names

Unnamed: 0,0
0,| This data was extracted from the census bure...
1,| http://www.census.gov/ftp/pub/DES/www/welcom...
2,"| Donor: Ronny Kohavi and Barry Becker,"
3,| Data Mining and Visualization
4,| Silicon Graphics.
...,...
102,"sex: Female, Male."
103,capital-gain: continuous.
104,capital-loss: continuous.
105,hours-per-week: continuous.


In [54]:
names.tail(adult.shape[1])

Unnamed: 0,0
92,">50K, <=50K."
93,age: continuous.
94,"workclass: Private, Self-emp-not-inc, Self-emp..."
95,fnlwgt: continuous.
96,"education: Bachelors, Some-college, 11th, HS-g..."
97,education-num: continuous.
98,"marital-status: Married-civ-spouse, Divorced, ..."
99,"occupation: Tech-support, Craft-repair, Other-..."
100,"relationship: Wife, Own-child, Husband, Not-in..."
101,"race: White, Asian-Pac-Islander, Amer-Indian-E..."


In [55]:
# retrieve only the relevant rows
names = names.tail(adult.shape[1])
names


Unnamed: 0,0
92,">50K, <=50K."
93,age: continuous.
94,"workclass: Private, Self-emp-not-inc, Self-emp..."
95,fnlwgt: continuous.
96,"education: Bachelors, Some-college, 11th, HS-g..."
97,education-num: continuous.
98,"marital-status: Married-civ-spouse, Divorced, ..."
99,"occupation: Tech-support, Craft-repair, Other-..."
100,"relationship: Wife, Own-child, Husband, Not-in..."
101,"race: White, Asian-Pac-Islander, Amer-Indian-E..."


In [56]:
names = names.reset_index(drop=True)
names = list(names.applymap(lambda x: x.split(':')[0]).values.flatten())
print(names)

['>50K, <=50K.', 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']


In [57]:
adult.head(2)

Unnamed: 0,Salary,Age,Workclass,Fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Weekly-Work-Hours,Native-Country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [58]:
# shift indices in regards to the order of columns in adult
names[-1:], names[:-1] = names[:1], names[1:]
print(names)

['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', '>50K, <=50K.']


In [59]:
adult.columns = names
adult.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,">50K, <=50K."
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [60]:
print('shape of adults before removing duplicates', adult.shape)
# check for duplicated on adults dataset using instance method duplicated
print('no. of duplicates', adult.duplicated().sum(axis=0))
# Remove the duplicate using drop_duplicates
adult.drop_duplicates(inplace = True)
print('shape of adults after removing duplicates', adult.shape)

shape of adults before removing duplicates (32537, 15)
no. of duplicates 0
shape of adults after removing duplicates (32537, 15)


- *Transforming Data Using a Function or Mapping*
- Allows transformation based on values in an array, Series, or column in a DataFrame
- The *map* instance method on a Series accepts a function or dict-like object containing a mapping. It returns a maps of the given Series to the values in the dict object
- The map method also accepts a function, that performs an element wise operation for each value in the series returning the mapping

In [61]:
# Transform gender to lower case
adult['sex'].map(lambda x: x.lower()).head()

0       male
1       male
2       male
3       male
4     female
Name: sex, dtype: object

- *Replacing Values*
- *replace* instance method, provides even a more simpler way for performing element wise transformation
- It allows replacement of an encountered value to the specified value in a Series or DataFrame
- The first argument (*to_replace*) looks for the values that has to be replaced. The second argument (*value*), specifies the values that would be replaced with
- The arguments, could be scalar or a list allowing us to replace multiple values with scalar or replacing each value with a different replacement
- A single dict would map the keys as the values to be replaced with the value for each key
- the inplace argument can be provided for both of the methods, to perform operations without making a copy

In [62]:
# Using replace instance method. Set regex=True
adult.replace(to_replace=['Male', 'Female'], value=['m','f'], regex=True).head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,">50K, <=50K."
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,m,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,m,0,0,13,United-States,<=50K


- *Renaming Axis Indexes*
- Axis labels can be transformed similarly using mapping
- Like a Series, the axis indices have a map method
- Alternatively, *rename* instance method can be used on a DataFrame to rename the columns and indices. A list or a dict can be provided to the method along with the inplace arg

In [63]:
# Let us change the column names of the adult dataset with each name starting with a capital letter. 
# Also strip any whitespaces
# The same can be done for the row indices
adult.columns = adult.columns.map(lambda x: x.title().strip())
adult.head(2)

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-Per-Week,Native-Country,">50K, <=50K."
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [64]:
# Alternatively the rename instance method can be used with a dict to rename columns or index
adult.rename(columns={'>50K, <=50K.': 'Salary', 'Hours-Per-Week': 'Weekly-Work-Hours'}, inplace=True)
adult.head(2)

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Weekly-Work-Hours,Native-Country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


- *Discretization and Binning*
- Continuous data is sometimes preferred to be discretized or separated into "bins" for analysis
- pandas offers a top-level *cut* method to segment and sort data values into bins
  - The first argument is a array-like 1-D object that we want to discretise
  - The second argument *bins* is a int, sequence of scalar or pandas.IntervalIndex. Providing a scaler int, creates the specified number of  equal length bins
  - The *right* bool argument indicates whether bins include the right most edge
  - *labels* array or bool specifies the labels for the returned bins (optional)
  - The *precision* (int, default 3) argument defines the precision at which to store and display the bins labels.
  - cut method returns a *Categorical* object with codes and categories
- *qcut* another top-level method bins the data on sample quantiles based on the distribution of the data

In [66]:
# Create bins for the ages into ageBins
# bins = [2, 3, 5, 12, 18, 21, 34, 50, 80, 100, 1000]
# label = labels=['Infant', 'Toddler', 'Early Childhood ', 'Middle Childhood', 'Late Childhood', 'Teen', 'Early Adulthood', 'Midlife', 'Mature Adulthood', 'Late Adulthood']
# right = True, to create inclusive bins
ageBins = pd.cut(adult['Age'], bins=[2, 3, 5, 12, 18, 21, 34, 50, 80, 100, 1000], right=True, labels=['Infant', 'Toddler', 'Early Childhood ', 'Middle Childhood', 'Late Childhood', 'Teen', 'Early Adulthood', 'Midlife', 'Mature Adulthood', 'Late Adulthood'])
# Use value_counts() to display the frequencies
ageBins.value_counts()

Early Adulthood     12043
Teen                10912
Midlife              6361
Late Childhood       2178
Middle Childhood      945
Mature Adulthood       98
Infant                  0
Toddler                 0
Early Childhood         0
Late Adulthood          0
Name: Age, dtype: int64

In [67]:
# cut chooses the bins to be evenly spaced according to the values themselves and not the frequency of those values. 
# qcut chooses the bins to be evenly spaced according to the frequencies of the bins
# demonstrate qcut and cut on Age with 4 bins. Use value_counts() to display the frequencies
print('pd.qcut:\n', pd.qcut(adult['Age'], 4).value_counts())
print('pd.cut:\n', pd.cut(adult['Age'], 4).value_counts())

pd.qcut:
 (16.999, 28.0]    8885
(37.0, 48.0]      8235
(28.0, 37.0]      7781
(48.0, 90.0]      7636
Name: Age, dtype: int64
pd.cut:
 (16.927, 35.25]    14910
(35.25, 53.5]      12705
(53.5, 71.75]       4455
(71.75, 90.0]        467
Name: Age, dtype: int64


- *Detecting and Filtering Outliers*
- Is a matter of applying array operations


In [68]:
# Perfrom describe on the adult data set to have an overview
adult.describe()

Unnamed: 0,Age,Fnlwgt,Education-Num,Capital-Gain,Capital-Loss,Weekly-Work-Hours
count,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0
mean,38.585549,189780.8,10.081815,1078.443741,87.368227,40.440329
std,13.637984,105556.5,2.571633,7387.957424,403.101833,12.346889
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,236993.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [69]:
# From an overview, we notice that someone works only 1 hour and another 99 hours per week. 
# How many of such people are there. Is it the norm?
# Might be that there are more anomalies in the Weekly-Work-Hours Series
# Create bins using pd.cut on Weekly-Work-Hours and perform value_counts() to gain a deep overview
# use bins=[0, 1, 2, 3, 5, 40, 80, 90, 98, 1000] and right=True
# use maps on value_counts to generate the percentage
# is there anomalies?
pd.cut(adult['Weekly-Work-Hours'], bins=[0, 1, 2, 3, 5, 40, 80, 90, 98, 1000], right=True).value_counts().map(lambda x: x/adult['Weekly-Work-Hours'].value_counts().sum() * 100)

(5, 40]       69.938839
(40, 80]      28.791837
(3, 5]         0.350370
(80, 90]       0.301196
(98, 1000]     0.261241
(2, 3]         0.119864
(1, 2]         0.098350
(90, 98]       0.076836
(0, 1]         0.061468
Name: Weekly-Work-Hours, dtype: float64

- *Permutaion and Random Sampling*
- Permuting (randomly ordering) a Series or rows in a DataFrame can be performed using *numpy.random.permutation* top-level method
- Calling *permutation* with the length of the axis required for permuting produces an array of integers indicating new ordering
- *iloc* based indexing or the equivalent *take* instance method can be used to select the required indices or columns
- The *sample* instance method of a DataFrame or Series randomly retrieves the number of specified elements from an axis
  - the *replace* bool arg. specifies requirement for generating samples *with replacement*

In [70]:
# we will use numpy.random.permutation to generate a seq len 3 of permuted ints between 0 and 3
sampler = np.random.permutation(3)
print('sampler', sampler)
# Use iLoc argument of adult df to sample rows (alternatively we could use take instance method with the sampler)
adult.iloc[sampler]

sampler [0 2 1]


Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Weekly-Work-Hours,Native-Country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [71]:
# Use sample instance method to retrive 3 random rows from adult
adult.sample(3)

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Weekly-Work-Hours,Native-Country,Salary
6530,40,Private,321758,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,>50K
3912,49,Private,82649,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,5013,0,45,United-States,<=50K
2549,43,Private,57600,Doctorate,16,Married-spouse-absent,Prof-specialty,Not-in-family,White,Female,0,0,40,?,<=50K


- *Computing Indicators/Dummy Variables*
- Converting a categorical variable into a "dummy" or "indicator" matrix
- Similar to *one hot encoding*
- If a column in a DataFrame has *k* distinct values, a matrix or DataFrame can be derived with k columns containing all 1s and 0s
- The pandas *get_dummies* instance method performs the required operation
  - a *prefix* argument allows to add a prefix to each column name
- *get_dummies* and *cut* methods in conjunction provides a useful recipe for statistical applications

- **String Manipulation**
- applying string and regular expressions concisely on whole arrays of data
- *String Object Methods*
- The *split* instance method is used to split a string into an array based on a specified delimiter
- *trim* instance method, trims white spaces 
- substrings can be concatinated together using the addition *+* symbol
- the python string method *index* finds the first occurrence of a character in a string
- *index* raises an exception if a string isn't found. Alternatively, the *find* method can also be used that returns *-1* if a index is not found
- string instance method *count* returns the number of times a char appears in a string
- *replace* string instance method substitutes occurrence of one pattern to another
- Built-in python string methods
<img src='images/string_methods.png'>

- *Regular Expressions*
- A single expression (*regex*) is a string formed according to the regular expression language
- regex describes a pattern to locate in a text that can be used for many purposes
- python provides the built-in *re* module for applying regex to strings
- The *re* module functions fall into three categories: pattern matching, substitution, and splitting
- *re.compile* compiles a regex to be reusable later. 
- regex methods:
  
  ```python
  regex = re.compile('\s*')
  regex.split(text)
  ```
  
<img src='images/regex_methods.png'>

- *Vectorized String Functions in pandas*
- Cleaning up a messy dataset for analysis often requires a lot of string munging and regularization
- string and regex methods applied to missing data using map, will fail on NA (null) values
- Series addresses this by providing array-oriented methods for string operations that skip NA values
- These methods are access through the Series *str* attribute
- Partial listing of vectorized string methods:

<img src='images/vectorizedstr_methods.png' width=400>

In [72]:
adult['Native-Country'].head(5)

0     United-States
1     United-States
2     United-States
3     United-States
4              Cuba
Name: Native-Country, dtype: object

In [73]:
# using findall method on 'Native-Country' to get the list of all occurrences of regex '\-*' 
#  We use the findall or any other vectorized string methods from the str object of a pandas object 
adult['Native-Country'].str.findall('\-*').head(5)

0    [, , , , , , , -, , , , , , , ]
1    [, , , , , , , -, , , , , , , ]
2    [, , , , , , , -, , , , , , , ]
3    [, , , , , , , -, , , , , , , ]
4                       [, , , , , ]
Name: Native-Country, dtype: object

In [74]:
# Create two DataFrames with MultiIndex 
# Gen a sequence of 9 randn
data1 = pd.DataFrame(np.random.randn(9),
                index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'], [1,2,3,1,3,1,2,2,3]])
# Gen a sequence of 6 randn
data2 = pd.DataFrame(np.random.randn(6),
                index=[['a', 'a', 'e', 'e', 'd', 'd'], [1,2,1,2,1,2]])

# Set index names
data1.index.names = ['lev.0', 'lev.1']
data2.index.names = ['lev.0', 'lev.1']

# Set col
print('date1:\n', data1)
print('date2:\n', data2)

# perform outer join using merge. 
# pd.merge(data1, data2, left_index=True, right_index=True, how='outer')

# perform outer join using join instance method. Same as merge
data1.join(data2, lsuffix='_l', rsuffix='_r', how='outer')

date1:
                     0
lev.0 lev.1          
a     1     -1.975892
      2      0.817396
      3     -0.201078
b     1     -0.757637
      3     -1.524694
c     1      0.078530
      2      0.248779
d     2     -1.387155
      3      0.536233
date2:
                     0
lev.0 lev.1          
a     1      0.900342
      2     -0.797347
e     1     -0.936645
      2      0.314306
d     1     -0.102533
      2     -1.432609


Unnamed: 0_level_0,Unnamed: 1_level_0,0_l,0_r
lev.0,lev.1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,-1.975892,0.900342
a,2,0.817396,-0.797347
a,3,-0.201078,
b,1,-0.757637,
b,3,-1.524694,
c,1,0.07853,
c,2,0.248779,
d,1,,-0.102533
d,2,-1.387155,-1.432609
d,3,0.536233,


In [75]:
adult.to_csv('data/adult_sanitized.csv')