# Data Processing

## Create a simulated dataframe:

In [1]:
import pandas as pd

In [2]:
# Simulated records: one row per (patient, obs) pair, with a 0/1 treatment
# flag and a categorical score.
raw_data = dict(
    patient=[1, 1, 1, 2, 2],
    obs=[1, 2, 3, 1, 2],
    treatment=[0, 1, 0, 1, 0],
    score=['strong', 'weak', 'normal', 'weak', 'strong'],
)

# State the column order explicitly so the frame layout is obvious to the reader
df = pd.DataFrame(raw_data, columns=['patient', 'obs', 'treatment', 'score'])

In [3]:
# Preview the first rows of the simulated frame
df.head()

Unnamed: 0,patient,obs,treatment,score
0,1,1,0,strong
1,1,2,1,weak
2,1,3,0,normal
3,2,1,1,weak
4,2,2,0,strong


## How to: convert categorical data to numerical labels in a pandas dataframe:

In [4]:
from sklearn import preprocessing

In [5]:
# Create a label (category) encoder object
le = preprocessing.LabelEncoder()

In [6]:
# Fit the encoder to the pandas column so it learns the distinct score categories
le.fit(df['score'])

LabelEncoder()

In [7]:
# View the labels the encoder learned (optional), as a plain Python list
list(le.classes_)

['normal', 'strong', 'weak']

In [8]:
# The same labels without the list() wrapper
le.classes_

array(['normal', 'strong', 'weak'], dtype=object)

In [9]:
# Apply the fitted encoder to the pandas column: each category is replaced by
# its integer position in le.classes_
le.transform(df['score'])

array([1, 2, 0, 2, 1])

In [10]:
# Convert some integer codes back into their category names
list(le.inverse_transform([0, 2, 1]))

['normal', 'weak', 'strong']

## How to: delete observations with missing values:

In [11]:
import numpy as np

In [12]:
# Example feature matrix: five observations with two features each.
# The first feature of the last observation is missing (NaN).
X = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [np.nan, 55],
])

In [13]:
# Remove observations with missing values:
# np.isnan(X).any(axis=1) flags every row that contains at least one NaN,
# and ~ inverts that mask so only the complete rows are kept
X[~np.isnan(X).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

## How to: Delete observations with missing values (NumPy vs. pandas):

In [14]:
# Feature matrix with five observations and two features;
# the last observation has a missing (NaN) first feature.
X = np.array([
    [1, 2],
    [6, 3],
    [8, 4],
    [9, 5],
    [np.nan, 4],
])

### How to delete missing values with numpy

In [15]:
# Remove observations with missing values: build a per-row "has any NaN" mask
# and keep only the rows where that mask is False
X[~np.isnan(X).any(axis=1)]

array([[1., 2.],
       [6., 3.],
       [8., 4.],
       [9., 5.]])

### How to delete missing values with pandas

In [16]:
# Wrap the feature matrix in a data frame with named columns
df = pd.DataFrame(data=X, columns=['feature_1', 'feature_2'])

# Drop every row (axis=0) in which any value is missing
df.dropna(axis=0, how='any')

Unnamed: 0,feature_1,feature_2
0,1.0,2.0
1,6.0,3.0
2,8.0,4.0
3,9.0,5.0


## How to: Impute missing values with means:

In [17]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [18]:
# Build a dataset with two variables, x0 and x1, in a single constructor call.
# The first value of x1 is deliberately a missing value (NaN).
df = pd.DataFrame({
    'x0': [0.3051, 0.4949, 0.6974, 0.3769, 0.2231, 0.341, 0.4436, 0.5897, 0.6308, 0.5],
    'x1': [np.nan, 0.2654, 0.2615, 0.5846, 0.4615, 0.8308, 0.4962, 0.3269, 0.5346, 0.6731],
})

# View the dataset
df

Unnamed: 0,x0,x1
0,0.3051,
1,0.4949,0.2654
2,0.6974,0.2615
3,0.3769,0.5846
4,0.2231,0.4615
5,0.341,0.8308
6,0.4436,0.4962
7,0.5897,0.3269
8,0.6308,0.5346
9,0.5,0.6731


In [19]:
# Create an imputer that replaces NaN entries with the mean of their column.
# (SimpleImputer always works column-wise; it has no `axis` argument.)
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the column means from df and fill the gaps in a single step.
# Fitting and transforming the same DataFrame keeps the input type consistent:
# fitting on a DataFrame and then transforming `df.values` makes recent
# scikit-learn versions warn that the input has no valid feature names.
imputed_df = mean_imputer.fit_transform(df)

# View the imputed data (a NumPy array, despite the variable name)
imputed_df

array([[0.3051    , 0.49273333],
       [0.4949    , 0.2654    ],
       [0.6974    , 0.2615    ],
       [0.3769    , 0.5846    ],
       [0.2231    , 0.4615    ],
       [0.341     , 0.8308    ],
       [0.4436    , 0.4962    ],
       [0.5897    , 0.3269    ],
       [0.6308    , 0.5346    ],
       [0.5       , 0.6731    ]])

## How to: Impute missing class labels

In [20]:
# Load libraries
import numpy as np
from sklearn.impute import SimpleImputer

In [21]:
# Feature matrix whose first column is a categorical (class) feature;
# the class is missing (NaN) for the last two observations.
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [0, -0.21, -1.19],
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

In [22]:
# Create an imputer that fills missing values with the most frequent class
imputer = SimpleImputer(strategy='most_frequent')

# Fit on X and fill the missing class labels in one step
imputer.fit_transform(X)

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 0.  , -0.21, -1.19],
       [ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22]])

## How to: Calculate difference between dates and times:

In [23]:
# Build a data frame with two datetime features, one pair of timestamps per row
df = pd.DataFrame({
    'Arrived': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')],
    'Left': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')],
})

In [24]:
# Calculate the duration between the two datetime features (row-wise Left minus Arrived)
df['Left'] - df['Arrived']

0   0 days
1   2 days
dtype: timedelta64[ns]

In [25]:
# Store the duration as a plain integer number of days: the subtraction yields
# a timedelta column and .dt.days extracts its day count
df['days'] = (df['Left'] - df['Arrived']).dt.days
df.head()

Unnamed: 0,Arrived,Left,days
0,2017-01-01,2017-01-01,0
1,2017-01-04,2017-01-06,2


## How to: Rescale a continuous feature using minmax scaling:

In [26]:
from sklearn import preprocessing
import numpy as np

In [27]:
# Create the feature as a single column (5 observations x 1 feature)
x = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

In [28]:
# Create a min-max scaler that maps the feature onto the range [0, 1]
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Fit the scaler to x and rescale it in one step
x_scale = minmax_scale.fit_transform(x)

# Show the rescaled feature
x_scale

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

## How to: standardize a feature:

In [29]:
# Load libraries
from sklearn import preprocessing
import numpy as np

In [30]:
# Create the feature to standardize, laid out as one column with five rows
x = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

In [31]:
# Create a standard scaler
scaler = preprocessing.StandardScaler()

# Fit the scaler to x and transform it in one step
standardized = scaler.fit_transform(x)

# Show the standardized feature
standardized

array([[-1.26687088],
       [-0.39316683],
       [-0.17474081],
       [ 0.0436852 ],
       [ 1.79109332]])