<a href="https://colab.research.google.com/github/SuperNZH/Deep-Learning-Practice/blob/main/Reuse_Function/reuse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Reusable Functions for Machine Learning and Colab platform**

This notebook records every useful, reusable function and code snippet I come across during my work and study.

## **Google Drive**

### **Mount Drive**

In [None]:
# Mount Google Drive at /content/drive so that later cells can read files
# stored there through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

## Read data

In [None]:
import pandas as pd

In [None]:
# Location of the CSV dataset below the Drive mount point.
path = '/content/drive/MyDrive/Colab_Notebooks/dataset/data.csv'

# Load the file into the DataFrame that the following cells call `data`.
data = pd.read_csv(filepath_or_buffer=path)

## EDA

### Separate the variables by type (object and numeric)

In [None]:
# Boolean mask over the columns: True where the column has dtype `object`.
is_object_column = data.dtypes == 'object'

# Object-typed columns are treated as categorical variables.
categorical = is_object_column[is_object_column].index.tolist()
print(f'There are {len(categorical)} categorical variables\n')
print('The categorical variables are: ', categorical)

# Every remaining column is treated as numerical.
numerical = is_object_column[~is_object_column].index.tolist()

### Display the frequency of variables

In [None]:
# Print, column by column, how often each distinct value occurs.
for var, column in data.items():
    print(column.value_counts())

### Check the missing values

In [None]:
# Number of missing values in every column.
# A notebook cell only renders its *last* expression, so the first summary
# has to be shown explicitly or it is silently discarded.
display(data.isnull().sum())

# Number of missing values in the categorical columns only
# (last expression of the cell, rendered automatically).
data[categorical].isnull().sum()

### Check the percentage of missing values in each variable

In [None]:
# Output the percentage of missing values in the numerical variables of the
# training set.
# NOTE: X_train is created by the "Split dataset" cell - run that cell first.

# `numerical` was derived from `data`, which may still contain columns (for
# example the target) that are not part of X_train; keep only the ones present.
numerical_in_train = [var for var in numerical if var in X_train.columns]

# Fraction of missing entries per column, computed once for all columns.
missing_share = X_train[numerical_in_train].isnull().mean()

for var, share in missing_share[missing_share > 0].items():
    # Round *after* scaling to percent; rounding first and multiplying by 100
    # afterwards reintroduces floating-point noise (e.g. 0.06999999999999999).
    print(var, round(share * 100, 2), "%")

### Check the Cardinality

In [None]:
# Report how many distinct labels each categorical variable contains.
# dropna=False keeps a missing value counted as its own label.
for var, column in data[categorical].items():
    print(var, " contains ", column.nunique(dropna=False), " different labels")

### Split dataset

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target (y) from the feature matrix (X).
y = data['target_var']
X = data.drop(columns=['target_var'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print("The Data Size: ", X_train.shape, X_test.shape)

## Data Engineering

### Parse the Datetime format data

In [None]:
# Parse the raw 'Date' column once, then split it into calendar components.
parsed_dates = pd.to_datetime(data['Date'])

date_parts = (
    ('Year', parsed_dates.dt.year),
    ('Month', parsed_dates.dt.month),
    ('Day', parsed_dates.dt.day),
)
for part_name, part_values in date_parts:
    data[part_name] = part_values

# The original column is no longer needed once its parts are extracted.
del data['Date']

### Impute missing values in categorical variables with the most frequent value

In [None]:
# Categorical columns whose missing entries are replaced by the most frequent
# value.
mode_imputed_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

# The fill values are taken from the training set only and then applied to
# both the training and the test set.
train_modes = {col: X_train[col].mode()[0] for col in mode_imputed_cols}

# Fill on the DataFrame itself and rebind the result.  The chained form
# `df[col].fillna(..., inplace=True)` operates on a temporary Series: it is
# deprecated and leaves the frame untouched under pandas copy-on-write.
X_train = X_train.fillna(value=train_modes)
X_test = X_test.fillna(value=train_modes)

### Drop columns with missing values

In [None]:
# Simplest strategy, but potentially the most harmful one: every column that
# contains at least one missing value is discarded entirely.
has_missing = data.isnull().any()
cols_with_missing = has_missing[has_missing].index.tolist()

# Drop exactly once.  A second drop of the same labels raises KeyError because
# the columns are already gone.
data = data.drop(columns=cols_with_missing)

### Process categorical variables

#### Ordinal encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# NOTE(review): `X_valid` and `object_cols` are not defined in the cells shown
# in this notebook - presumably a validation split and the list of
# object-typed columns; confirm before running.

# Learn the category -> integer mapping from the training data only.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train[object_cols])

# Work on copies so the original frames stay untouched.
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Replace the categorical columns with their encoded counterparts.
label_X_train[object_cols] = ordinal_encoder.transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

#### One-hot encoding (nominal variables)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): `X_valid` and `object_cols` are not defined in the cells shown
# in this notebook - presumably a validation split and the list of
# object-typed columns; confirm before running.

# Build a dense-output encoder.  scikit-learn 1.2 renamed `sparse` to
# `sparse_output` and later removed the old keyword, so try the new name first
# and fall back for older installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Encode the categorical columns.  The encoder returns a plain array, so the
# original row index is re-attached while building the DataFrame.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]), index=X_train.index
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[object_cols]), index=X_valid.index
)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot columns are labelled 0, 1, 2, ... (integers) while the remaining
# columns usually carry string labels.  Recent scikit-learn estimators reject
# frames with mixed label types, so make every column label a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

In [None]:
# Alternative: one-hot encode a single column with pandas get_dummies().

OH_cols_train = pd.get_dummies(X_train['X1'], prefix='Bla')
OH_cols_test = pd.get_dummies(X_test['X1'], prefix='Bla')

# get_dummies() only creates columns for the categories it sees, so the train
# and test frames can end up with different columns.  Align the test dummies
# to the training layout: categories missing from the test set become
# all-zero columns, categories unseen during training are dropped.
OH_cols_test = (
    OH_cols_test
    .reindex(columns=OH_cols_train.columns, fill_value=0)
    .astype(OH_cols_train.dtypes.to_dict())  # keep the dummy dtype (bool/uint8)
)

# Replace the original column with its one-hot encoded version.
X_train = pd.concat([X_train.drop(columns='X1'), OH_cols_train], axis=1)
X_test = pd.concat([X_test.drop(columns='X1'), OH_cols_test], axis=1)