In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import yaml
import ast

In [14]:
# Execute the project's helper scripts so that the names they define
# (e.g. Database, read_yaml and the dp_* / vs_* helpers used below) become
# available in this notebook's namespace.
%run ./src/database.py
%run ./src/util.py
%run ./src/data_preprocessing.py
%run ./src/feature_engineering.py
%run ./src/visualisation.py

In [15]:
# Path of the YAML configuration file that drives this notebook.
YAML_FILEPATHNAME = "./config.yaml"
# Positions of the pre-cruise and post-cruise entries inside the config's
# 'databases' sequence.
PRE_CRUISE_DB = 0
POST_CRUISE_DB = 1

In [16]:
# Read YAML file
# read_yaml is not defined in this notebook; presumably it comes from one of
# the %run helper scripts and returns the parsed config as a dict -- TODO confirm.
yaml_data = read_yaml(YAML_FILEPATHNAME)

# Preprocessing

In [17]:
# Pull the run-wide settings out of the parsed config in one pass.
DATA_PATH, TEST_SIZE, RANDOM_STATE, TARGET_VARIABLE = (
    yaml_data[setting]
    for setting in ('data_path', 'test_size', 'random_state', 'target_variable')
)

In [18]:
# Read source data (Pre_cruise, Post_cruise) with removal of duplicates
ds_pre_cruise = Database(DATA_PATH)
df_pre_cruise = ds_pre_cruise.db_read(yaml_data['databases'][PRE_CRUISE_DB])

ds_post_cruise = Database(DATA_PATH)
df_post_cruise = ds_post_cruise.db_read(yaml_data['databases'][POST_CRUISE_DB])

In [19]:
# Merge Pre_cruise and Post_cruise to form df_cruise with Index as the key
df_cruise = merge_dataframe (df_pre_cruise, df_post_cruise)

In [20]:
# Break every configured composite column into its component columns to make
# the data easier to interpret. An empty (falsy) config entry means no work.
for split_spec in yaml_data['composite_fields_to_split'] or []:
    new_columns = ast.literal_eval(split_spec['new_column_list'])
    dp_split_column(df_cruise, split_spec['composite_field'], new_columns, split_spec['delimiter'])

In [21]:
# Drop identifier columns: in general they carry no predictive power.
# The column names are stored in the config as the text of a Python list.
id_columns = ast.literal_eval(yaml_data['ID_columns'])
dp_remove_id_columns(df_cruise, id_columns)

In [25]:
# Convert the configured object (string) columns to datetime, using the format
# supplied for each group of columns. An empty config entry means no work.
for datetime_spec in yaml_data['convert_object_to_datetime'] or []:
    datetime_columns = ast.literal_eval(datetime_spec['column_list'])
    dp_convert_object_to_datetime(df_cruise, datetime_columns, datetime_spec['format'])

In [None]:
# Removal of columns with high missing values
dp_remove_columns_with_high_missing(df_cruise, 0.4)

In [None]:
# Removal of rows from Continuous variable which has missing value 
remove_missing_value(df_cruise,['Date of Birth', TARGET_VARIABLE, 'Distance','Logging'])

In [None]:
dp_replace_value(df_cruise, ["Gender"], 'F', 'Female', True)
dp_replace_value(df_cruise, ["Gender"], 'm', 'Male', True)

In [None]:
numeric_ordinal_list =['Embarkation/Disembarkation time convenient', 'Ease of Online booking', 'Gate location', 
    'Online Check-in', 'Cabin Comfort', 'Cabin service', 'Baggage handling', 'Port Check-in Service', 
    'Onboard Service', 'Cleanliness']
dp_replace_value(df_cruise, numeric_ordinal_list, 0, None)

In [None]:
dp_replace_value(df_cruise, ["Cruise Name"], 'L', 'Lapras', True)
dp_replace_value(df_cruise, ["Cruise Name"], 'b', 'Blastoise', True)

In [None]:
# Columns rated on a worded importance scale.
categorical_ordinal_list = ['Onboard Wifi Service', 'Onboard Dining Service', 'Onboard Entertainment']
importance_levels = ['Not at all important', 'A little important',
                     'Somewhat important', 'Very important',
                     'Extremely important']

# Accepted values for each group of columns, applied in this order.
# What happens to values outside the list is decided by dp_restrict_val.
allowed_values = [
    (numeric_ordinal_list, [1, 2, 3, 4, 5]),
    (["Gender"], ['Female', 'Male']),
    (categorical_ordinal_list, importance_levels),
    (["Cruise Name"], ['Lapras', 'Blastoise']),
    (["Ticket Type"], ['Deluxe', 'Luxury', 'Standard']),
    (["Source"], ['Indirect', 'Direct']),
    # Fixed: this was [0.0, 0.1], which looks like a typo for a binary flag and
    # would have treated every 1 as an invalid value.
    # NOTE(review): confirm against the raw data that Dining is coded 0/1.
    (["Dining"], [0.0, 1.0]),
    (["Traffic"], ['Search Engine', 'Company Website', 'Email Marketing', 'Social Media']),
    (["Dist_Metrics"], ['KM', 'Miles']),
]
for columns, permitted in allowed_values:
    dp_restrict_val(df_cruise, columns, permitted)

In [None]:
# Label encode non-numeric categorical columns to get numeric values
list_non_numeric_col = ['Gender','Onboard Wifi Service','Onboard Dining Service','Onboard Entertainment','Cruise Name',
                            'Ticket Type','Source','Traffic','Dist_Metrics']
label_encoder(df_cruise,list_non_numeric_col)

In [None]:
# Derive year from date column to enhance model efficiency, mitigate noise
convert_datetime_to_year(df_cruise, ['Date of Birth','Logging'],['Year of Birth','Year of Logging'])

# Feature Engineering

In [None]:
# Re-execute the visualisation helpers (they were already run near the top of
# the notebook) so the plotting call below uses the current version of the file.
%run ./src/visualisation.py
# Plot the correlation chart for the preprocessed frame.
vs_plot_corr_chart(df_cruise)

In [None]:
# Summary of columns, dtypes and non-null counts at this stage.
df_cruise.info()

In [None]:
# Standardise distance by converting Mile to KM
convert_miles_to_KM(df_cruise, "Distance")
df_cruise["Distance"] = abs(df_cruise["Distance"])

In [None]:
# Derive Age from Year of Birth and Year of Logging
calc_year_diff(df_cruise, 'Year of Logging', 'Year of Birth', 'Age')

In [None]:
df_cruise.info()

In [None]:
%run ./src/feature_engineering.py
#### Denote Missing column
denote_missing_col(df_cruise)

In [None]:
%run ./src/visualisation.py

missing_list = ['tot_missing_col']

df_cruise[missing_list]

vs_countplot_both(df_cruise, 'tot_missing_col')

In [None]:
df_cruise['tot_missing_col'].describe()

#### Gender

In [None]:
# impute_missing_value(df_cruise,  impute_type="random", col_list=['Gender'], none_val=2)

#### Embarkation/Disembarkation time convenient

In [None]:
# vs_countplot_both(df_cruise, 'Embarkation/Disembarkation time convenient')

In [None]:
# Summary of columns, dtypes and non-null counts before imputation.
df_cruise.info()

In [None]:
# Fill the remaining missing values using the "mean" strategy.
# NOTE(review): no col_list is passed here (the commented-out Gender call above
# shows the helper accepts one), so this presumably applies to every column
# with gaps, including the label-encoded categoricals -- confirm that is intended.
# NOTE(review): this runs before the train/test split further down, so the
# imputed values are presumably computed from the full data set -- confirm.
impute_missing_value(df_cruise, impute_type="mean")

In [None]:
# Re-plot the correlation chart now that missing values have been filled.
vs_plot_corr_chart(df_cruise)

# Model Building

In [None]:
df_cruise.info()

In [None]:
X = df_cruise.drop([TARGET_VARIABLE],axis=1)
y = df_cruise[TARGET_VARIABLE]

In [None]:
y = pd.DataFrame(y)
vs_pieplot(y, TARGET_VARIABLE)

In [None]:
# Perform SMOTE
X, y = fe_SMOTE(X, y, RANDOM_STATE)

In [None]:
y = pd.DataFrame(y)
vs_pieplot(y, TARGET_VARIABLE)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE , random_state=RANDOM_STATE)
y_train = y_train.values.ravel()

In [None]:
# Fit the logistic-regression model on the training split (fit returns the
# estimator itself), then predict on both splits for the evaluation below.
log_regression = LogisticRegression(max_iter=10000, random_state=RANDOM_STATE).fit(X_train, y_train)
y_train_pred_log, y_test_pred_log = (
    log_regression.predict(features) for features in (X_train, X_test)
)

In [None]:
# random_forest = RandomForestClassifier()
# random_forest.fit(X_train, y_train)
# y_train_pred_rf = random_forest.predict(X_train)
# y_test_pred_rf = random_forest.predict(X_test)

# Model Evaluation

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1 for the logistic-regression model.
print("Classification report - Train")
print(classification_report(y_train, y_train_pred_log))
print("Classification report - Test")
print(classification_report(y_test, y_test_pred_log))
print(" ")
# Fixed: this heading previously read "Classification report - Test" although
# the matrix printed under it is the training confusion matrix.
print("Confusion Matrix - Train")
print(confusion_matrix(y_train, y_train_pred_log))
print("Confusion Matrix - Test")
print(confusion_matrix(y_test, y_test_pred_log))
print(" ")
# Overall accuracy on each split, to four decimal places.
print("Train Accuracy: ",format(metrics.accuracy_score(y_train, y_train_pred_log), '.4f'))
print("Test Accuracy:",format(metrics.accuracy_score(y_test, y_test_pred_log), '.4f'))

print("")


In [None]:
# from sklearn import metrics
# print("Classification report - Train")
# print(classification_report(y_train, y_train_pred_rf))
# print("Classification report - Test")
# print(classification_report(y_test, y_test_pred_rf))
# print(" ")
# print("Classification report - Test")
# print(confusion_matrix(y_train, y_train_pred_rf))
# print("Confusion Matrix - Test")
# print(confusion_matrix(y_test, y_test_pred_rf))
# print(" ")
# print("Train Accuracy: ",format(metrics.accuracy_score(y_train, y_train_pred_rf), '.4f'))
# # print("Train Precision: ",format(metrics.precision_score(y_train, y_train_pred_log, average='micro'), '.4f'))
# # print("Train Recall:",format(metrics.recall_score(y_train, y_train_pred_log,average='micro'), '.4f'))
# # print(" ")
# print("Test Accuracy:",format(metrics.accuracy_score(y_test, y_test_pred_rf), '.4f'))
# # print("Test Precision:",format(metrics.precision_score(y_test, y_test_pred_log,average='micro'), '.4f'))
# # print("Test Recall:",format(metrics.recall_score(y_test, y_test_pred_log,average='micro'), '.4f'))

# print("")
