In [1]:
import ast

import numpy as np
import pandas as pd
import yaml
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Execute the project's helper scripts. %run leaves the names each script
# defines in the notebook namespace; the helper functions and classes called
# in the cells below are not imported anywhere else, so they presumably come
# from these files.
%run ./src/database.py
%run ./src/util.py
%run ./src/data_preprocessing.py
%run ./src/feature_engineering.py
%run ./src/visualisation.py

In [3]:
# Location of the YAML configuration file, relative to the notebook.
YAML_FILEPATHNAME = "./config.yaml"

# Indices into the `databases` entry of the configuration.
PRE_CRUISE_DB, POST_CRUISE_DB = 0, 1

# Read YAML config and data

In [4]:
# Load the YAML configuration once; every setting below is read from it.
yaml_data = read_yaml(YAML_FILEPATHNAME)

# --- General settings -------------------------------------------------
DISPLAY_STUB = yaml_data['display_stub']
DATA_PATH = yaml_data['data_path']
DB_INFO = yaml_data['databases']
TARGET_VARIABLE = yaml_data['target_variable']

# --- Train/test split -------------------------------------------------
TEST_SIZE = yaml_data['test_size']
RANDOM_STATE = yaml_data['random_state']

# --- Column lists stored as Python-literal strings --------------------
# ast.literal_eval parses the literal without executing arbitrary code.
ID_FIELDS = ast.literal_eval(yaml_data['ID_columns'])
CONTINUOUS_VARIABLE = ast.literal_eval(yaml_data['continuous_variables'])

# --- Pre-processing settings ------------------------------------------
COMPOSITE_FIELD_INFO = yaml_data['composite_fields_to_split']
DATETIME_FIELD_INFO = yaml_data['convert_obj_datetime']
NUMERIC_FIELD_INFO = yaml_data['convert_obj_numeric']
DATE_YYYY_INFO = yaml_data['convert_date_yyyy']
MISSING_VAL_THRESHOLD = yaml_data['pct_missing_threshold']
DIRTY_DATA_INFO = yaml_data['dirty_data_setting']
VALID_DATA_INFO = yaml_data['valid_data_setting']
NON_NUMERIC_COL = yaml_data['non_numeric_cols']

In [5]:
# Read the pre-cruise data, using the database entry at index PRE_CRUISE_DB
# of the config's `databases` setting.
df_pre_cruise = db_read(DATA_PATH, DB_INFO[PRE_CRUISE_DB])

In [6]:
# Read the post-cruise data, using the database entry at index POST_CRUISE_DB
# of the config's `databases` setting.
df_post_cruise = db_read(DATA_PATH, DB_INFO[POST_CRUISE_DB])

# Preprocessing

In [7]:
# Combine the pre-cruise and post-cruise frames into a single frame,
# df_cruise, joined on Index.
df_cruise = db_merge_db(df_pre_cruise, df_post_cruise)

In [8]:
# Wrap the merged frame in the project's DataProcessing helper; all
# preprocessing steps below are invoked on this object.
dp = DataProcessing(df_cruise, DISPLAY_STUB)

# Sanity check on one column. Series.info() prints its summary itself and
# returns None, so it is called directly: wrapping it in print() only adds a
# stray "None" line to the output.
dp.dataframe['Onboard Wifi Service'].info()

<class 'pandas.core.series.Series'>
Index: 130875 entries, 0 to 133745
Series name: Onboard Wifi Service
Non-Null Count   Dtype 
--------------   ----- 
111800 non-null  object
dtypes: object(1)
memory usage: 2.0+ MB
None


In [9]:
# Replace all np.nan values with None.
dp.replace_nan_none()

# Sanity check on one column. Series.info() prints its summary itself and
# returns None, so it is called directly rather than through print().
dp.dataframe['Onboard Wifi Service'].info()

<class 'pandas.core.series.Series'>
Index: 130875 entries, 0 to 133745
Series name: Onboard Wifi Service
Non-Null Count   Dtype 
--------------   ----- 
111800 non-null  object
dtypes: object(1)
memory usage: 2.0+ MB
None


In [10]:
# Split the composite fields listed under `composite_fields_to_split` in the
# config.
dp.split_composite_field(COMPOSITE_FIELD_INFO)

# Sanity check on one column. Series.info() prints its summary itself and
# returns None, so it is called directly rather than through print().
dp.dataframe['Onboard Wifi Service'].info()

<class 'pandas.core.series.Series'>
Index: 130875 entries, 0 to 133745
Series name: Onboard Wifi Service
Non-Null Count   Dtype 
--------------   ----- 
111800 non-null  object
dtypes: object(1)
memory usage: 2.0+ MB
None


In [11]:
# Remove the ID columns: identifiers generally carry no predictive power.
dp.rm_id_cols(ID_FIELDS)

# Sanity check on one column. Series.info() prints its summary itself and
# returns None, so it is called directly rather than through print().
dp.dataframe['Onboard Wifi Service'].info()

<class 'pandas.core.series.Series'>
Index: 130875 entries, 0 to 133745
Series name: Onboard Wifi Service
Non-Null Count   Dtype 
--------------   ----- 
111800 non-null  object
dtypes: object(1)
memory usage: 2.0+ MB
None


In [12]:
# Prune columns with a high share of missing values. The cut-off comes from
# `pct_missing_threshold` in the config (presumably a percentage; confirm in
# DataProcessing.rm_cols_high_missing).
dp.rm_cols_high_missing(MISSING_VAL_THRESHOLD)

In [13]:
# Convert the fields listed under `convert_obj_datetime` in the config from
# object dtype to datetime.
dp.obj_to_datetime(DATETIME_FIELD_INFO)

In [14]:
# Convert the fields listed under `convert_obj_numeric` in the config from
# object dtype to numeric.
dp.numeric_conversion(NUMERIC_FIELD_INFO)

In [16]:
# Remove rows based on the target variable (presumably the rows where the
# target is missing; confirm in DataProcessing.rm_rows_target_var).
dp.rm_rows_target_var(TARGET_VARIABLE)

In [17]:
# Remove rows that have missing values in the continuous variables listed
# under `continuous_variables` in the config.
dp.remove_missing(CONTINUOUS_VARIABLE)

In [18]:
# Cleanse dirty data according to `dirty_data_setting` in the config.
dp.dirty_data_processing(DIRTY_DATA_INFO)

In [19]:
# Restrict column values according to `valid_data_setting` in the config.
dp.valid_data_processing(VALID_DATA_INFO)

In [21]:
# Label-encode the non-numeric categorical columns (`non_numeric_cols` in the
# config) so that they hold numeric values.
dp.label_encoder(NON_NUMERIC_COL)

In [22]:
# Derive the year from the date columns configured under `convert_date_yyyy`;
# the intent is to enhance model efficiency and mitigate noise.
dp.yyyy_from_date(DATE_YYYY_INFO)

In [23]:
# Retrieve the processed frame from the DataProcessing helper.
# Note: this rebinds df_cruise, which until now referred to the frame produced
# by the merge step.
df_cruise = dp.get_dataframe()

# Feature Engineering

In [25]:
# Inspect dtypes and non-null counts after preprocessing.
# NOTE(review): the recorded output shows 20188 rows here versus 130875 in the
# merged frame. Confirm that this much row loss is intended.
df_cruise.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20188 entries, 24 to 133743
Data columns (total 23 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Gender                                      20188 non-null  int32  
 1   Onboard Wifi Service                        20188 non-null  int32  
 2   Embarkation/Disembarkation time convenient  20188 non-null  Int32  
 3   Ease of Online booking                      20188 non-null  Int32  
 4   Gate location                               20188 non-null  Int32  
 5   Onboard Dining Service                      20188 non-null  int32  
 6   Online Check-in                             20188 non-null  Int32  
 7   Cabin Comfort                               20188 non-null  Int32  
 8   Onboard Entertainment                       20188 non-null  int32  
 9   Cabin service                               20188 non-null  Int32  
 10  Baggage handl

In [None]:
# Preview the first rows of the preprocessed frame.
df_cruise.head()

In [26]:
# Bring every distance onto a single unit by converting miles to kilometres,
# then discard the sign so that all distances are non-negative.
convert_miles_to_KM(df_cruise, "Distance")
df_cruise["Distance"] = df_cruise["Distance"].abs()

In [None]:
# Derive the 'Age' column from 'Year of Logging' and 'Year of Birth'
# (presumably logging year minus birth year; confirm in calc_year_diff).
calc_year_diff(df_cruise, 'Year of Logging', 'Year of Birth', 'Age')

In [None]:
# Re-check dtypes and non-null counts after the distance and age steps.
df_cruise.info()

In [None]:
# Denote missing columns.
# feature_engineering.py is already executed by the %run cell at the top of
# the notebook, so it is not re-run here.
# Presumably this adds the 'tot_missing_col' column that the following cells
# read; confirm in denote_missing_col.
denote_missing_col(df_cruise)

In [None]:
# Count plot for the missing-value indicator column(s).
# visualisation.py is already executed by the %run cell at the top of the
# notebook, so it is not re-run here. The former bare `df_cruise[missing_list]`
# expression was removed: it was not the cell's last expression, so it never
# displayed anything.
missing_list = ['tot_missing_col']

for missing_col in missing_list:
    vs_countplot_both(df_cruise, missing_col)

In [None]:
# Summary statistics of the 'tot_missing_col' column.
df_cruise['tot_missing_col'].describe()

#### Gender

In [None]:
# Disabled experiment, kept for reference: random imputation of 'Gender'.
# NOTE(review): re-enable or delete before sharing the notebook.
# impute_missing_value(df_cruise,  impute_type="random", col_list=['Gender'], none_val=2)

#### Embarkation/Disembarkation time convenient

In [None]:
# Disabled: count plot for this column.
# NOTE(review): re-enable or delete before sharing the notebook.
# vs_countplot_both(df_cruise, 'Embarkation/Disembarkation time convenient')

In [None]:
# Check non-null counts before imputing missing values.
df_cruise.info()

In [None]:
# Impute missing values using the "mean" strategy.
# NOTE(review): no column list is passed, so this presumably applies to every
# column, including the label-encoded categorical ones. Confirm that a mean
# is appropriate for those.
impute_missing_value(df_cruise, impute_type="mean")

In [None]:
# Plot the correlation chart for the engineered frame.
vs_plot_corr_chart(df_cruise)

# Model Building

In [None]:
# Final check of the columns and dtypes that go into the model.
df_cruise.info()

In [None]:
# Separate the target column from the feature matrix.
y = df_cruise[TARGET_VARIABLE]
X = df_cruise.drop(columns=[TARGET_VARIABLE])

In [None]:
# Wrap the target in a DataFrame (vs_pieplot is called with a frame plus a
# column name) and plot the class distribution before any resampling.
y = pd.DataFrame(y)
vs_pieplot(y, TARGET_VARIABLE)

In [None]:
# Split BEFORE oversampling. Oversampling the full data set first would let
# synthetic samples derived from future test rows end up in the training set
# (data leakage) and would put synthetic rows into the test set, both of
# which inflate the test metrics. The test split must contain real
# observations only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Perform SMOTE on the training split only.
X_train, y_train = fe_SMOTE(X_train, y_train, RANDOM_STATE)

# Show the class balance of the (resampled) training target.
y_train = pd.DataFrame(y_train)
vs_pieplot(y_train, TARGET_VARIABLE)

# scikit-learn estimators expect a 1-D target array.
y_train = y_train.values.ravel()

In [None]:
# Fit a logistic-regression classifier on the training split, then predict on
# both splits so that train and test metrics can be compared.
# max_iter is raised well above scikit-learn's default of 100.
# NOTE(review): no feature scaling is visible before this fit. Confirm whether
# the features are scaled elsewhere.
log_regression = LogisticRegression(random_state=RANDOM_STATE, max_iter=10000)
log_regression.fit(X_train, y_train)
y_train_pred_log = log_regression.predict(X_train)
y_test_pred_log = log_regression.predict(X_test)

In [None]:
# Random-forest baseline, currently disabled. Re-enable it together with the
# commented-out random-forest evaluation cell further down.
# random_state is passed so that the run is reproducible once re-enabled.
# random_forest = RandomForestClassifier(random_state=RANDOM_STATE)
# random_forest.fit(X_train, y_train)
# y_train_pred_rf = random_forest.predict(X_train)
# y_test_pred_rf = random_forest.predict(X_test)

# Model Evaluation

In [None]:
# Evaluate the logistic-regression model on the train and test splits.
# `metrics` is imported in the import cell at the top of the notebook.
# Per-class precision and recall are part of the classification reports, so
# the formerly commented-out precision/recall prints were dropped.
print("Classification report - Train")
print(classification_report(y_train, y_train_pred_log))
print("Classification report - Test")
print(classification_report(y_test, y_test_pred_log))
print(" ")
# Label fixed: this matrix is computed on the training split. It was
# previously printed under the heading "Classification report - Test".
print("Confusion Matrix - Train")
print(confusion_matrix(y_train, y_train_pred_log))
print("Confusion Matrix - Test")
print(confusion_matrix(y_test, y_test_pred_log))
print(" ")
print("Train Accuracy: ", format(metrics.accuracy_score(y_train, y_train_pred_log), '.4f'))
print("Test Accuracy:", format(metrics.accuracy_score(y_test, y_test_pred_log), '.4f'))

print("")


In [None]:
# Random-forest evaluation, currently disabled. Re-enable it together with the
# commented-out random-forest training cell above.
# In the commented code below, the train confusion-matrix label and the
# prediction names in the precision/recall lines have been corrected.
# from sklearn import metrics
# print("Classification report - Train")
# print(classification_report(y_train, y_train_pred_rf))
# print("Classification report - Test")
# print(classification_report(y_test, y_test_pred_rf))
# print(" ")
# print("Confusion Matrix - Train")
# print(confusion_matrix(y_train, y_train_pred_rf))
# print("Confusion Matrix - Test")
# print(confusion_matrix(y_test, y_test_pred_rf))
# print(" ")
# print("Train Accuracy: ",format(metrics.accuracy_score(y_train, y_train_pred_rf), '.4f'))
# # print("Train Precision: ",format(metrics.precision_score(y_train, y_train_pred_rf, average='micro'), '.4f'))
# # print("Train Recall:",format(metrics.recall_score(y_train, y_train_pred_rf,average='micro'), '.4f'))
# # print(" ")
# print("Test Accuracy:",format(metrics.accuracy_score(y_test, y_test_pred_rf), '.4f'))
# # print("Test Precision:",format(metrics.precision_score(y_test, y_test_pred_rf,average='micro'), '.4f'))
# # print("Test Recall:",format(metrics.recall_score(y_test, y_test_pred_rf,average='micro'), '.4f'))

# print("")
