In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import yaml
import ast

In [2]:
%run ./src/database.py
%run ./src/util.py
%run ./src/data_preprocessing.py
%run ./src/feature_engineering.py
%run ./src/visualisation.py
%run ./src/model_build.py

In [3]:
YAML_FILEPATHNAME = "./config.yaml"
PRE_CRUISE_DB = 0
POST_CRUISE_DB = 1
IS_NOTEBOOK = True

# Read yaml config and data

In [4]:
# Read YAML file
yaml_data = read_yaml(YAML_FILEPATHNAME)
DISPLAY_STUB = yaml_data['display_stub']
DATA_PATH = yaml_data['data_path']
TEST_SIZE = yaml_data['test_size']
RANDOM_STATE = yaml_data['random_state']
TARGET_VARIABLE = yaml_data['target_variable']
DB_INFO = yaml_data['databases']
COMPOSITE_FIELD_INFO = yaml_data['composite_fields_to_split']
ID_FIELDS = ast.literal_eval(yaml_data['ID_columns'])
DATETIME_FIELD_INFO = yaml_data['convert_obj_datetime']
NUMERIC_FIELD_INFO = yaml_data['convert_obj_numeric']
MISSING_VAL_THRESHOLD =  yaml_data['pct_missing_threshold']
CONTINUOUS_VARIABLE = ast.literal_eval(yaml_data['continuous_variables'])
DIRTY_DATA_INFO = yaml_data['dirty_data_setting']
VALID_DATA_INFO = yaml_data['valid_data_setting']
NON_NUMERIC_COL = yaml_data['non_numeric_cols']
DATE_YYYY_INFO = yaml_data['convert_date_yyyy']

In [5]:
# Read Pre_cruise data
df_pre_cruise = db_read(DATA_PATH, DB_INFO[PRE_CRUISE_DB])

In [6]:
# Read Post_cruise data
df_post_cruise = db_read(DATA_PATH, DB_INFO[POST_CRUISE_DB])

# Preprocessing

In [7]:
# Merge Pre_cruise and Post_cruise to form df_cruise with Index as the key
df_cruise = db_merge_db (df_pre_cruise, df_post_cruise)

In [8]:
dp = DataProcessing(df_cruise, DISPLAY_STUB)

In [9]:
# Replace all np.nan to None
dp.replace_nan_none()

In [10]:
# Split composite field
dp.split_composite_field(COMPOSITE_FIELD_INFO)

In [11]:
# Remove IDs columns as in general they will have no predictive power
dp.rm_id_cols(ID_FIELDS)

In [12]:
# Column pruning due to high missing values.
dp.rm_cols_high_missing(MISSING_VAL_THRESHOLD)

In [13]:
# Convert fields from Object to Datetime
dp.obj_to_datetime(DATETIME_FIELD_INFO)

In [14]:
# Convert fields from Object to Numeric 
dp.numeric_conversion(NUMERIC_FIELD_INFO)

In [15]:
# Remove of rows from Target Variable 
dp.rm_rows_target_var(TARGET_VARIABLE)

In [16]:
# Removal of rows from Continuous variable which has missing value 
dp.remove_missing(CONTINUOUS_VARIABLE)

In [17]:
# Dirty Data Cleansing
dp.dirty_data_processing(DIRTY_DATA_INFO)

In [18]:
# Restriction of Column vales
dp.valid_data_processing(VALID_DATA_INFO)

In [19]:
# Label encode non-numeric categorical columns to get numeric values
dp.label_encoder(NON_NUMERIC_COL)

In [20]:
# Derive year from date column to enhance model efficiency, mitigate noise
dp.yyyy_from_date(DATE_YYYY_INFO)

# Feature Engineering

In [21]:
fe = feature_engineering(dp.get_dataframe())

In [29]:
dp.get_dataframe()

Unnamed: 0_level_0,Gender,Onboard Wifi Service,Embarkation/Disembarkation time convenient,Ease of Online booking,Gate location,Onboard Dining Service,Online Check-in,Cabin Comfort,Onboard Entertainment,Cabin service,...,Onboard Service,Cleanliness,Cruise Name,Ticket Type,Dining,Source,Traffic,Distance,Dist_Metrics,Age
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0,5,4.0,4.0,4.0,4,4.0,4.0,2,3.0,...,2.0,4.0,1,0,0,0,0,450.61632,0,53
4,1,4,4.0,2.0,3.0,3,2.0,3.0,3,3.0,...,2.0,3.0,1,2,1,0,0,1842.69888,1,63
6,1,4,5.0,3.0,3.0,5,3.0,4.0,5,3.0,...,4.0,4.0,1,2,1,0,1,976.871808,0,37
10,1,0,3.0,3.0,1.0,1,2.0,3.0,1,3.0,...,1.0,5.0,1,2,0,0,1,955.950336,0,15
11,2,3,5.0,1.0,3.0,3,1.0,1.0,2,3.0,...,4.0,1.0,2,2,0,0,0,1290.693888,0,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133731,1,5,5.0,4.0,5.0,3,4.0,3.0,3,5.0,...,4.0,3.0,1,0,0,0,0,1506.345984,0,65
133734,0,0,1.0,1.0,1.0,3,4.0,4.0,2,2.0,...,2.0,2.0,2,1,1,0,1,3057.7536,0,44
133739,0,2,5.0,2.0,1.0,2,4.0,5.0,5,4.0,...,4.0,4.0,0,2,0,0,0,4372.587648,0,61
133743,1,1,5.0,5.0,5.0,1,5.0,5.0,1,4.0,...,4.0,5.0,0,1,0,0,1,3133.392768,0,11


In [22]:
# Standardise distance by converting Mile to KM
fe.convert_miles_to_KM("Distance")

In [23]:
# Derive Age from Year of Birth and Year of Logging
fe.calc_year_diff('Year of Logging', 'Year of Birth', 'Age')

In [24]:
# Compute number of column
# fe.denote_missing_col()

#### Gender

In [25]:
# fe.impute_missing_value(impute_type="random", col_list=['Gender'], none_val=2)

In [26]:
fe.impute_missing_value(impute_type="mean")

# Model Building

In [27]:
# lr = Logistic_Regression(fe.get_dataframe())
# X, y = lr.prepare_data(TARGET_VARIABLE)
# hyperparameter_dict = {
#     'random_state': RANDOM_STATE,
#     'max_iter': 10000
# }

# lr.model_processing(X, y, TEST_SIZE, RANDOM_STATE, hyperparameter_dict, IS_NOTEBOOK)

In [31]:
output_csv(DATA_PATH, fe.get_dataframe(), "df_temp")

In [28]:
dtc = Decision_Tree(fe.get_dataframe())
X, y = dtc.prepare_data(TARGET_VARIABLE)
hyperparameter_dict = {
}
dtc.model_processing(X, y, TEST_SIZE, RANDOM_STATE, hyperparameter_dict, IS_NOTEBOOK)

TypeError:        Gender  Onboard Wifi Service  \
0           1                     1   
1           1                     1   
2           1                     3   
3           2                     5   
4           2                     2   
...       ...                   ...   
94870       1                     0   
94871       1                     1   
94872       1                     0   
94873       0                     0   
94874       0                     3   

       Embarkation/Disembarkation time convenient  Ease of Online booking  \
0                                        3.000000                 5.00000   
1                                        3.000000                 5.00000   
2                                        1.000000                 1.00000   
3                                        3.000000                 4.00000   
4                                        4.000000                 2.00000   
...                                           ...                     ...   
94870                                    5.000000                 2.78026   
94871                                    1.218130                 1.21813   
94872                                    2.000000                 2.00000   
94873                                    4.780803                 2.00000   
94874                                    1.000000                 1.00000   

       Gate location  Onboard Dining Service  Online Check-in  Cabin Comfort  \
0           2.000000                       5         5.000000       3.000000   
1           1.000000                       0         5.000000       2.000000   
2           1.000000                       0         4.000000       3.000000   
3           3.000000                       0         4.000000       3.000000   
4           3.000000                       5         4.000000       4.000000   
...              ...                     ...              ...            ...   
94870       3.560519                       2         2.000000       1.878962   
94871       1.218130                       0         2.000000       2.000000   
94872       2.850503                       3         2.000000       1.000000   
94873       4.000000                       3         2.109599       1.000000   
94874       3.279792                       5         1.000000       4.000000   

       Onboard Entertainment  Cabin service  ...  Port Check-in Service  \
0                          5       3.000000  ...               4.000000   
1                          0       3.000000  ...               2.000000   
2                          2       4.000000  ...               5.000000   
3                          0       5.000000  ...               3.000000   
4                          2       2.000000  ...               4.000000   
...                      ...            ...  ...                    ...   
94870                      2       4.780260  ...               3.439481   
94871                      0       1.000000  ...               4.563740   
94872                      3       4.149497  ...               3.000000   
94873                      3       1.109599  ...               1.219197   
94874                      5       1.279792  ...               3.160623   

       Onboard Service  Cleanliness  Cruise Name  Dining  Source  Traffic  \
0             4.000000     4.000000            0       0       1        3   
1             2.000000     2.000000            0       0       1        3   
2             4.000000     3.000000            0       0       0        1   
3             4.000000     2.000000            1       1       0        0   
4             2.000000     5.000000            1       1       0        1   
...                ...          ...          ...     ...     ...      ...   
94870         4.780260     3.439481            1       0       0        1   
94871         2.109065     2.109065            0       0       0        0   
94872         4.000000     1.000000            0       0       1        3   
94873         3.219197     2.780803            1       0       0        0   
94874         2.000000     4.000000            0       0       1        2   

          Distance  Dist_Metrics  Age  
0      1813.730688             0   27  
1       984.918528             0   23  
2       593.847936             0   52  
3        450.61632             0   53  
4      1369.551744             0   61  
...            ...           ...  ...  
94870   2207.66633             0   45  
94871   951.122304             0   41  
94872  1824.755505             0   20  
94873   593.671554             0   31  
94874  2095.174104             0   29  

[94875 rows x 21 columns] is not an estimator instance.

In [None]:

# columns = X_train.columns
# os_data_X,os_data_y=os.fit_resample(X_train, y_train)
# os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
# os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])


# Model Evaluation