### This jupyter notebook is created for the purpose to check my pipeline.

### **Author : Umidjon Sattorov student at Mohirdev platform**

In [1]:
#Importing all necessary libraries and modules
#Data processing
import pandas as pd

#Feature engineering 
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#Modelling
from catboost import CatBoostClassifier, Pool

#Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report

#Saving machine learning model into pickle format
import dill
from datetime import datetime

#Dropping unnecessary columns from the dataset
def drop_unimportant(df : pd.DataFrame) -> pd.DataFrame : 
    left_cols = ['Customer Type', 'Age', 'Type of Travel', 'Flight Distance', 'Inflight wifi service', 'Ease of Online booking', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness', 'Class']
    df = df[left_cols]
    return df

#Encoding binary features
def encode_binary(df : pd.DataFrame) -> pd.DataFrame :
    df['Customer Type'] = df['Customer Type'].apply(lambda x : 1 if x == 'Loyal Customer' else 0)
    df['Type of Travel'] = df['Type of Travel'].apply(lambda x : 1 if x == 'Business travel' else 0)
    return df


In [2]:
print('Customer satisfaction predictor pipeline !')

Customer satisfaction predictor pipeline !


In [3]:
#Data loading
df = pd.read_csv('./data/imputed_train_dataset.csv', sep = ',')

# Preprocess the entire DataFrame first
X = df.drop(columns = 'satisfaction')
y = df[['satisfaction']]

In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 10000 non-null  int64  
 1   Gender                             10000 non-null  object 
 2   Customer Type                      10000 non-null  object 
 3   Age                                10000 non-null  int64  
 4   Type of Travel                     10000 non-null  object 
 5   Class                              10000 non-null  object 
 6   Flight Distance                    10000 non-null  int64  
 7   Inflight wifi service              10000 non-null  int64  
 8   Departure/Arrival time convenient  10000 non-null  int64  
 9   Ease of Online booking             10000 non-null  int64  
 10  Gate location                      10000 non-null  int64  
 11  Food and drink                     10000 non-null  int6

In [21]:
#Feature engineering
ohe_cols = ['Class']
std_scaler = ['Age', 'Flight Distance', 'Inflight wifi service', 'Ease of Online booking', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness']
remaining_cols = ['Customer Type', 'Type of Travel']

In [22]:
dropper_feature_changer = Pipeline(steps = [
    ('drop_cols', FunctionTransformer(drop_unimportant)),
    ('binary_encoding', FunctionTransformer(encode_binary))
])

In [23]:
numerical_transformer = Pipeline(steps = [
    ('scaler', StandardScaler())
])
ohe_transformation = Pipeline(steps = [
    ('ohe', OneHotEncoder(handle_unknown = 'ignore'))
])
remaining_transformation = Pipeline(steps = [
    ('remaining_features', FunctionTransformer(lambda x : x))
])

In [24]:
column_transformer = ColumnTransformer(transformers=[
    ('numerical', numerical_transformer, std_scaler),
    ('ohe_transformation', ohe_transformation, ohe_cols),
    ('remaining_features', remaining_transformation, remaining_cols)
])

preprocessor = Pipeline(steps = [
    ('feature_change', dropper_feature_changer),
    ('column_transformer', column_transformer)
])

In [34]:
cat_model = CatBoostClassifier(
       iterations = 1500,
       learning_rate = 0.01,
       depth = 10,
       eval_metric = 'AUC',
       random_seed = 1,
)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', cat_model)
])

In [35]:
#Fitting perfect pipeline for whole dataset
pipe.fit(X = X, y = y)

pred = pipe.predict(X = X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Customer Type'] = df['Customer Type'].apply(lambda x : 1 if x == 'Loyal Customer' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Type of Travel'] = df['Type of Travel'].apply(lambda x : 1 if x == 'Business travel' else 0)


0:	total: 18.7ms	remaining: 28s
1:	total: 37.1ms	remaining: 27.8s
2:	total: 55.5ms	remaining: 27.7s
3:	total: 73.9ms	remaining: 27.6s
4:	total: 91.3ms	remaining: 27.3s
5:	total: 98.6ms	remaining: 24.5s
6:	total: 117ms	remaining: 25s
7:	total: 136ms	remaining: 25.3s
8:	total: 154ms	remaining: 25.5s
9:	total: 175ms	remaining: 26.1s
10:	total: 193ms	remaining: 26.2s
11:	total: 212ms	remaining: 26.3s
12:	total: 222ms	remaining: 25.4s
13:	total: 241ms	remaining: 25.6s
14:	total: 261ms	remaining: 25.9s
15:	total: 281ms	remaining: 26s
16:	total: 298ms	remaining: 26s
17:	total: 317ms	remaining: 26.1s
18:	total: 334ms	remaining: 26s
19:	total: 352ms	remaining: 26.1s
20:	total: 371ms	remaining: 26.1s
21:	total: 390ms	remaining: 26.2s
22:	total: 408ms	remaining: 26.2s
23:	total: 427ms	remaining: 26.3s
24:	total: 448ms	remaining: 26.4s
25:	total: 471ms	remaining: 26.7s
26:	total: 495ms	remaining: 27s
27:	total: 517ms	remaining: 27.2s
28:	total: 541ms	remaining: 27.4s
29:	total: 564ms	remaining: 27

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Customer Type'] = df['Customer Type'].apply(lambda x : 1 if x == 'Loyal Customer' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Type of Travel'] = df['Type of Travel'].apply(lambda x : 1 if x == 'Business travel' else 0)


In [36]:
train_auc_cat = roc_auc_score(y, pred)
print(f"The ROC AUC score of CatBoostClassifier : {train_auc_cat}")

The ROC AUC score of CatBoostClassifier : 0.9981999999999999


In [37]:
pipe

In [39]:
model_filename = f'./models/customer_satisfaction.pkl'
dill.dump({'model' : pipe,
'metadata' :{
    'name' : 'Flight cost predictor',
    'author' : 'Umidjon Sattorov',
    'version' : 1,
    'date' : datetime.now(),
    'type' : type(pipe.named_steps['classifier']).__name__,
    'roc_auc score' : roc_auc_score(y_true = y, y_score = pred)
}
}, open('./models/customer_satisfaction.pkl', 'wb'))

print(f'Model is saved as {model_filename} in models directory')

Model is saved as ./models/customer_satisfaction.pkl in models directory
