In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
df = pd.read_csv("cust_satisfaction.csv")
df.head()

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,satisfaction,Age,Flight Distance,Inflight entertainment,Baggage handling,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,Male,Loyal Customer,Personal Travel,Eco Plus,neutral or dissatisfied,13,460,5,4,5,25,18.0
1,Male,disloyal Customer,Business travel,Business,neutral or dissatisfied,25,235,1,3,1,1,6.0
2,Female,Loyal Customer,Business travel,Business,satisfied,26,1142,5,4,5,0,0.0
3,Female,Loyal Customer,Business travel,Business,neutral or dissatisfied,25,562,2,3,2,11,9.0
4,Male,Loyal Customer,Business travel,Business,satisfied,61,214,3,4,3,0,0.0


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Gender                      103904 non-null  object 
 1   Customer Type               103904 non-null  object 
 2   Type of Travel              103904 non-null  object 
 3   Class                       103904 non-null  object 
 4   satisfaction                103904 non-null  object 
 5   Age                         103904 non-null  int64  
 6   Flight Distance             103904 non-null  int64  
 7   Inflight entertainment      103904 non-null  int64  
 8   Baggage handling            103904 non-null  int64  
 9   Cleanliness                 103904 non-null  int64  
 10  Departure Delay in Minutes  103904 non-null  int64  
 11  Arrival Delay in Minutes    103594 non-null  float64
dtypes: float64(1), int64(6), object(5)
memory usage: 9.5+ MB


In [62]:
df.isnull().sum()
df.dropna(inplace=True)

In [63]:
df.duplicated().sum()
df.drop_duplicates(inplace=True)

In [64]:
df["Customer Type"].value_counts()

Customer Type
Loyal Customer       84517
disloyal Customer    18905
Name: count, dtype: int64

In [65]:
loyal_customer = df[df["Customer Type"] == "Loyal Customer"]
disloyal_customer = df[df["Customer Type"] == "disloyal Customer"]

In [66]:
loyal_customer = loyal_customer.sample(20000)
loyal_customer.shape

(20000, 12)

In [67]:
balance_df = pd.concat([loyal_customer, disloyal_customer], axis=0)
balance_df.shape

(38905, 12)

In [68]:
cat_col = balance_df.select_dtypes(include=["object"])
cat_col.head()

Unnamed: 0,Gender,Customer Type,Type of Travel,Class,satisfaction
17105,Male,Loyal Customer,Personal Travel,Eco,neutral or dissatisfied
100058,Male,Loyal Customer,Business travel,Business,satisfied
15707,Female,Loyal Customer,Personal Travel,Eco,satisfied
17364,Female,Loyal Customer,Business travel,Business,satisfied
40636,Female,Loyal Customer,Personal Travel,Eco,satisfied


In [69]:
num_col = balance_df.select_dtypes(exclude=["object"])
num_col.head()

Unnamed: 0,Age,Flight Distance,Inflight entertainment,Baggage handling,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
17105,65,483,4,4,4,0,0.0
100058,41,451,4,4,4,3,0.0
15707,20,628,1,4,1,16,4.0
17364,50,913,5,5,1,11,0.0
40636,19,317,2,4,2,0,0.0


In [70]:
pd.get_dummies(cat_col, drop_first=True).astype(int).head()

Unnamed: 0,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Eco,Class_Eco Plus,satisfaction_satisfied
17105,1,0,1,1,0,0
100058,1,0,0,0,0,1
15707,0,0,1,1,0,1
17364,0,0,0,0,0,1
40636,0,0,1,1,0,1


In [71]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
ohe = OneHotEncoder(drop="if_binary")
cat_col_encoded = ohe.fit_transform(cat_col).toarray()
cat_col_encoded

array([[1., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 1., 0., 1.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.]], shape=(38905, 7))

In [72]:
column_name = list(ohe.get_feature_names_out())
column_name

['Gender_Male',
 'Customer Type_disloyal Customer',
 'Type of Travel_Personal Travel',
 'Class_Business',
 'Class_Eco',
 'Class_Eco Plus',
 'satisfaction_satisfied']

In [73]:
one_hot = pd.DataFrame(cat_col_encoded, columns=column_name)
one_hot.head()

Unnamed: 0,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_satisfied
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [74]:
one_hot = one_hot.reset_index(drop=True)
num_col = num_col.reset_index(drop=True)
final_df = pd.concat([one_hot, num_col], axis=1)
final_df.head()

Unnamed: 0,Gender_Male,Customer Type_disloyal Customer,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_satisfied,Age,Flight Distance,Inflight entertainment,Baggage handling,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,65,483,4,4,4,0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,41,451,4,4,4,3,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,20,628,1,4,1,16,4.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,50,913,5,5,1,11,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,1.0,19,317,2,4,2,0,0.0


In [75]:
##x and y ---> train test split -->algo train
x= final_df.drop("satisfaction_satisfied",axis=1)
y = final_df["satisfaction_satisfied"]

In [76]:
from sklearn.model_selection import train_test_split
x_train , x_test, y_train , y_test = train_test_split(x,y,test_size=0.2 , random_state=42)


In [77]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train , y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [78]:
lr.score(x_train , y_train)


0.7641048708392237

In [79]:
lr.score(x_test,y_test)

0.7628839480786531

In [80]:
x_test =x_test.dropna()
y_test =y_test[x_test.index]

lr.score(x_test , y_test)

0.7628839480786531

In [81]:
###Precision ---> out of all predicted classes , how many were actually positive--> TP/TP+FP
#REcall(TPR)___> out of act positive, how many were correctly predict --> TP/TP+FN
## F1 Score ---> a balance b/w p&r , it focus on the +ve clas performance ---> imbalance data
##           F1 = 2 X( PXR/ P+R)

In [83]:
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_test,lr.predict(x_test)))

[[4122  885]
 [ 960 1814]]


In [84]:
print(classification_report(y_test,lr.predict(x_test)))

              precision    recall  f1-score   support

         0.0       0.81      0.82      0.82      5007
         1.0       0.67      0.65      0.66      2774

    accuracy                           0.76      7781
   macro avg       0.74      0.74      0.74      7781
weighted avg       0.76      0.76      0.76      7781

