In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error, max_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import pickle

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
!git clone https://github.com/Xbeck/AvioKompaniya.git

Cloning into 'AvioKompaniya'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 9 (delta 1), reused 8 (delta 0), pack-reused 0[K
Unpacking objects: 100% (9/9), done.


In [3]:
train_data = pd.read_csv("/content/AvioKompaniya/DataSet/train_dataset.csv")
test_data = pd.read_csv("/content/AvioKompaniya/DataSet/test_dataset.csv")
sample_submission = pd.read_csv("/content/AvioKompaniya/DataSet/sample_submission.csv")

In [4]:
train_data['satisfaction'].value_counts()

0    5000
1    5000
Name: satisfaction, dtype: int64

In [5]:
train_data.shape, test_data.shape, sample_submission.shape

((10000, 24), (4000, 23), (4000, 2))

In [6]:
train_data.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,Male,disloyal Customer,33,Business travel,Eco,571,2,3,2,...,4,3,1,3,4,3,4,10,3.0,0
1,2,Female,Loyal Customer,49,Business travel,Business,1431,4,1,4,...,5,5,5,5,3,5,3,0,0.0,1
2,3,Female,Loyal Customer,43,Business travel,Eco,867,1,4,4,...,1,1,1,1,1,1,2,0,18.0,0
3,4,Female,Loyal Customer,27,Business travel,Business,1550,3,3,3,...,2,4,4,5,5,4,2,0,0.0,1
4,5,Male,Loyal Customer,11,Personal Travel,Eco,526,3,4,3,...,4,5,2,5,3,5,4,0,10.0,0


In [7]:
test_data.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,1,Female,Loyal Customer,25,Personal Travel,Eco,2704,2,2,2,...,5,2,4,4,3,5,1,5,152,191.0
1,2,Female,Loyal Customer,49,Business travel,Business,1623,1,1,1,...,4,2,2,2,2,4,2,3,60,52.0
2,3,Male,Loyal Customer,51,Business travel,Business,338,4,4,4,...,5,5,5,5,5,4,5,4,0,0.0
3,4,Male,Loyal Customer,34,Business travel,Eco Plus,95,0,0,0,...,1,5,4,4,1,3,3,5,0,0.0
4,5,Male,Loyal Customer,34,Personal Travel,Eco,602,4,4,4,...,5,5,4,5,4,5,5,5,130,140.0


In [8]:
sample_submission.head()

Unnamed: 0,id,satisfaction
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [9]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 10000 non-null  int64  
 1   Gender                             10000 non-null  object 
 2   Customer Type                      10000 non-null  object 
 3   Age                                10000 non-null  int64  
 4   Type of Travel                     10000 non-null  object 
 5   Class                              10000 non-null  object 
 6   Flight Distance                    10000 non-null  int64  
 7   Inflight wifi service              10000 non-null  int64  
 8   Departure/Arrival time convenient  10000 non-null  int64  
 9   Ease of Online booking             10000 non-null  int64  
 10  Gate location                      10000 non-null  int64  
 11  Food and drink                     10000 non-null  int6

In [10]:
# Gender ustuni qiymatlarini raqamga aylantiris
dict = {'Female': 0, 'Male': 1}
train_data['Gender'] = train_data['Gender'].map(dict)
test_data['Gender'] = test_data['Gender'].map(dict)

# Customer Type ustuni uchun
dict = {'disloyal Customer': 0, 'Loyal Customer': 1}
train_data['Customer Type'] = train_data['Customer Type'].map(dict)
test_data['Customer Type'] = test_data['Customer Type'].map(dict)

# Type of Travel ustuni uchun
dict = {'Personal Travel': 0, 'Business travel': 1}
train_data['Type of Travel'] = train_data['Type of Travel'].map(dict)
test_data['Type of Travel'] = test_data['Type of Travel'].map(dict)

# Class ustuni uchun
dict = {'Eco Plus': -1, 'Eco': 0, 'Business': 1}
train_data['Class'] = train_data['Class'].map(dict)
test_data['Class'] = test_data['Class'].map(dict)

In [11]:
train_data.drop(columns=['id'], inplace=True)
test_data.drop(columns=['id'], inplace=True)
corr_matrix = train_data.corr().abs()
corr_matrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
Gender,1.0,0.02502,0.004407,0.021504,0.009974,0.017786,0.010111,0.016445,0.00758,0.017967,0.004991,0.045786,0.045933,0.009528,0.004277,0.019076,0.039071,0.01971,0.048099,0.008535,0.032548,0.027431,0.002601
Customer Type,0.02502,1.0,0.291192,0.280023,0.058353,0.226549,0.003499,0.196112,0.018328,0.000866,0.072036,0.195554,0.174279,0.126166,0.061039,0.071066,0.014837,0.032102,0.026231,0.101625,0.008944,0.006917,0.181592
Age,0.004407,0.291192,1.0,0.075133,0.141403,0.110823,0.020904,0.036071,0.026964,0.012077,0.021971,0.215613,0.169291,0.097421,0.073412,0.060124,0.040368,0.038761,0.036921,0.062409,0.015618,0.019086,0.144302
Type of Travel,0.021504,0.280023,0.075133,1.0,0.492668,0.276756,0.099728,0.243037,0.114107,0.035903,0.077951,0.241535,0.15068,0.186244,0.08064,0.153127,0.035709,0.006062,0.038847,0.110651,0.007013,0.007061,0.482266
Class,0.009974,0.058353,0.141403,0.492668,1.0,0.432971,0.008206,0.090536,0.07804,0.008855,0.081087,0.306592,0.215445,0.197936,0.221749,0.218564,0.187201,0.168781,0.163099,0.133407,0.007924,0.00321,0.46343
Flight Distance,0.017786,0.226549,0.110823,0.276756,0.432971,1.0,0.002341,0.02132,0.052807,0.017513,0.065939,0.213859,0.170203,0.150573,0.115167,0.140509,0.073161,0.066881,0.058939,0.100705,0.011425,0.002901,0.298703
Inflight wifi service,0.010111,0.003499,0.020904,0.099728,0.008206,0.002341,1.0,0.362355,0.730731,0.362702,0.119807,0.438034,0.10702,0.198351,0.133511,0.172526,0.123695,0.039923,0.113467,0.119484,0.019843,0.023086,0.277395
Departure/Arrival time convenient,0.016445,0.196112,0.036071,0.243037,0.090536,0.02132,0.362355,1.0,0.456498,0.479002,0.001982,0.046456,0.002102,0.007094,0.071368,0.027487,0.072969,0.078166,0.062814,0.001172,0.010846,0.008985,0.067638
Ease of Online booking,0.00758,0.018328,0.026964,0.114107,0.07804,0.052807,0.730731,0.456498,1.0,0.476405,0.017193,0.38822,0.016831,0.043574,0.053613,0.116333,0.044259,0.013875,0.032168,0.007301,0.001474,0.003715,0.164662
Gate location,0.017967,0.000866,0.012077,0.035903,0.008855,0.017513,0.362702,0.479002,0.476405,1.0,0.002645,0.00361,0.008359,0.001144,0.015083,0.008752,0.000493,0.037372,0.003065,0.01509,0.001907,0.000684,0.007169


#### Korelyatsiyasi 0.1 > dan kichiklarini tashlab yuboramiz

In [12]:
# Korelyatsiyasi 0.1 > dan kichiklarini tashlab yuboramiz
train_data.drop(['Gender', 'Departure/Arrival time convenient', 'Gate location', 'Departure Delay in Minutes', 'Arrival Delay in Minutes'], axis=1, inplace=True)
test_data.drop(['Gender', 'Departure/Arrival time convenient', 'Gate location', 'Departure Delay in Minutes', 'Arrival Delay in Minutes'], axis=1, inplace=True)

In [13]:
# data setni qismlarga bo'lamiz
train_set, test_set = train_test_split(train_data, test_size=0.1, random_state=12)

X_train = train_set.drop('satisfaction', axis=1)
y_train = train_set['satisfaction'].copy()

X_test = test_set.drop('satisfaction', axis=1)
y_test = test_set['satisfaction'].copy()

In [14]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((9000, 17), (9000,), (1000, 17), (1000,))

In [15]:
y_train.value_counts()

0    4514
1    4486
Name: satisfaction, dtype: int64

In [16]:
# # for loopi yordamida ustunlar qiymatini o va 1 oralig'iga tushirish
# columns_list = ['Customer Type', 'Age', 'Type of Travel', 'Class', 'Flight Distance', 'Inflight wifi service',
#                 'Ease of Online booking', 'Food and drink', 'Online boarding', 'Seat comfort', 'Inflight entertainment', 
#                 'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Inflight service', 'Cleanliness']


# for i in columns_list:
#   X_train[i] = X_train[[i]]/X_train[i].max()
#   X_test[i] = X_test[[i]]/X_test[i].max()
#   test_data[i] = test_data[[i]]/test_data[i].max()


# Raqamli ustunlar uchun
num_pipline = Pipeline([
      ('std_scaler', StandardScaler())
    ])

x_train_prepared = num_pipline.fit_transform(X_train)
x_test_prepared = num_pipline.fit_transform(X_test)
test_data_prepared = num_pipline.fit_transform(test_data)
# x_train_prepared = X_train
# x_test_prepared = X_test
x_train_prepared

array([[ 0.45007274,  1.11894034, -1.55420712, ...,  0.50688855,
         1.12185967,  1.28385824],
       [-2.22186306, -0.02013104,  0.64341489, ...,  0.50688855,
         0.26916674, -1.76327365],
       [ 0.45007274,  1.38695714,  0.64341489, ...,  0.50688855,
         0.26916674, -0.23970771],
       ...,
       [ 0.45007274,  0.11387736, -1.55420712, ..., -1.88451295,
         0.26916674, -1.76327365],
       [ 0.45007274,  0.78391935,  0.64341489, ...,  1.30402238,
         0.26916674, -0.23970771],
       [ 0.45007274,  1.85598653, -1.55420712, ...,  0.50688855,
         1.12185967, -1.76327365]])

In [17]:
# y_train = num_pipline.fit_transform(y_train.array.reshape(-1, 1))
# y_train

In [18]:
def try_model(model):
  # Training
  model = model.fit(x_train_prepared, y_train)

  # Test
  y_pred = model.predict(x_test_prepared)

  # Model natijasi y_pred bilan to'g'ri javob y_test ni solishtiramiz
  print("MAE = ", np.around(mean_absolute_error(y_pred, y_test), 3))
  print('RMSE = ', np.around(np.sqrt(mean_squared_error(y_test, y_pred)), 3))
  print("MAX = ", np.around(max_error(y_pred, y_test), 3)) 
  print("R2 = ", np.around(r2_score(y_pred, y_test), 3))    # modelni baholash, ball
  return model, y_pred

### LinearRegression

In [19]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()

# # Modelni o'qitish
# LR_model = lr_model.fit(x_prepared, y_train)
# y_pred = LR_model.predict(X_test)
LR_model, y_pred_lr = try_model(lr_model)

# modelni saqlab olish
with open('LR_model.pkl','wb') as f:
    pickle.dump(LR_model, f)

MAE =  0.247
RMSE =  0.324
MAX =  1.21
R2 =  0.244


### LogisticRegression

In [20]:
from sklearn.linear_model import LogisticRegression
logr_model = LogisticRegression(
                                  penalty='l2',
                                  tol=0.001, 
                                  intercept_scaling=1,
                                  fit_intercept=True,
                                  max_iter=100
                                )

# modelni o'qitish
LogR_model, y_pred_logr = try_model(logr_model)

# modelni saqlab olish
with open('LogR_model.pkl','wb') as f:
    pickle.dump(LogR_model, f)

MAE =  0.122
RMSE =  0.349
MAX =  1
R2 =  0.512


### DecisionTreeRegressor

In [21]:
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor(
                                  criterion='squared_error',
                                  splitter='best', 
                                  max_depth=200,
                                  random_state=12, 
                                  max_leaf_nodes=10000
                                 )

# modelni o'qitish
DT_model, y_pred_dt = try_model(dt_model)

# modelni saqlab olish
with open('DT_model.pkl','wb') as f:
    pickle.dump(DT_model, f)

MAE =  0.068
RMSE =  0.261
MAX =  1.0
R2 =  0.728


### RandomForestRegressor

In [22]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
                                  n_estimators=10000, 
                                  max_depth=20
                                )

# # modelni o'qitish
RF_model, y_pred_rt = try_model(rf_model)

# modelni saqlab olish
with open('RF_model.pkl','wb') as f:
    pickle.dump(RF_model, f)

MAE =  0.081
RMSE =  0.194
MAX =  0.997
R2 =  0.818


### MLPRegressor

In [23]:
from sklearn.neural_network import MLPRegressor

mlp_model = MLPRegressor(
                        hidden_layer_sizes=(600, 200), 
                        activation='relu',
                        solver='adam', 
                        alpha=0.001,
                        learning_rate_init=0.001,
                        max_iter=2000000, 
                        shuffle=True,
                        random_state=12,
                        max_fun=5000000
                      )

# # modelni o'qitish
MLP_model, y_pred_mlp = try_model(mlp_model)

# modelni saqlab olish
with open('MLP_model.pkl','wb') as f:
    pickle.dump(MLP_model, f)

MAE =  0.108
RMSE =  0.211
MAX =  1.254
R2 =  0.813


### KNeighborsClassifier

In [24]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(
                                 n_neighbors=9, 
                                 weights='distance', 
                                 algorithm='auto'
                                )

# # modelni o'qitish
KNN_model, y_pred_knn = try_model(knn_model)

# modelni saqlab olish
with open('KNN_model.pkl','wb') as f:
    pickle.dump(KNN_model, f)

MAE =  0.064
RMSE =  0.253
MAX =  1
R2 =  0.744


### RandomForestClassifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

rfc_model = RandomForestClassifier( 
                                    n_estimators=500,   # o'mondagi daraxtlar soni
                                    criterion='entropy',   # gini, entropy, log_loss
                                    max_depth=19,     # daraxtning maksimal chuqurligi
                                    min_samples_split=2,    # ichki tugunni ajratish uchun zarur bo'lgan minimal namunalar soni
                                    min_samples_leaf=1,       # barg tugunida bo'lishi kerak bo'lgan minimal namunalar soni
                                    max_features='log2',    # sqrt, log2, None   --  eng yaxshi bo'linishni qidirishda e'tiborga olinadigan funksiyalar
                                    max_leaf_nodes=2000,
                                    n_jobs=3   # parallel bajariladigan ishlar soni
                                   )

# # modelni o'qitish
RFC_model, y_pred_rfc = try_model(rfc_model)

# modelni saqlab olish
with open('RFC_model.pkl','wb') as f:
    pickle.dump(RFC_model, f)




MAE =  0.048
RMSE =  0.219
MAX =  1
R2 =  0.808


In [26]:
y_pred_rfc = RFC_model.predict(test_data_prepared)

sample_submission['satisfaction'] = y_pred_rfc
sample_submission

Unnamed: 0,id,satisfaction
0,1,0
1,2,1
2,3,1
3,4,1
4,5,0
...,...,...
3995,3996,1
3996,3997,0
3997,3998,0
3998,3999,1


In [27]:
sample_submission.to_csv("sample_submission.csv", index=False)

### Korelyatsiyasi 0.3 > dan kichiklarini tashlab yuboramiz va Modellarni qayta ishlatib ko'ramiz

In [28]:
# # Korelyatsiyasi 0.3 > dan kichiklarini tashlab yuboramiz
# train_data.drop(['Gender', 'Customer Type', 'Flight Distance', 'Inflight wifi service', 'Departure/Arrival time convenient',
#                  'Ease of Online booking', 'Gate location', 'Food and drink', 'Baggage handling', 'Checkin service', 'Inflight service',
#                  'Departure Delay in Minutes', 'Arrival Delay in Minutes'], axis=1, inplace=True)
# test_data.drop(['Gender', 'Customer Type', 'Flight Distance', 'Inflight wifi service', 'Departure/Arrival time convenient',
#                 'Ease of Online booking', 'Gate location', 'Food and drink', 'Baggage handling', 'Checkin service', 'Inflight service',
#                 'Departure Delay in Minutes', 'Arrival Delay in Minutes'], axis=1, inplace=True)
