# Подготовка

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Read the training CSV (comma-separated, pandas' default) from a relative path;
# presumably this is a mounted Google Drive folder in Colab - confirm before running elsewhere.
data = pd.read_csv("drive/MyDrive/data for colab/pass_satisfaction_train.csv")

# Знакомство с данными

In [None]:
# Preview the first five rows (five is the default for head()).
data.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,110028,0,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
1,24026,0,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
2,111157,0,Loyal Customer,26,Personal Travel,Eco,1180,3,4,2,...,1,3,4,4,4,4,1,0,0.0,neutral or dissatisfied
3,82113,1,Loyal Customer,47,Personal Travel,Eco,1276,2,4,2,...,2,3,3,4,3,5,2,9,23.0,neutral or dissatisfied
4,79485,0,Loyal Customer,41,Business travel,Business,853,1,2,2,...,1,1,2,1,4,1,2,0,0.0,neutral or dissatisfied


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(83123, 24)

In [None]:
# Per-column dtypes; `object` marks the text columns that are one-hot encoded further down.
data.dtypes

id                                     int64
Gender                                 int64
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int64
Departure Delay in Minutes             int64
Arrival De

# Обработка данных

## Лишние переменные

Избавимся от переменной `id`.

In [None]:
# Remove the `id` column before modelling.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the missing column.
data = data.drop(columns=["id"], errors="ignore")

## Пропуски

Проверим пропущенные значения.

In [None]:
# Count missing values per column once, then show only the columns that have any.
null_counts = data.isnull().sum()
null_counts[null_counts > 0]

Arrival Delay in Minutes    239
dtype: int64

In [None]:
# Inspect the only column with gaps: relative frequency of each value, then its median.
arrival_delay = data["Arrival Delay in Minutes"]
print(arrival_delay.value_counts(normalize=True))
print(arrival_delay.median())

0.0      0.561013
1.0      0.020981
2.0      0.020281
3.0      0.018785
4.0      0.018580
           ...   
624.0    0.000012
291.0    0.000012
729.0    0.000012
275.0    0.000012
369.0    0.000012
Name: Arrival Delay in Minutes, Length: 430, dtype: float64
0.0


In [None]:
# "Arrival Delay in Minutes" is the only column with missing values; it is dropped
# rather than imputed. Alternatives that were left disabled here: filling the gaps
# with the column median, or with the matching "Departure Delay in Minutes" value.
# Rebinding with errors="ignore" keeps the cell idempotent (re-running it used to
# raise KeyError because the column was already gone).
data = data.drop(columns=["Arrival Delay in Minutes"], errors="ignore")

Проверим, что не осталось пропусков.

In [None]:
# Re-run the missing-value check; an empty Series means no gaps remain.
remaining_nulls = data.isnull().sum()
remaining_nulls[remaining_nulls > 0]

Series([], dtype: int64)

## Нормализация

In [None]:
# Standardise every non-text column; the text (object) columns are re-attached in the next cell.
selected_data = data.select_dtypes(exclude=["object"])

# Keep the fitted scaler in a variable so the identical transformation can be
# re-applied to other data instead of being lost after this cell.
# NOTE: it is fitted on all rows, i.e. before the train/test split performed later,
# so the hold-out metrics computed in this notebook are slightly optimistic.
scaler = StandardScaler()
transformed_data = scaler.fit_transform(selected_data)

# Preserve the original row labels: the following concat aligns on the index, and a
# fresh RangeIndex would silently misalign rows if `data` had been filtered beforehand.
scaled_data = pd.DataFrame(
    transformed_data,
    columns=selected_data.columns,
    index=selected_data.index,
)
scaled_data.head(5)

Unnamed: 0,Gender,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes
0,-0.987316,-0.885557,-0.04915,-0.55139,-0.69788,-0.542036,-0.765968,1.350054,1.297217,1.183364,1.230108,0.477857,-0.266502,0.310561,0.551954,0.306068,1.304594,-0.390741
1,-0.987316,-0.951686,-0.630464,-0.55139,1.271003,1.604356,1.582278,-0.905605,-0.925655,-1.093352,-1.020432,-1.075501,1.254644,-0.537132,-1.819315,0.306068,-0.981551,-0.10155
2,-0.987316,-0.885557,-0.011064,0.202408,0.614709,-0.542036,-1.548716,-1.657491,-0.925655,-1.852258,-1.770612,-0.298822,0.494071,0.310561,0.551954,0.306068,-1.743599,-0.390741
3,1.012847,0.503145,0.085153,-0.55139,0.614709,-0.542036,0.016781,-0.905605,-0.925655,-1.093352,-1.020432,-0.298822,-0.266502,0.310561,-0.238469,1.157574,-0.981551,-0.15413
4,-0.987316,0.106373,-0.338805,-1.305189,-0.69788,-0.542036,-0.765968,0.598168,-0.184698,-0.334447,-1.770612,-1.852179,-1.027075,-2.232517,0.551954,-2.24845,-0.981551,-0.390741


In [None]:
# Put the standardised numeric columns and the untouched text columns side by side
# (column-wise concat, aligned on the row index).
text_columns = data.select_dtypes(include=["object"])
data = pd.concat([scaled_data, text_columns], axis=1)
data.head()

Unnamed: 0,Gender,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,...,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Customer Type,Type of Travel,Class,satisfaction
0,-0.987316,-0.885557,-0.04915,-0.55139,-0.69788,-0.542036,-0.765968,1.350054,1.297217,1.183364,...,-0.266502,0.310561,0.551954,0.306068,1.304594,-0.390741,Loyal Customer,Business travel,Business,satisfied
1,-0.987316,-0.951686,-0.630464,-0.55139,1.271003,1.604356,1.582278,-0.905605,-0.925655,-1.093352,...,1.254644,-0.537132,-1.819315,0.306068,-0.981551,-0.10155,Loyal Customer,Business travel,Business,neutral or dissatisfied
2,-0.987316,-0.885557,-0.011064,0.202408,0.614709,-0.542036,-1.548716,-1.657491,-0.925655,-1.852258,...,0.494071,0.310561,0.551954,0.306068,-1.743599,-0.390741,Loyal Customer,Personal Travel,Eco,neutral or dissatisfied
3,1.012847,0.503145,0.085153,-0.55139,0.614709,-0.542036,0.016781,-0.905605,-0.925655,-1.093352,...,-0.266502,0.310561,-0.238469,1.157574,-0.981551,-0.15413,Loyal Customer,Personal Travel,Eco,neutral or dissatisfied
4,-0.987316,0.106373,-0.338805,-1.305189,-0.69788,-0.542036,-0.765968,0.598168,-0.184698,-0.334447,...,-1.027075,-2.232517,0.551954,-2.24845,-0.981551,-0.390741,Loyal Customer,Business travel,Business,neutral or dissatisfied


## Категориальные переменные

Закодируем категориальные переменные.

In [None]:
# Preview the text columns that still have to be encoded.
data.select_dtypes(include="object").head()

Unnamed: 0,Customer Type,Type of Travel,Class,satisfaction
0,Loyal Customer,Business travel,Business,satisfied
1,Loyal Customer,Business travel,Business,neutral or dissatisfied
2,Loyal Customer,Personal Travel,Eco,neutral or dissatisfied
3,Loyal Customer,Personal Travel,Eco,neutral or dissatisfied
4,Loyal Customer,Business travel,Business,neutral or dissatisfied


In [None]:
# One-hot encode the listed text columns. The target `satisfaction` is included,
# so it also becomes two indicator columns; one of them is used as the label later.
categories = ["Customer Type", "Type of Travel", "Class", "satisfaction"]
data = pd.get_dummies(data=data, columns=categories)

data.head()

Unnamed: 0,Gender,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,...,Departure Delay in Minutes,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,satisfaction_neutral or dissatisfied,satisfaction_satisfied
0,-0.987316,-0.885557,-0.04915,-0.55139,-0.69788,-0.542036,-0.765968,1.350054,1.297217,1.183364,...,-0.390741,1,0,1,0,1,0,0,0,1
1,-0.987316,-0.951686,-0.630464,-0.55139,1.271003,1.604356,1.582278,-0.905605,-0.925655,-1.093352,...,-0.10155,1,0,1,0,1,0,0,1,0
2,-0.987316,-0.885557,-0.011064,0.202408,0.614709,-0.542036,-1.548716,-1.657491,-0.925655,-1.852258,...,-0.390741,1,0,0,1,0,1,0,1,0
3,1.012847,0.503145,0.085153,-0.55139,0.614709,-0.542036,0.016781,-0.905605,-0.925655,-1.093352,...,-0.15413,1,0,0,1,0,1,0,1,0
4,-0.987316,0.106373,-0.338805,-1.305189,-0.69788,-0.542036,-0.765968,0.598168,-0.184698,-0.334447,...,-0.390741,1,0,1,0,1,0,0,1,0


# Логистическая регрессия

Разделим данные на признаки и целевую переменную, а затем на обучающую и тестовую выборки.

In [None]:
# Both indicator columns produced from `satisfaction` must leave the feature matrix;
# the "satisfied" indicator alone serves as the binary label.
target_columns = ["satisfaction_neutral or dissatisfied", "satisfaction_satisfied"]
y = data["satisfaction_satisfied"]
X = data.drop(columns=target_columns)

# Hold out 15% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Fit a logistic regression with default settings on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
model = LogisticRegression().fit(X_train, y_train)

# Predict on the held-out split and report per-class precision / recall / F1.
y_pred = model.predict(X_test)

# NOTE: despite its name, `train_accuracy` holds the classification report for the
# held-out split (y_test vs. y_pred); the name is kept so existing references still work.
train_accuracy = classification_report(y_test, y_pred)
print(train_accuracy)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      7049
           1       0.87      0.83      0.85      5420

    accuracy                           0.87     12469
   macro avg       0.87      0.87      0.87     12469
weighted avg       0.87      0.87      0.87     12469



In [None]:
# Pair each learned coefficient with its feature name and list them from the largest
# coefficient downwards. model.coef_[0] is the single coefficient row of this binary model.
features = X.columns

coeff_df = pd.DataFrame({"Coefficient": model.coef_[0]}).assign(features=features)

coeff_df.sort_values(by="Coefficient", ascending=False)

Unnamed: 0,Coefficient,features
20,1.358563,Type of Travel_Business travel
18,1.009165,Customer Type_Loyal Customer
8,0.838589,Online boarding
22,0.513504,Class_Business
3,0.508101,Inflight wifi service
14,0.404979,Checkin service
11,0.392745,On-board service
12,0.335486,Leg room service
16,0.279892,Cleanliness
13,0.157448,Baggage handling


# Результат

In [None]:
# Score the test file with the fitted model, repeating the training preprocessing.
data = pd.read_csv("drive/MyDrive/data for colab/pass_satisfaction_test.csv")
data_indices = data["id"]
# Same column removals as for the training frame (`id` is kept aside for the answer table).
data = data.drop(columns=["id", "Arrival Delay in Minutes"])

selected_data = data.select_dtypes(exclude=["object"])

# Standardise with the statistics of the TRAINING file, not of the test file itself:
# the model's coefficients were learned on features scaled by the training mean/std,
# so fitting a new scaler here would put the test features on a different scale.
# The training file is re-read so this cell does not depend on leftover kernel state.
numeric_columns = list(selected_data.columns)
train_numeric = pd.read_csv(
    "drive/MyDrive/data for colab/pass_satisfaction_train.csv",
    usecols=numeric_columns,
)[numeric_columns]  # re-select to guarantee the same column order as the test frame
train_scaler = StandardScaler().fit(train_numeric)
transformed_data = train_scaler.transform(selected_data)
# Keep the row labels so the concat below aligns row-for-row.
scaled_data = pd.DataFrame(
    transformed_data,
    columns=selected_data.columns,
    index=selected_data.index,
)

data = pd.concat([scaled_data, data.select_dtypes(include=["object"])], axis=1)

categories = ["Customer Type", "Type of Travel", "Class"]
data = pd.get_dummies(data, columns=categories)
# Force the exact feature layout the model was trained on: same columns, same order.
# A category absent from the test file gets an all-zero indicator column; an indicator
# the model has never seen is dropped.
data = data.reindex(columns=X.columns, fill_value=0)

y_pred = model.predict(data)
# Optional: map the 0/1 predictions back to the original text labels.
# y_pred = np.where(y_pred, "satisfied", "neutral or dissatisfied")
ans = pd.DataFrame({"id": data_indices, "satisfaction": y_pred})
# Optional: write the answer table to disk.
# ans.to_csv('ans5.csv', index=False)