In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

# Notebook-wide presentation settings:
#   * show every column when a DataFrame is rendered (no "..." truncation);
#   * draw all Matplotlib figures with the 'ggplot' style sheet.
pd.options.display.max_columns = None
plt.style.use('ggplot')

In [None]:
# Read the CSV into `df`, the frame every later cell works on; the 'Date'
# column is converted to datetimes while reading.
csv_path = 'Датасет №1_Airline Passenger Satisfaction_mod.csv'
df = pd.read_csv(csv_path, parse_dates=['Date'])

# Sanity check: (rows, columns) followed by the first two records.
print("Размер датасета:", df.shape)
display(df.head(2))

In [None]:
# Column dtypes and non-null counts. DataFrame.info() prints its report
# itself and returns None, so it is called directly: wrapping it in display()
# would render a stray "None" underneath the report.
df.info()

# Number of missing values per column, largest first, restricted to the
# columns that actually contain gaps.
missing = df.isna().sum().sort_values(ascending=False)
missing = missing[missing > 0]
print("\nПропущенные значения:")
display(missing)

In [None]:
# Encode the target: 1 = satisfied, 0 = neutral or dissatisfied.
# The mapping is applied only while the column still holds un-encoded values.
# Re-running the cell on an already-encoded column would otherwise replace
# every value with NaN, because 0 and 1 are not keys of the mapping.
satisfaction_codes = {'satisfied': 1, 'neutral or dissatisfied': 0}
already_encoded = (
    df['satisfaction'].dropna().isin(list(satisfaction_codes.values())).all()
)
if not already_encoded:
    df['satisfaction'] = df['satisfaction'].map(satisfaction_codes)

# Count both classes explicitly (0 first, then 1). reindex guarantees exactly
# two bar heights even if one class is absent, so they always line up with
# the two labels below.
class_counts = df['satisfaction'].value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(8, 5))
plt.bar(['Неудовлетворены', 'Удовлетворены'],
        class_counts,
        color=['lightcoral', 'lightgreen'])
plt.title('Распределение удовлетворенности пассажиров')
plt.ylabel('Количество')
plt.show()

# Class shares in percent, rounded to one decimal place.
print("\nПроцентное соотношение:")
display(df['satisfaction'].value_counts(normalize=True).mul(100).round(1))

In [None]:
# Service-rating columns are detected by keyword in the lower-cased column
# name. The engineered features 'Average Service Rating' and
# 'Service Rating Std' (created in a later cell) also contain "service";
# they are excluded so that re-running this cell after they exist does not
# feed the aggregates back into their own inputs.
rating_keywords = ('service', 'comfort', 'food', 'wifi')
derived_cols = {'Average Service Rating', 'Service Rating Std'}
service_cols = [
    col for col in df.columns
    if col not in derived_cols
    and any(keyword in col.lower() for keyword in rating_keywords)
]
print("Колонки с оценками сервисов:")
display(service_cols)

In [None]:
# Rows whose mean rating across service_cols is above 4 while the encoded
# target is 0, i.e. high service scores but no reported satisfaction.
high_ratings_low_satisfaction = df[
    (df[service_cols].mean(axis=1) > 4) &
    (df['satisfaction'] == 0)
]

print("\nКоличество случаев с высокими оценками, но низкой удовлетворенностью:",
      len(high_ratings_low_satisfaction))

if not high_ratings_low_satisfaction.empty:
    print("\nПримеры таких случаев:")
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so the sample size is capped at the number of
    # matches. A fixed random_state keeps the displayed examples the same on
    # every run.
    sample_size = min(3, len(high_ratings_low_satisfaction))
    display(high_ratings_low_satisfaction.sample(sample_size, random_state=42))

In [None]:
plt.figure(figsize=(12, 6))
# Matplotlib's boxplot does not skip NaN: one missing rating is enough to
# leave that column's box empty. Missing values are therefore dropped per
# column before plotting.
plt.boxplot([df[col].dropna().to_numpy() for col in service_cols])
plt.title('Распределение оценок сервисов')
# Boxes are drawn at x = 1..N. The tick labels are set here rather than via
# boxplot's `labels=` keyword, which newer Matplotlib releases deprecate in
# favour of `tick_labels=`; xticks works on old and new versions alike.
plt.xticks(range(1, len(service_cols) + 1), service_cols, rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Two row-wise summaries of the ratings in service_cols are stored as new
# columns on df: the mean and the standard deviation.
# Each statistic re-reads df[service_cols] at the moment it is computed.
row_statistics = (
    ('Average Service Rating', pd.DataFrame.mean),
    ('Service Rating Std', pd.DataFrame.std),
)
for feature_name, row_statistic in row_statistics:
    df[feature_name] = row_statistic(df[service_cols], axis=1)

print("\nНовые признаки:")
new_features = [feature_name for feature_name, _ in row_statistics]
display(df[new_features].describe())

In [None]:
cat_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

# One stacked-bar panel per categorical column on a 2x2 grid. Each bar is a
# category value, split by the encoded satisfaction flag.
fig = plt.figure(figsize=(15, 10))
for position, col in enumerate(cat_cols, start=1):
    ax = fig.add_subplot(2, 2, position)
    # Rows: values of `col`; columns: satisfaction classes; cells: row counts.
    satisfaction_by_group = df.groupby([col, 'satisfaction']).size().unstack()
    satisfaction_by_group.plot(
        kind='bar',
        stacked=True,
        ax=ax,
        color=['lightcoral', 'lightgreen'],
    )
    ax.set_title(f'Удовлетворенность по {col}')
    plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of df.
corr = df.corr(numeric_only=True)

plt.figure(figsize=(12, 10))
# Pin the colour scale to the full correlation range [-1, 1]. Without
# vmin/vmax imshow autoscales to the observed min/max, so the neutral
# midpoint of the diverging 'coolwarm' map would no longer mean "zero
# correlation" and colours would not be comparable between runs.
plt.imshow(corr, cmap='coolwarm', interpolation='none', vmin=-1, vmax=1)
plt.colorbar()
# Label both axes with the column names, in the same order as the matrix.
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.title('Матрица корреляций')
plt.show()