<a href="https://colab.research.google.com/github/Nikita-Vasinkov/StatPrak-HW/blob/master/dz4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Load the three source tables. low_memory=False makes pandas parse each file
# in one pass rather than in chunks.
flights = pd.read_csv("flights.csv", low_memory=False)
airports = pd.read_csv("airports.csv", low_memory=False)
airlines = pd.read_csv("airlines.csv", low_memory=False)

# Lookup from the first column of the airlines table to its second column.
airlines_dict = dict(zip(airlines.iloc[:, 0], airlines.iloc[:, 1]))

# Index airports by IATA code and drop every airport row with a missing field.
airports = airports.set_index("IATA_CODE").dropna()


def _is_three_char_code(codes):
    """Return a boolean mask of entries that are present and 3 characters long.

    A missing value stringifies to "nan", which is itself 3 characters long,
    so missing entries are excluded explicitly instead of relying on length.
    """
    return codes.notna() & codes.astype(str).str.len().eq(3)


# Keep only flights whose origin and destination are both 3-character codes.
# .copy() makes the result an independent frame, so the column assignments
# below never write into a slice of the originally loaded table.
flights = flights[
    _is_three_char_code(flights["ORIGIN_AIRPORT"])
    & _is_three_char_code(flights["DESTINATION_AIRPORT"])
].copy()

# Negative departure delays are clamped to zero; missing values stay missing
# here and are removed by the dropna on the next line.
flights["DEPARTURE_DELAY"] = flights["DEPARTURE_DELAY"].clip(lower=0)
flights = flights.dropna(subset=["DEPARTURE_DELAY"])
print(flights.head())

# Short weekday labels keyed by the DAY_OF_WEEK number; `order` fixes the
# left-to-right sequence of the bars in both charts.
day_map = {1: "ПН", 2: "ВТ", 3: "СР", 4: "ЧТ", 5: "ПТ", 6: "СБ", 7: "ВС"}
order = ["ПН", "ВТ", "СР", "ЧТ", "ПТ", "СБ", "ВС"]
flights["DAY_NAME"] = flights["DAY_OF_WEEK"].map(day_map)

# Mean departure delay per weekday, using only rows with a positive delay.
mean_delay_by_day = (
    flights[flights["DEPARTURE_DELAY"] > 0]
    .groupby("DAY_NAME")["DEPARTURE_DELAY"]
    .mean()
    .reindex(order)
)
# Number of non-null FLIGHT_NUMBER entries per weekday.
flights_count_by_day = (
    flights.groupby("DAY_NAME")["FLIGHT_NUMBER"].count().reindex(order)
)

# Chart 1: mean delay per weekday.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    mean_delay_by_day.index,
    mean_delay_by_day.values,
    color="violet",
    edgecolor="black",
)
ax.set_xlabel("День недели")
ax.set_ylabel("Средняя задержка (мин)")
ax.set_title("Средняя задержка рейсов по дням недели")
plt.show()

# Chart 2: flight count per weekday. The y-axis starts at 550000 rather than
# zero, and a dashed line marks the largest per-day count.
busiest_day_count = flights_count_by_day.max()
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    flights_count_by_day.index,
    flights_count_by_day.values,
    color="skyblue",
    edgecolor="black",
)
ax.set_xlabel("День недели")
ax.set_ylabel("Количество рейсов")
ax.set_title("Количество рейсов по дням недели")
ax.set_ylim(550000, busiest_day_count + 100000)
ax.axhline(
    y=busiest_day_count,
    color="red",
    linestyle="--",
    label="Макс. число рейсов",
)
ax.legend()
plt.show()

# Median AIR_SYSTEM_DELAY per origin airport; keep the 20 largest medians.
delay_by_airport = flights.groupby("ORIGIN_AIRPORT")["AIR_SYSTEM_DELAY"].median()
max_delays_airport = delay_by_airport.nlargest(20)

# Horizontal bars sorted ascending, so the largest median ends up on top.
fig, ax = plt.subplots(figsize=(10, 6))
max_delays_airport.sort_values().plot.barh(
    ax=ax, color="violet", edgecolor="black"
)
ax.set_xlabel("Медианная задержка (мин)")
ax.set_ylabel("Аэропорт")
ax.set_title("Топ-20 аэропортов по медианной задержке (AIR_SYSTEM_DELAY)")
# Reference line: the mean AIR_SYSTEM_DELAY over all rows of `flights`.
ax.axvline(
    x=flights["AIR_SYSTEM_DELAY"].mean(),
    color="red",
    linestyle="--",
    label="Среднее значение для всех рейсов",
)
ax.legend()
plt.show()

# Count of non-null FLIGHT_NUMBER entries per origin airport, as a two-column
# frame, then the 20 airports with the largest counts.
num_f_air = (
    flights.groupby("ORIGIN_AIRPORT")["FLIGHT_NUMBER"].count().reset_index()
)
max_fligh_airport = num_f_air.nlargest(20, "FLIGHT_NUMBER")

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    max_fligh_airport["ORIGIN_AIRPORT"],
    max_fligh_airport["FLIGHT_NUMBER"],
    color="violet",
    edgecolor="black",
)
ax.set_title("Топ-20 аэропортов по числу вылетов")
ax.set_xlabel("Аэропорт")
ax.set_ylabel("Количество рейсов")
# Grid lines are drawn for the x-axis ticks only.
ax.grid(axis="x", alpha=0.7)
plt.show()

# Per-airline aggregates:
#   - mean AIRLINE_DELAY over rows where that delay is positive,
#   - count of non-null FLIGHT_NUMBER entries,
#   - mean SECURITY_DELAY over rows where that delay is positive.
mean_delay_airline = (
    flights[flights["AIRLINE_DELAY"] > 0]
    .groupby("AIRLINE")["AIRLINE_DELAY"]
    .mean()
)
num_f_on_air = flights.groupby("AIRLINE")["FLIGHT_NUMBER"].count().reset_index()
mean_security_delay = (
    flights[flights["SECURITY_DELAY"] > 0]
    .groupby("AIRLINE")["SECURITY_DELAY"]
    .mean()
)

# Chart 1: mean airline-caused delay, airlines sorted ascending.
fig, ax = plt.subplots(figsize=(10, 5))
mean_delay_airline.sort_values().plot.bar(ax=ax, color="pink", edgecolor="black")
ax.set_title("Средняя задержка (AIRLINE_DELAY) по авиакомпаниям")
ax.set_xlabel("Авиакомпания")
ax.set_ylabel("Задержка (мин)")
ax.grid(axis="x", alpha=0.7)
plt.show()

# Chart 2: number of flights per airline, in groupby order.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(
    num_f_on_air["AIRLINE"],
    num_f_on_air["FLIGHT_NUMBER"],
    color="violet",
    edgecolor="black",
)
ax.set_title("Количество рейсов по авиакомпаниям")
ax.set_xlabel("Авиакомпания")
ax.set_ylabel("Число рейсов")
ax.grid(axis="x", alpha=0.7)
plt.show()

# Chart 3: mean security delay, airlines sorted ascending.
fig, ax = plt.subplots(figsize=(10, 5))
mean_security_delay.sort_values().plot.bar(
    ax=ax, color="lightblue", edgecolor="black"
)
ax.set_title("Средняя задержка по безопасности (SECURITY_DELAY) по авиакомпаниям")
ax.set_xlabel("Авиакомпания")
ax.set_ylabel("Задержка (мин)")
ax.grid(axis="x", alpha=0.7)
plt.show()

# Delay-reason columns, in the order their wedges appear in the pie chart.
reasons = [
    "AIR_SYSTEM_DELAY",
    "SECURITY_DELAY",
    "AIRLINE_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "WEATHER_DELAY",
]
# Column-wise mean of each reason, computed one column at a time.
values = [flights[reason].mean() for reason in reasons]
# Keep the individual means available under their own names as well.
(
    mean_air_system,
    mean_security,
    mean_airline,
    mean_late_aircraft,
    mean_weather,
) = values

fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(
    values,
    labels=reasons,
    autopct="%1.1f%%",
    startangle=140,
    colors=["skyblue", "hotpink", "plum", "purple", "m"],
)
ax.set_title("Распределение средней задержки по причинам")
plt.show()

# Integer-divide SCHEDULED_DEPARTURE by 100 to get the hour bucket
# (assumes the column holds HHMM-style numbers — TODO confirm).
flights["SCHEDULED_DEPARTURE_HOUR"] = flights["SCHEDULED_DEPARTURE"] // 100

# Median departure delay for each scheduled departure hour.
delay_by_hour = (
    flights.groupby("SCHEDULED_DEPARTURE_HOUR")["DEPARTURE_DELAY"].median()
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    x=delay_by_hour.index,
    y=delay_by_hour.values,
    marker="o",
    color="purple",
    ax=ax,
)
ax.set_xlabel("Час вылета")
ax.set_ylabel("Медианная задержка (мин)")
ax.set_title("Зависимость задержки вылета от времени суток")
# One tick per hour, 0 through 23.
ax.set_xticks(range(0, 24))
ax.grid(True)
plt.show()

# Count of non-null FLIGHT_NUMBER entries for each scheduled departure hour.
flights_count_by_hour = (
    flights.groupby("SCHEDULED_DEPARTURE_HOUR")["FLIGHT_NUMBER"].count()
)

fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): `palette` is passed without `hue`; newer seaborn releases may
# warn about this combination — confirm against the installed version.
sns.barplot(
    x=flights_count_by_hour.index,
    y=flights_count_by_hour.values,
    palette="Blues_d",
    ax=ax,
)
ax.set_xlabel("Час вылета")
ax.set_ylabel("Количество рейсов")
ax.set_title("Количество рейсов по часу вылета")
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'flights.csv'