In [None]:
import xml.etree.ElementTree as ET

import pandas as pd
# Parse the airline XML export; both the tree and its root element stay
# available to the rest of the notebook.
with open('PointzAggregator-AirlinesData.xml', 'rb') as xml_source:
    tree = ET.parse(xml_source)
root = tree.getroot()

def _child_text(parent, tag):
    """Return the text of the first direct child ``tag`` of ``parent``, or None if it is absent."""
    child = parent.find(tag)
    return child.text if child is not None else None


def parse_users(root):
    """Flatten every user -> card -> activity path under ``root`` into a list of row dicts.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element whose direct ``user`` children are scanned.

    Returns
    -------
    list[dict]
        One dict per ``activity`` element with the keys ``user_uid``,
        ``first_name``, ``last_name``, ``card_number``, ``bonus_program``,
        ``activity_code``, ``activity_date``, ``departure``, ``arrival`` and
        ``fare``.  A missing attribute or child element yields ``None`` for
        that field instead of raising ``AttributeError``.
    """
    data = []

    for user in root.findall('user'):
        user_uid = user.attrib.get('uid')

        # A user without a <name> child still produces rows, with empty names.
        name = user.find('name')
        name_attrs = name.attrib if name is not None else {}
        first_name = name_attrs.get('first')
        last_name = name_attrs.get('last')

        # './/' searches all descendants, so cards may be nested at any depth.
        for card in user.findall('.//card'):
            card_number = card.attrib.get('number')
            bonus_program = _child_text(card, 'bonusprogramm')

            for activity in card.findall('.//activity'):
                data.append({
                    'user_uid': user_uid,
                    'first_name': first_name,
                    'last_name': last_name,
                    'card_number': card_number,
                    'bonus_program': bonus_program,
                    'activity_code': _child_text(activity, 'Code'),
                    'activity_date': _child_text(activity, 'Date'),
                    'departure': _child_text(activity, 'Departure'),
                    'arrival': _child_text(activity, 'Arrival'),
                    'fare': _child_text(activity, 'Fare'),
                })

    return data

# Flatten the XML into row dicts and persist them as CSV without the index column.
user_data = parse_users(root)
df_xml = pd.DataFrame(data=user_data)
df_xml.to_csv(path_or_buf='PointzAggregator-AirlinesData.csv', index=False)


In [None]:
import re
from ruamel.yaml import YAML
# Shared ruamel.yaml loader created with typ='safe'; process_large_yaml reads
# it as a module-level name.
yaml = YAML(typ='safe')

def _load_documents(chunk):
    """Parse one buffered chunk of YAML text and return its non-empty documents.

    Uses the module-level ``yaml`` loader.  Empty documents (``None`` or an
    empty mapping) are dropped because they contribute no rows and ``None``
    would break the later ``.items()`` call.
    """
    return [document for document in yaml.load_all(chunk) if document]


def process_large_yaml(file_path):
    """Read a large YAML file chunk by chunk and flatten it into row dicts.

    The file is cut into chunks at every line that contains a
    ``YYYY-MM-DD`` pattern; each chunk is parsed on its own so the whole
    file never has to be handed to the YAML parser at once.

    Parameters
    ----------
    file_path : str
        Path of the YAML file to read.

    Returns
    -------
    list[dict]
        One dict per frequent-flyer entry (``FF``) of every flight, with the
        keys ``Date``, ``Flight``, ``FF Number``, ``Class``, ``Fare``,
        ``From``, ``To`` and ``Status``.  Flights without an ``FF`` key are
        skipped.

    Raises
    ------
    KeyError
        If an ``FF`` entry lacks ``CLASS``/``FARE`` or a flight with ``FF``
        lacks ``FROM``/``TO``/``STATUS``.
    """
    # NOTE(review): the pattern matches a date anywhere in the line, not only
    # at the start; this assumes dates occur only as top-level keys -- confirm
    # against the real file.
    date_line = re.compile(r'\d{4}-\d{2}-\d{2}')

    data = []
    chunk_lines = []  # collected as a list and joined once per chunk

    with open(file_path, 'r') as file:
        for line in file:
            # A new date starts a new chunk: parse what was buffered so far.
            if chunk_lines and date_line.search(line):
                data.extend(_load_documents(''.join(chunk_lines)))
                chunk_lines = []
            chunk_lines.append(line)

    if chunk_lines:
        data.extend(_load_documents(''.join(chunk_lines)))

    flattened_data = []
    for entry in data:
        for date, flights in entry.items():
            # A date with no flights is loaded as None; treat it as empty.
            for flight_num, flight_info in (flights or {}).items():
                if not flight_info or 'FF' not in flight_info:
                    continue
                for ff_num, ff_data in (flight_info['FF'] or {}).items():
                    flattened_data.append({
                        'Date': date,
                        'Flight': flight_num,
                        'FF Number': ff_num,
                        'Class': ff_data['CLASS'],
                        'Fare': ff_data['FARE'],
                        'From': flight_info['FROM'],
                        'To': flight_info['TO'],
                        'Status': flight_info['STATUS']
                    })

    return flattened_data

# Flatten the SkyTeam exchange dump and store it as CSV without the index column.
skyteam_rows = process_large_yaml('SkyTeam-Exchange.yaml')
df = pd.DataFrame(skyteam_rows)
df.to_csv(path_or_buf="SkyTeam-Exchange.csv", index=False)


In [None]:
from transliterate import translit
# Convert the whitespace-delimited Sirena export into a comma-separated file.
# Both files are managed by one ``with`` so they are closed even if a row fails.
with open('Sirena-export-fixed.tab', mode='r', encoding='utf-8') as file1, \
        open('Sirena-export-fixed.csv', mode='w', encoding='utf-8') as file2:
    # Header: all whitespace-separated tokens of the first line except the last two.
    heading = ",".join(file1.readline().split()[:-2])
    file2.write(heading)
    file2.write('\n')
    list_str = file1.readlines()

    for s in list_str:
        new_s = []
        s = s.split()
        if not s:
            # Blank line: nothing to convert (s[8] below would raise IndexError).
            continue
        # Tokens 0-2 form one space-joined field, transliterated from Russian.
        new_s.append(translit(" ".join(s[0:3]), language_code='ru', reversed=True))
        new_s.append(",".join(s[3:8]))
        # Token 8 is cut after its 6th character into two output fields.
        new_s.append(s[8][:6])
        new_s.append(s[8][6:])
        new_s.append(",".join(s[9:11]))
        # Token 11 is cut the same way.
        new_s.append(s[11][:6])
        new_s.append(s[11][6:])
        new_s.append(" ".join(s[12:14]))
        new_s.append(s[14])
        # A one-character token at position 15 is shifted right by inserting a
        # " " placeholder.  NOTE(review): presumably the preceding column is
        # empty in such rows -- confirm against the export layout.
        if len(s[15]) == 1:
            s.insert(15, " ")
        new_s.append(",".join(s[15:17]))
        new_s.append(s[17][:6])
        # Token 17 may carry a suffix after its first 6 characters; emit " "
        # when there is none so the column count stays constant.
        if len(s[17]) == 6:
            new_s.append(" ")
        else:
            new_s.append(s[17][6:])
        clean_s = ",".join(new_s)
        file2.write(clean_s)
        file2.write('\n')

In [None]:
import pandas as pd

# Read the forum dump and flatten each entry under "Forum Profiles" into columns.
df_json = pd.read_json('FrequentFlyerForum-Profiles.json')
normal_df = pd.json_normalize(df_json["Forum Profiles"])

# Explode the list-valued columns one after another, in the same order as a
# chained .explode(...).explode(...).explode(...) call.
exploded_df = normal_df
for list_column in ("Registered Flights", "Travel Documents", "Loyality Programm"):
    exploded_df = exploded_df.explode(list_column, ignore_index=True)

# Expand the per-row dicts of each exploded column into their own frames.
RegFli_df = pd.json_normalize(exploded_df["Registered Flights"])
RegFli_df.columns = [
    'Date', 'CodeShare', 'Flight',
    'ArrCity', 'ArrAirport', 'ArrCountry',
    'DepCity', 'DepAirport', 'DepCountry',
]

TravDoc_df = pd.json_normalize(exploded_df["Travel Documents"])

LoyalProg_df = pd.json_normalize(exploded_df["Loyality Programm"])
LoyalProg_df.columns = ['Status', 'LoyalProgramm', 'LoyalProgNumber']

# Remove the raw nested columns and rename the four columns that remain.
exploded_df = exploded_df.drop(
    columns=['Registered Flights', 'Travel Documents', 'Loyality Programm']
)
exploded_df.columns = ['NickName', 'Sex', 'Last Name', 'First Name']

# Place the four frames side by side (they are aligned on the row index).
df = pd.concat([exploded_df, RegFli_df, TravDoc_df, LoyalProg_df], axis=1)

df.to_csv(path_or_buf="FrequentFlyerForum-Profiles.csv", index=False)

In [None]:
import zipfile

# Unpack the boarding-pass archive into the current working directory.
with zipfile.ZipFile(file='YourBoardingPassDotAero.zip', mode='r') as zip_ref:
    zip_ref.extractall(path=None)

In [None]:
import glob
# glob returns matches in arbitrary order; sort them so that the order of the
# workbooks (and therefore of the rows written later) is reproducible.
file_paths = sorted(glob.glob("*.xlsx"))
file_paths

In [None]:
def parsing(file_path):
    """Extract one boarding-pass record per sheet of an Excel workbook.

    Every sheet is read without a header row and the fields are taken from
    fixed (row, column) cell positions.

    Parameters
    ----------
    file_path : str
        Path of the ``.xlsx`` workbook.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed sheet with the columns ``NAME``,
        ``SEQUENCE``, ``CLASS``, ``FLIGHT``, ``FROM``, ``TO``, ``FROMAIR``,
        ``TOAIR``, ``GATE``, ``DATE``, ``TIME``, ``CARRIER``, ``SEAT``,
        ``PNR`` and ``E-TICKET``.  Sheets that fail to parse are reported on
        stdout and skipped; the frame is empty if no sheet could be parsed.
    """
    # Zero-based (row, column) of each single-cell field; the insertion order
    # defines the column order of the result.
    cell_positions = {
        "SEQUENCE": (0, 7),
        "CLASS": (2, 7),
        "FLIGHT": (4, 0),
        "FROM": (4, 3),
        "TO": (4, 7),
        "FROMAIR": (6, 3),
        "TOAIR": (6, 7),
        "GATE": (6, 1),
        "DATE": (8, 0),
        "TIME": (8, 2),
        "CARRIER": (8, 4),
        "SEAT": (10, 7),
        "PNR": (12, 1),
        "E-TICKET": (12, 4),
    }

    extracted_data = []

    # The context manager closes the workbook handle even if reading fails.
    with pd.ExcelFile(file_path) as xlsx_data:
        for sheet in xlsx_data.sheet_names:
            try:
                sheet_df = pd.read_excel(xlsx_data, sheet_name=sheet, header=None)

                # NAME combines the two adjacent cells of row 2.
                record = {"NAME": f"{sheet_df.iloc[2, 0]} {sheet_df.iloc[2, 1]}"}
                for field, (row, col) in cell_positions.items():
                    record[field] = sheet_df.iloc[row, col]

                extracted_data.append(record)
            except Exception as e:
                # Best effort: a malformed sheet must not abort the whole workbook.
                print(f"Error processing sheet {sheet}: {e}")

    # Was ``df.DataFrame(...)``, which fails because ``df`` is the last sheet's
    # frame (or unbound when the workbook has no sheets).
    passengers_df = pd.DataFrame(extracted_data)

    return passengers_df

In [None]:
import pandas as pd
import dask
import dask.bag as db
import cudf

# Parse every workbook through a dask bag (10 partitions), then stitch the
# per-file frames together and write a single CSV.
bag = db.from_sequence(file_paths, npartitions=10)
bag = bag.map(parsing)

all_data = bag.compute()

final_df = pd.concat(all_data, ignore_index=True)
final_df.to_csv(path_or_buf="passenger_info(ZIP).csv", index=False)

In [None]:
import pandas as pd
import PyPDF2
import pdfplumber
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
from PIL import Image
import os
# Source timetable PDF; passed to split_pdf_vertically later in this cell.
pdf_path = 'Skyteam_Timetable.pdf'
import fitz  
from pathlib import Path

def split_pdf_vertically(input_pdf, output_left, output_right, first_page=4):
    """Cut every page of a PDF into a left and a right half and save them as two PDFs.

    Parameters
    ----------
    input_pdf : str
        Path of the PDF to split.
    output_left, output_right : str
        Paths the left-half and right-half documents are saved to.
    first_page : int, optional
        Zero-based index of the first page to process; earlier pages are
        skipped.  Defaults to 4, the value that used to be hard-coded.

    Notes
    -----
    All three documents are opened as context managers so they are closed
    even when loading, rendering or saving raises.
    """
    with fitz.open(input_pdf) as pdf, fitz.open() as pdf_left, fitz.open() as pdf_right:
        for i in range(first_page, pdf.page_count):
            page = pdf.load_page(i)
            rect = page.rect

            # Horizontal midpoint of the page; equals width / 2 when x0 is 0.
            mid_x = (rect.x0 + rect.x1) / 2
            left_rect = fitz.Rect(rect.x0, rect.y0, mid_x, rect.y1)
            right_rect = fitz.Rect(mid_x, rect.y0, rect.x1, rect.y1)

            # Each output page has exactly the size of its half.
            left_page = pdf_left.new_page(width=left_rect.width, height=left_rect.height)
            right_page = pdf_right.new_page(width=right_rect.width, height=right_rect.height)

            # Draw the clipped half of source page i onto the new page.
            left_page.show_pdf_page(left_page.rect, pdf, i, clip=left_rect)
            right_page.show_pdf_page(right_page.rect, pdf, i, clip=right_rect)

        pdf_left.save(output_left)
        pdf_right.save(output_right)

# Write the left and right halves of the timetable pages to two new PDFs.
split_pdf_vertically(
    input_pdf=pdf_path,
    output_left="new_left_tables.pdf",
    output_right="new_right_tables.pdf",
)

import fitz  
import pandas as pd
import re

# Three-letter English month abbreviations; is_month checks a token's ending
# against this set.
MONTHS = {
    "Jan", "Feb", "Mar", "Apr",
    "May", "Jun", "Jul", "Aug",
    "Sep", "Oct", "Nov", "Dec",
}

def extract_text_from_pdf(pdf_path):
    """Return a list with the text of every page of the PDF at ``pdf_path``, in page order."""
    page_texts = []
    with fitz.open(pdf_path) as document:
        for page in document:
            page_texts.append(page.get_text())
    return page_texts

def is_month(text):
    """Return True when ``text`` ends with one of the abbreviations in ``MONTHS``."""
    # str.endswith accepts a tuple of suffixes and is true if any of them matches.
    return text.endswith(tuple(MONTHS))

def safe_get_line(lines, index):
    """Return ``lines[index]`` stripped of surrounding whitespace, or None when ``index`` is past the end."""
    if index >= len(lines):
        return None
    return lines[index].strip()

def parse_flights(text, current_from=None, current_to=None):
    """Parse the text of one timetable page into a list of flight records.

    The text is processed line by line:

    * ``FROM: <city>, <country>`` followed by a line with the airport code
      sets the current origin;
    * ``TO:`` followed by a city line and a country line sets the current
      destination;
    * column-header words are skipped, as are blank lines;
    * any other line starts a flight record whose remaining fields are read
      from the following lines.

    Parameters
    ----------
    text : str
        Text of one page.
    current_from, current_to : str or None
        Origin and destination carried over from the previous page.

    Returns
    -------
    list[dict]
        One dict per record with the keys ``FROM``, ``TO``, ``Validity``,
        ``Days``, ``DepTime``, ``ArrTime``, ``Flight``, ``Aircraft`` and
        ``TravelTime``.  Empty or missing fields are replaced with
        ``"Consult your travel agent"``.
    """
    from_pattern = r"FROM:\s*(.+),\s*(.+)"
    header_words = ("Validity", "Days", "Dep", "Time", "Arr", "Flight", "Aircraft", "Travel")
    placeholder = "Consult your travel agent"

    flight_data = []
    lines = text.splitlines()
    i = 0

    while i < len(lines):
        line = lines[i].strip()

        # Blank lines carry no data; without this guard ``split()[-1]`` below
        # raises IndexError.
        if not line:
            i += 1
            continue

        # FROM header
        from_match = re.match(from_pattern, line)
        if from_match:
            airport_city = from_match.group(1).strip()
            airport_country = from_match.group(2).strip()
            airport_code = safe_get_line(lines, i + 1)  # airport code is on the next line
            current_from = f"{airport_city}, {airport_country}, {airport_code}"
            i += 2  # skip the airport-code line as well
            continue

        # TO header: city and country are on the two following lines
        if line.startswith("TO:"):
            to_city = safe_get_line(lines, i + 1)
            to_country = safe_get_line(lines, i + 2)
            current_to = f"{to_city}, {to_country}"
            i += 3
            continue

        if line in header_words:
            i += 1
            continue

        validity = line
        if not is_month(validity.split()[-1]):
            # The validity line also holds the days: keep the whole line for
            # both fields; the remaining fields start on the next line.
            days = validity
            first = i + 1
        else:
            days = safe_get_line(lines, i + 1)
            if days is not None and ':' in days:
                # The "days" line already contains the departure time; it is
                # re-read as DepTime and split further down.
                days = None
                first = i + 1
            else:
                # Days are on their own line (or the text ended, days is None).
                first = i + 2

        dep_time, arr_time, flight, aircraft, travel_time = (
            safe_get_line(lines, first + k) for k in range(5)
        )
        i = first + 5

        # Days and departure time share one line: the last 5 characters are
        # the time, everything before them is the days.  Skipped when the
        # record was cut off and no departure line exists.
        if not days and dep_time:
            dep_time = dep_time.strip()
            days, dep_time = dep_time[:-5].strip(), dep_time[-5:].strip()

        # Any empty field is reported as "Consult your travel agent".
        entry = {
            "FROM": current_from or placeholder,
            "TO": current_to or placeholder,
            "Validity": validity or placeholder,
            "Days": days or placeholder,
            "DepTime": dep_time or placeholder,
            "ArrTime": arr_time or placeholder,
            "Flight": flight or placeholder,
            "Aircraft": aircraft or placeholder,
            "TravelTime": travel_time or placeholder,
        }
        flight_data.append(entry)

    return flight_data

def process_pdf_files(left_pdf, right_pdf):
    """Parse two PDFs page by page and return all flight records as one DataFrame.

    Both files are read first; then every page of the left file is parsed,
    followed by every page of the right file.  The FROM/TO values of the last
    record parsed so far are handed to the next page, including across the
    boundary between the two files.
    """
    left_texts = extract_text_from_pdf(left_pdf)
    right_texts = extract_text_from_pdf(right_pdf)

    all_flight_data = []
    current_from = None
    current_to = None

    # Left file first, then right file; the context is never reset in between.
    for page_texts in (left_texts, right_texts):
        for text in page_texts:
            flights = parse_flights(text, current_from, current_to)
            if flights:
                last_record = flights[-1]
                current_from = last_record["FROM"]
                current_to = last_record["TO"]
            all_flight_data.extend(flights)

    # Build the DataFrame
    return pd.DataFrame(all_flight_data)

# Example usage: parse the two halves written by the split_pdf_vertically call
# earlier in this cell.  The file names must match the ones passed to that
# call ("new_left_tables.pdf" / "new_right_tables.pdf"); the previous names
# "left_tables.pdf" / "right_tables.pdf" are not produced anywhere in the
# notebook, so a fresh run could not find them.
left_pdf_path = "new_left_tables.pdf"
right_pdf_path = "new_right_tables.pdf"

df = process_pdf_files(left_pdf_path, right_pdf_path)

# Save the result as CSV
df.to_csv("flights_data(PDF).csv", index=False)