In [12]:
import requests
import json
import pandas as pd
from pandas import json_normalize
import config
from db_manager import DBManager
import os
from rest_api import Api
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
from urllib.parse import quote_plus
from sqlalchemy import create_engine
import itertools
import logging
import pickle
import hashlib


In [13]:
# Notebook-wide logging: INFO and above, rendered as level-line-message.
# NOTE(review): datefmt only applies to %(asctime)s, which the format string
# does not include — confirm whether timestamps were meant to be logged.
log_settings = {
    'level': 'INFO',
    'format': '%(levelname)s-%(lineno)s-%(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**log_settings)
logger = logging.getLogger(__name__)

#### Show all columns without truncating

Today's date

In [14]:
# Reference date (local calendar day) that all travel dates are derived from
today = datetime.now().date()
logger.info(f'Current date: {today}')

INFO-__main__:2-Current date: 2023-11-12


Adding months to today's date. Near term travel dates 30 - 60 days ahead

In [15]:
# Departures are one and two months out; returns are two and three months out,
# so the first return date coincides with the second departure date.
departure_date_1_month, departure_date_2_month, return_date_2_month = (
    today + relativedelta(months=month_offset) for month_offset in (1, 2, 3)
)
return_date_1_month = departure_date_2_month
logger.info(f'Departure_dates:{departure_date_1_month} and {departure_date_2_month} \nReturn dates:{return_date_1_month} and {return_date_2_month}')

INFO-__main__:5-Departure_dates:2023-12-12 and 2024-01-12 
Return dates:2024-01-12 and 2024-02-12


Apply the combination of params to get data for all possible combinations of depart, arrival dates and depart arrival airports

The combinations are built with `itertools.product`. I'm calculating the total number of combinations by multiplying the number of possibilities (unique values) for each variable<br />
Departure airport: 2 possibilities<br />
Arrival airport: 4 possibilities<br />
Departure date: 2 possibilities<br />
Arrival date: 1 possibilities<br />
Total combinations = 2 x 4 x 2 x 1 = 16 combinations

#### Travel payouts API to get flight search results for each combination in the list 

--------------------------------------------------------

API test for amadeus

POST request to the Amadeus authorization server to get the access token

In [16]:
# Request an access token from the authorization server. The timeout keeps the
# notebook from hanging indefinitely if the server does not answer.
response = requests.post(
    url=config.url_token,
    headers=config.headers_token,
    data=config.data,
    timeout=30,
)

if response.status_code == 200:
    # API call succeeded
    token = response.json()['access_token']
    logger.info('Successful access token generation')
else:
    # API call failed: stop here, otherwise `token` stays undefined and the
    # next cell fails with an unrelated NameError.
    logger.error(f'Access token request failed with status {response.status_code}: {response.text}')
    raise RuntimeError(f'Access token request failed with status {response.status_code}')

INFO-__main__:6-Successful access token generation


In [17]:
#Combinations of destination and origin airport codes
origins = config.params['originLocationCode']
destinations = config.params['destinationLocationCode']
departure_dates=[departure_date_1_month,departure_date_2_month]
# return_dates=[return_date_1_month,return_date_2_month]

Creating combinations of Origin and Destination to be passed as parameters to the api call

trial

In [18]:
# Headers sent with every flight-offer search request.
# NOTE(review): the client id/secret are sent next to the bearer token on every
# call — confirm the API actually needs them here.
headers = {
    'client_id': config.client_id,
    'client_secret': config.client_secret,
    'Authorization': f'Bearer {token}'
}

url = config.url

# `response` collects the parsed JSON body of every successful search and
# `counter` counts them, so counter == len(response) once the loop finishes.
response = []
counter = 0

# One request per (origin, destination, departure date) combination
for origin, dest, depart in itertools.product(origins, destinations, departure_dates):

    params = {
        'originLocationCode': origin,
        'destinationLocationCode': dest,
        'departureDate': depart,
        'returnDate': None,
        'adults': 1,
        'children': None,
        'infants': None,
        'travelClass': None,
        'currencyCode': 'EUR',
        'maxPrice': None
    }

    # TODO: route this through the rest_api module
    # (api = Api(); api.make_flight_api_request(url, headers, params))
    resp = requests.get(url, headers=headers, params=params, timeout=30)

    if resp.status_code != 200:
        # Keep failed calls out of `response`: their bodies do not have the
        # structure the later cells read via ['data'].
        logger.warning(f'API call failed for {origin}->{dest} on {depart}: status {resp.status_code} {resp.text}')
        continue

    counter = counter + 1
    response.append(resp.json())
    logger.info(f'API call {counter} succeeded for flight offers')


INFO-__main__:35-API call 1 succeeded for flight offers
INFO-__main__:35-API call 2 succeeded for flight offers
INFO-__main__:35-API call 3 succeeded for flight offers
INFO-__main__:35-API call 4 succeeded for flight offers
INFO-__main__:35-API call 5 succeeded for flight offers
INFO-__main__:35-API call 6 succeeded for flight offers
INFO-__main__:35-API call 7 succeeded for flight offers
INFO-__main__:35-API call 8 succeeded for flight offers
INFO-__main__:35-API call 9 succeeded for flight offers
INFO-__main__:35-API call 10 succeeded for flight offers
INFO-__main__:35-API call 11 succeeded for flight offers
INFO-__main__:35-API call 12 succeeded for flight offers
INFO-__main__:35-API call 13 succeeded for flight offers
INFO-__main__:35-API call 14 succeeded for flight offers
INFO-__main__:35-API call 15 succeeded for flight offers
INFO-__main__:35-API call 16 succeeded for flight offers
INFO-__main__:35-API call 17 succeeded for flight offers
INFO-__main__:35-API call 18 succeeded f

In [11]:
# Show how many API calls the search loop counted
counter

24

### Saving the Global variable to a file

### Creating the payload for real time price script

In [19]:

# Skeleton of the flight-offers-pricing request body; the offers returned by
# the searches are collected under data.flightOffers.
payload = {
  "data": {
    "type": "flight-offers-pricing",
    "flightOffers": [],
    "travelers": []
  }
}

for search_result in response:
    # A response without a 'data' entry contributes no offers instead of
    # aborting the cell with a KeyError.
    payload['data']['flightOffers'].extend(search_result.get('data') or [])

# Convert payload to JSON string for request
json_payload = json.dumps(payload)

In [20]:
# Display the serialized payload (the full JSON string is rendered)
json_payload

'{"data": {"type": "flight-offers-pricing", "flightOffers": [{"type": "flight-offer", "id": "1", "source": "GDS", "instantTicketingRequired": false, "nonHomogeneous": false, "oneWay": false, "lastTicketingDate": "2023-11-30", "lastTicketingDateTime": "2023-11-30", "numberOfBookableSeats": 1, "itineraries": [{"duration": "PT10H30M", "segments": [{"departure": {"iataCode": "BER", "terminal": "1", "at": "2023-12-12T17:05:00"}, "arrival": {"iataCode": "VIE", "terminal": "3", "at": "2023-12-12T18:20:00"}, "carrierCode": "OS", "number": "232", "aircraft": {"code": "320"}, "operating": {"carrierCode": "OS"}, "duration": "PT1H15M", "id": "165", "numberOfStops": 0, "blacklistedInEU": false}, {"departure": {"iataCode": "VIE", "terminal": "3", "at": "2023-12-12T20:15:00"}, "arrival": {"iataCode": "DEL", "terminal": "3", "at": "2023-12-13T08:05:00"}, "carrierCode": "AI", "number": "154", "aircraft": {"code": "788"}, "operating": {"carrierCode": "AI"}, "duration": "PT7H20M", "id": "166", "numberOfS

In [13]:
# Alias the JSON payload string under the name that gets pickled below
flight_search_response =json_payload

In [13]:
# Persist the serialized search payload to disk as a pickle
with open('global_variable.pkl', mode='wb') as pickle_out:
    pickle.dump(flight_search_response, pickle_out)

Convert that sample nested JSON data into separate DataFrames

Repeat the same above procedure for itineraries and traveler_pricing

### Offers

In [14]:
# Flatten every search result into one row per itinerary, carrying the offer's
# 'id' and 'source' along as metadata. response[...]['data'] is the list of
# offers returned by one search.
# Iterating over `response` directly (instead of range(counter)) keeps this
# cell correct even if `counter` is stale, and a single concat avoids
# re-copying the growing frame on every iteration.
offer_frames = [
    pd.json_normalize(search_result['data'], record_path=['itineraries'], meta=['id', 'source'])
    for search_result in response
    if search_result.get('data')  # skip results that carry no offers
]
offer_df = pd.concat(offer_frames, ignore_index=True) if offer_frames else pd.DataFrame()

Checking what the offer df contains

To get the first departure airport and the last arrival airport, take the first and last entry of each row's segments list in the itineraries df

In [15]:
# (segment position, endpoint, field) behind each derived column: the first
# segment supplies the departure side, the last segment the arrival side.
offer_endpoint_fields = {
    'first_dept_airport': (0, 'departure', 'iataCode'),
    'last_arr_airport': (-1, 'arrival', 'iataCode'),
    'first_dept_time': (0, 'departure', 'at'),
    'last_arr_time': (-1, 'arrival', 'at'),
}
for target_column, (position, endpoint, field) in offer_endpoint_fields.items():
    offer_df[target_column] = offer_df['segments'].apply(
        lambda segments, p=position, e=endpoint, f=field: segments[p][e][f]
    )

Creating a composite key

Think About a unique key that you will use to join pricing table and segments table together to get the price of the flight

In [16]:
# The normalized 'id' column holds the offer's id; name it accordingly
offer_df = offer_df.rename(columns={'id': 'offer_id'})

Trying another unique id

In [17]:
def generate_unique_id(row):
    """Return an MD5 hex digest identifying one itinerary row.

    The digest is computed over the row's offer_id, first departure airport,
    last arrival airport, first departure time and last arrival time, joined
    with underscores in that order.
    """
    key_fields = (
        'offer_id',
        'first_dept_airport',
        'last_arr_airport',
        'first_dept_time',
        'last_arr_time',
    )
    joined_key = '_'.join(format(row[field]) for field in key_fields)
    return hashlib.md5(joined_key.encode()).hexdigest()

# Compute the hash key row by row (axis=1 passes each row to the function)
offer_df['unique_id'] = offer_df.apply(generate_unique_id, axis=1)

In [18]:
# The raw segments list is no longer needed once the endpoints are extracted
offer_df = offer_df.drop(columns='segments')

In [19]:
# Column layout for the flight-offers table, key column first
desired_order = [
    'unique_id',
    'offer_id',
    'source',
    'first_dept_airport',
    'last_arr_airport',
    'first_dept_time',
    'last_arr_time',
    'duration',
]

# Keep exactly these columns, in this order
offer_df = offer_df.loc[:, desired_order]

In [20]:
# Parse both journey timestamps from strings into datetime values
for time_column in ('first_dept_time', 'last_arr_time'):
    offer_df[time_column] = pd.to_datetime(offer_df[time_column])

In [21]:
# Festival columns separate festival dates from near-future dates; every row
# of this load gets the non-festival defaults (flag 0, no festival name).
for festival_column, default_value in (('festival_flag', 0), ('festival', None)):
    offer_df[festival_column] = default_value

In [22]:
# Stamp every offer row with the moment this load ran
now = datetime.now()
offer_df = offer_df.assign(incremental_day=now)
offer_df['incremental_day'] = pd.to_datetime(offer_df['incremental_day'])

In [23]:
# Inspect the last rows of the finished offers table
offer_df.tail()

Unnamed: 0,unique_id,offer_id,source,first_dept_airport,last_arr_airport,first_dept_time,last_arr_time,duration,festival_flag,festival,incremental_day
2289,666c81f4319c6b6b8f8304de4b26a0c4,54,GDS,MUC,MAA,2024-01-11 21:30:00,2024-01-13 02:15:00,PT24H15M,0,,2023-11-11 09:43:31.176165
2290,acbf8d0df3198ff884b55b92f680cf6e,55,GDS,MUC,MAA,2024-01-11 21:30:00,2024-01-13 08:15:00,PT30H15M,0,,2023-11-11 09:43:31.176165
2291,e5b1ea5b58911ad0d06bff3c7856bc38,56,GDS,MUC,MAA,2024-01-11 09:10:00,2024-01-11 23:00:00,PT9H20M,0,,2023-11-11 09:43:31.176165
2292,d3b9423ee724e13f41d9ea2baab87b14,57,GDS,MUC,MAA,2024-01-11 11:05:00,2024-01-12 02:50:00,PT11H15M,0,,2023-11-11 09:43:31.176165
2293,43052277708792e69c797bca41b46762,58,GDS,MUC,MAA,2024-01-11 21:45:00,2024-01-12 14:00:00,PT11H45M,0,,2023-11-11 09:43:31.176165


### Itineraries segments

In [24]:
# Flatten every search result into one row per itinerary, carrying the offer's
# 'id' and 'source' along as metadata. response[...]['data'] is the list of
# offers returned by one search.
# Iterating over `response` directly (instead of range(counter)) keeps this
# cell correct even if `counter` is stale, and a single concat avoids
# re-copying the growing frame on every iteration.
itinerary_frames = [
    pd.json_normalize(search_result['data'], record_path=['itineraries'], meta=['id', 'source'])
    for search_result in response
    if search_result.get('data')  # skip results that carry no offers
]
itineraries_df = pd.concat(itinerary_frames, ignore_index=True) if itinerary_frames else pd.DataFrame()

In [25]:
# (segment position, endpoint, field) behind each derived column: the first
# segment supplies the departure side, the last segment the arrival side.
itinerary_endpoint_fields = {
    'first_dept_airport': (0, 'departure', 'iataCode'),
    'last_arr_airport': (-1, 'arrival', 'iataCode'),
    'first_dept_time': (0, 'departure', 'at'),
    'last_arr_time': (-1, 'arrival', 'at'),
}
for target_column, (position, endpoint, field) in itinerary_endpoint_fields.items():
    itineraries_df[target_column] = itineraries_df['segments'].apply(
        lambda segments, p=position, e=endpoint, f=field: segments[p][e][f]
    )

Trying to generate the unique id before exploding

In [26]:
def generate_unique_id(row):
    """Return an MD5 hex digest identifying one itinerary row.

    The digest is computed over the row's id, first departure airport, last
    arrival airport, first departure time and last arrival time, joined with
    underscores in that order.
    """
    key_fields = (
        'id',
        'first_dept_airport',
        'last_arr_airport',
        'first_dept_time',
        'last_arr_time',
    )
    joined_key = '_'.join(format(row[field]) for field in key_fields)
    return hashlib.md5(joined_key.encode()).hexdigest()

# Hash key per itinerary row, computed before the segments are exploded
itineraries_df['unique_id_fk'] = itineraries_df.apply(generate_unique_id, axis=1)

Treating the itineraries df

Just taking the exploded df and carefully scanning each row in the top 5 rows to see what kind of values does the exploded version of offer df contains
exploded df
df2
df

In [27]:
# Explode the 'segments' column to create one row per segment dictionary.
# explode() repeats the parent row's index label for every segment; resetting
# the index gives each segment row its own label. Without that, the
# index-on-index merge below pairs every exploded row with every segment that
# shares its label, multiplying rows per itinerary.
exploded_df = itineraries_df.explode('segments').reset_index(drop=True)

# Split each segment dictionary into columns
df2 = exploded_df['segments'].apply(pd.Series)

# Attach the segment columns to their row; names present on both sides
# ('duration', 'id') get the '_original' / '_segments' suffixes.
df = exploded_df.merge(df2, right_index=True, left_index=True, suffixes=('_original', '_segments'))

# Drop the original dictionary column and name the itinerary-level fields
df = df.drop(columns=['segments'])
df = df.rename(columns={'duration_original': 'duration_total', 'id_original': 'offer_id'})

In [28]:
# Pull the airport code and timestamp out of the nested departure/arrival dicts
segment_endpoint_fields = {
    'dept_airport': ('departure', 'iataCode'),
    'dept_at': ('departure', 'at'),
    'arrival_airport': ('arrival', 'iataCode'),
    'arrival_at': ('arrival', 'at'),
}
for target_column, (endpoint, field) in segment_endpoint_fields.items():
    df[target_column] = df[endpoint].apply(lambda node, f=field: node[f])
df['aircraft'] = df['aircraft'].apply(lambda craft: craft['code'])

# Drop stops column if it exists
if 'stops' in df.columns:
    df = df.drop(columns='stops')

# Remove the nested source columns and the itinerary-level endpoint helpers,
# then collapse identical rows and renumber the index.
itineraries_df = (
    df.drop(columns=[
        'departure',
        'arrival',
        'operating',
        'aircraft',
        'blacklistedInEU',
        'first_dept_airport',
        'last_arr_airport',
        'first_dept_time',
        'last_arr_time',
    ])
    .drop_duplicates()
    .reset_index(drop=True)
)
itineraries_df.head()

Unnamed: 0,duration_total,offer_id,source,unique_id_fk,carrierCode,number,duration_segments,id_segments,numberOfStops,dept_airport,dept_at,arrival_airport,arrival_at
0,PT10H35M,1,GDS,9a6cd24faa6a82e8afa41e39eaf7723b,KL,1824,PT1H25M,156,0,BER,2023-12-11T11:50:00,AMS,2023-12-11T13:15:00
1,PT10H35M,1,GDS,9a6cd24faa6a82e8afa41e39eaf7723b,KL,109,PT8H15M,157,0,AMS,2023-12-11T14:10:00,DEL,2023-12-12T01:55:00
2,PT16H25M,2,GDS,364f511553003d1146d24e0e38de0d2c,KL,1818,PT1H25M,130,0,BER,2023-12-11T06:00:00,AMS,2023-12-11T07:25:00
3,PT16H25M,2,GDS,364f511553003d1146d24e0e38de0d2c,KL,109,PT8H15M,131,0,AMS,2023-12-11T14:10:00,DEL,2023-12-12T01:55:00
4,PT25H15M,3,GDS,05d1f2d46b2a769d7f9530eaad6bd821,KL,1834,PT1H30M,103,0,BER,2023-12-11T21:10:00,AMS,2023-12-11T22:40:00


In [29]:
# Segment key = itinerary hash + '_' + segment id
itineraries_df['segment_key'] = (
    itineraries_df['unique_id_fk'].astype(str)
    .str.cat(itineraries_df['id_segments'].astype(str), sep='_')
)

Create unique id for segments df

Renaming, reordering and changing datatypes

In [30]:
# Rename to the SQL table's snake_case names. Only these two columns actually
# change name; every other column already matches.
itineraries_df = itineraries_df.rename(columns={
    'carrierCode': 'carrier_code',
    'numberOfStops': 'number_of_stops',
})

# Cast the integer and timestamp columns to their target types
for int_column in ('offer_id', 'number_of_stops'):
    itineraries_df[int_column] = itineraries_df[int_column].astype(int)
for time_column in ('dept_at', 'arrival_at'):
    itineraries_df[time_column] = pd.to_datetime(itineraries_df[time_column])

# Stamp every segment row with the moment this load ran.
# NOTE(review): `now` is sampled again here, so this timestamp differs
# slightly from the one taken for offer_df — confirm that is intended.
now = datetime.now()
itineraries_df = itineraries_df.assign(incremental_day=now)
itineraries_df['incremental_day'] = pd.to_datetime(itineraries_df['incremental_day'])


In [31]:
# Log the load timestamp currently held in `now`
logger.info(f'Current date: {now}')

INFO-__main__:1-Current date: 2023-11-11 09:43:33.777006


In [32]:
# Column layout for the itineraries table, key column first
new_column_order = [
    'segment_key', 'duration_total', 'offer_id', 'source', 'unique_id_fk',
    'carrier_code', 'number', 'duration_segments', 'id_segments',
    'number_of_stops', 'dept_airport', 'dept_at', 'arrival_airport',
    'arrival_at', 'incremental_day',
]

# Keep exactly these columns, in this order
itineraries_df = itineraries_df.loc[:, new_column_order]


In [33]:
# Inspect the first rows of the finished itineraries table
itineraries_df.head()

Unnamed: 0,segment_key,duration_total,offer_id,source,unique_id_fk,carrier_code,number,duration_segments,id_segments,number_of_stops,dept_airport,dept_at,arrival_airport,arrival_at,incremental_day
0,9a6cd24faa6a82e8afa41e39eaf7723b_156,PT10H35M,1,GDS,9a6cd24faa6a82e8afa41e39eaf7723b,KL,1824,PT1H25M,156,0,BER,2023-12-11 11:50:00,AMS,2023-12-11 13:15:00,2023-11-11 09:43:33.777006
1,9a6cd24faa6a82e8afa41e39eaf7723b_157,PT10H35M,1,GDS,9a6cd24faa6a82e8afa41e39eaf7723b,KL,109,PT8H15M,157,0,AMS,2023-12-11 14:10:00,DEL,2023-12-12 01:55:00,2023-11-11 09:43:33.777006
2,364f511553003d1146d24e0e38de0d2c_130,PT16H25M,2,GDS,364f511553003d1146d24e0e38de0d2c,KL,1818,PT1H25M,130,0,BER,2023-12-11 06:00:00,AMS,2023-12-11 07:25:00,2023-11-11 09:43:33.777006
3,364f511553003d1146d24e0e38de0d2c_131,PT16H25M,2,GDS,364f511553003d1146d24e0e38de0d2c,KL,109,PT8H15M,131,0,AMS,2023-12-11 14:10:00,DEL,2023-12-12 01:55:00,2023-11-11 09:43:33.777006
4,05d1f2d46b2a769d7f9530eaad6bd821_103,PT25H15M,3,GDS,05d1f2d46b2a769d7f9530eaad6bd821,KL,1834,PT1H30M,103,0,BER,2023-12-11 21:10:00,AMS,2023-12-11 22:40:00,2023-11-11 09:43:33.777006


Segment key unique check

Finding out which columns in a dataframe are lists 

In [34]:
# Assuming deduplicated_df is your DataFrame
# list_columns = []

# for column in final_df.columns:
#     if final_df[column].apply(lambda x: isinstance(x, list)).any():
#         list_columns.append(column)

# print("Columns with lists:", list_columns)

### Pricing flights

In [35]:
# Flatten every search result into one row per offer.
# response[...]['data'] is the list of offers returned by one search.
# Iterating over `response` directly (instead of range(counter)) keeps this
# cell correct even if `counter` is stale, and a single concat avoids
# re-copying the growing frame on every iteration.
# NOTE(review): `offer` is built exactly like `pricing_df` in the next cell and
# is not referenced by any later cell visible here — confirm it is still needed.
offer_level_frames = [
    pd.json_normalize(search_result['data'], meta=['id', 'source'])
    for search_result in response
    if search_result.get('data')  # skip results that carry no offers
]
offer = pd.concat(offer_level_frames, ignore_index=True) if offer_level_frames else pd.DataFrame()

In [36]:
# Flatten every search result into one row per offer; nested objects such as
# the price become dotted column names that later cells strip.
# response[...]['data'] is the list of offers returned by one search.
# Iterating over `response` directly (instead of range(counter)) keeps this
# cell correct even if `counter` is stale, and a single concat avoids
# re-copying the growing frame on every iteration.
pricing_frames = [
    pd.json_normalize(search_result['data'], meta=['id', 'source'])
    for search_result in response
    if search_result.get('data')  # skip results that carry no offers
]
pricing_df = pd.concat(pricing_frames, ignore_index=True) if pricing_frames else pd.DataFrame()

In [37]:
# Offer-level attributes that are not kept in the pricing table
unused_offer_columns = [
    'instantTicketingRequired',
    'nonHomogeneous',
    'oneWay',
    'lastTicketingDate',
    'lastTicketingDateTime',
    'numberOfBookableSeats',
    'validatingAirlineCodes',
    'travelerPricings',
]
pricing_df = pricing_df.drop(columns=unused_offer_columns)


In [38]:
# (segment position, endpoint, field) behind each derived column. All values
# are read from the offer's first itinerary: its first segment supplies the
# departure side, its last segment the arrival side.
pricing_endpoint_fields = {
    'first_dept_airport': (0, 'departure', 'iataCode'),
    'last_arr_airport': (-1, 'arrival', 'iataCode'),
    'first_dept_time': (0, 'departure', 'at'),
    'last_arr_time': (-1, 'arrival', 'at'),
}
for target_column, (position, endpoint, field) in pricing_endpoint_fields.items():
    pricing_df[target_column] = pricing_df['itineraries'].apply(
        lambda itineraries, p=position, e=endpoint, f=field: itineraries[0]['segments'][p][e][f]
    )

# The nested 'itineraries' column is no longer needed
pricing_df = pricing_df.drop(columns=['itineraries'])

In [39]:
def generate_unique_id(row):
    """Return an MD5 hex digest linking a pricing row to its itinerary.

    The digest is computed over the row's id, first departure airport, last
    arrival airport, first departure time and last arrival time, joined with
    underscores in that order.
    """
    key_fields = (
        'id',
        'first_dept_airport',
        'last_arr_airport',
        'first_dept_time',
        'last_arr_time',
    )
    joined_key = '_'.join(format(row[field]) for field in key_fields)
    return hashlib.md5(joined_key.encode()).hexdigest()

# Hash key per pricing row, built from the same fields as the itinerary key
pricing_df['unique_id_fk'] = pricing_df.apply(generate_unique_id, axis=1)

#### Remove "price" and dots from column names


In [40]:
def _strip_price_prefixes(column_name):
    """Remove 'price.', 'price' and 'pricingOptions.' from a column label."""
    # Fragments are removed in the same order as the original chained replaces
    for fragment in ('price.', 'price', 'pricingOptions.', 'price'):
        column_name = column_name.replace(fragment, '')
    return column_name


pricing_df = pricing_df.rename(columns=_strip_price_prefixes)

In [41]:
# Drop the columns that are not stored in the pricing table: service and fee
# details, offer metadata, and the route fields that were only needed to
# build unique_id_fk.
pricing_df = pricing_df.drop(columns=[
    'additionalServices',
    'fees',
    'type',
    'id',
    'source',
    'first_dept_time',
    'last_arr_time',
    'last_arr_airport',
    'first_dept_airport',
])

In [42]:

# Stamp every pricing row with the load timestamp already held in `now`
pricing_df = pricing_df.assign(incremental_day=now)
pricing_df['incremental_day'] = pd.to_datetime(pricing_df['incremental_day'])


In [43]:
# Switch the remaining camelCase price columns to snake_case
pricing_df = pricing_df.rename(columns={
    'grandTotal': 'grand_total',
    'fareType': 'fare_type',
    'includedCheckedBagsOnly': 'included_checkedbags_only',
})

In [44]:
# Inspect the first rows of the finished pricing table
pricing_df.head()

Unnamed: 0,currency,total,base,grand_total,fare_type,included_checkedbags_only,unique_id_fk,incremental_day
0,EUR,479.48,259.0,479.48,[PUBLISHED],False,9a6cd24faa6a82e8afa41e39eaf7723b,2023-11-11 09:43:33.777006
1,EUR,479.48,259.0,479.48,[PUBLISHED],False,364f511553003d1146d24e0e38de0d2c,2023-11-11 09:43:33.777006
2,EUR,479.48,259.0,479.48,[PUBLISHED],False,05d1f2d46b2a769d7f9530eaad6bd821,2023-11-11 09:43:33.777006
3,EUR,479.48,259.0,479.48,[PUBLISHED],False,edd508caa952605c195f19502ca3eb20,2023-11-11 09:43:33.777006
4,EUR,479.48,259.0,479.48,[PUBLISHED],False,a3d243a634cf4ac35736c54cb43dbe11,2023-11-11 09:43:33.777006


In [45]:
# Write the three prepared frames to their database tables, in this order
credentials = config.pg_credentials
db = DBManager(credentials=credentials)
tables_to_load = (
    ('flight_offers', offer_df),
    ('itineraries', itineraries_df),
    ('pricing', pricing_df),
)
for table_name, frame in tables_to_load:
    db.create_table_from_df(df=frame, table_name=table_name)

INFO-db_manager:29-CONNECTED
INFO-db_manager:90-Dumped to DB


INFO-db_manager:90-Dumped to DB
INFO-db_manager:90-Dumped to DB
