In [39]:
import requests
import json
import pandas as pd
from pandas import json_normalize
import config
from db_manager import DBManager
import os
from rest_api import Api
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
from urllib.parse import quote_plus
from sqlalchemy import create_engine
import itertools
import logging

In [40]:
logging.basicConfig(
    format='%(levelname)s-%(lineno)s-%(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level='INFO'
)
logger=logging.getLogger(__name__)

#### Show all columns without truncating

Today's date

In [41]:
today = date.today()
logger.info(f'Current date: {today}')

INFO-__main__:2-Current date: 2023-10-17


Adding months to today's date. Near term travel dates 30 - 60 days ahead

In [42]:
departure_date_1_month = today + relativedelta(months=1)
departure_date_2_month = today + relativedelta(months=2)
return_date_1_month = departure_date_2_month
return_date_2_month = today + relativedelta(months=3)
logger.info(f'Departure_dates:{departure_date_1_month} and {departure_date_2_month} \nReturn dates:{return_date_1_month} and {return_date_2_month}')

INFO-__main__:5-Departure_dates:2023-11-17 and 2023-12-17 
Return dates:2023-12-17 and 2024-01-17


Get the nearby festival departure dates by start_date from the public.indian_holidays table

In [43]:
# credentials=config.pg_credentials
# db = DBManager(credentials=credentials)
# df_departure_arrival_dates = db.run_query(query_file_name='festival_dates')

In [44]:
#df_departure_arrival_dates 

Apply the combination of params to get data for all possible combinations of depart, arrival dates and depart arrival airports

Combination itertools usage. Im calculating the total combinations by multiplying the number of possibilities (unique values) for each variable<br />
Departure airport: 2 possibilities<br />
Arrival airport: 4 possibilities<br />
Departure date: 2 possibilities<br />
Arrival date: 1 possibilities<br />
Total combinations = 2 x 4 x 2 x 1 = 16 combinations

#### Travel payouts API to get flight search results for each combination in the list 

--------------------------------------------------------

API test for amadeus

POST request to the Amadeus authorization server to get the access token

In [45]:
response = requests.post(url=config.url_token, headers=config.headers_token, data=config.data)

if response.status_code == 200:
  # API call succeeded
  token = response.json()['access_token'] 
  logger.info('Successful access token generation')
else:
  # API call failed
  print(response.text)

INFO-__main__:6-Successful access token generation


In [46]:
#Combinations of destination and origin airport codes
origins = config.params['originLocationCode']
destinations = config.params['destinationLocationCode']
departure_dates=[departure_date_1_month,departure_date_2_month]
# return_dates=[return_date_1_month,return_date_2_month]

Creating combinations of Origin and Destination to be passed as parameters to the api call

trial

In [47]:
headers = {
  'client_id': config.client_id, 
  'client_secret': config.client_secret,
  'Authorization': f'Bearer {token}'
  }

url = config.url

response = []
counter = 0

for origin, dest, depart in itertools.product(origins, destinations, departure_dates):

    params={
        'originLocationCode': origin,
        'destinationLocationCode':dest, 
        'departureDate': depart, 
        'returnDate' : None,
        'adults':1,
        'children':None,
        'infants':None,
        'travelClass':None,
        'currencyCode':'EUR',
        'maxPrice' : None
        }


    #to be worked on later for by calling rest_api module
    # api = Api()
    # resp = api.make_flight_api_request(url,headers,params)
    resp = requests.get(url, headers=headers, params=params)
    counter = counter+1
    resp = resp.json()
    response.append(resp)
    logger.info(f'API call {counter} succeeded for flight offers')
    # else:
    #     # API call failed
    #     logger.info('API call failed')


INFO-__main__:35-API call 1 succeeded for flight offers
INFO-__main__:35-API call 2 succeeded for flight offers
INFO-__main__:35-API call 3 succeeded for flight offers
INFO-__main__:35-API call 4 succeeded for flight offers
INFO-__main__:35-API call 5 succeeded for flight offers
INFO-__main__:35-API call 6 succeeded for flight offers
INFO-__main__:35-API call 7 succeeded for flight offers
INFO-__main__:35-API call 8 succeeded for flight offers
INFO-__main__:35-API call 9 succeeded for flight offers
INFO-__main__:35-API call 10 succeeded for flight offers
INFO-__main__:35-API call 11 succeeded for flight offers
INFO-__main__:35-API call 12 succeeded for flight offers
INFO-__main__:35-API call 13 succeeded for flight offers
INFO-__main__:35-API call 14 succeeded for flight offers
INFO-__main__:35-API call 15 succeeded for flight offers
INFO-__main__:35-API call 16 succeeded for flight offers


Convert that sample nested JSON data into separate DataFrames

In [81]:
# Flight offers DataFrame
for r in response: 
    offers_df = pd.DataFrame(r['data'])
    # Itineraries DataFrame
    itineraries_df = pd.json_normalize(r['data'], record_path='itineraries', meta=['id', 'source'])
    # # Traveler pricing DataFrame
    traveler_pricing_df = pd.json_normalize(r['data'], record_path=['travelerPricings'], meta=['id', 'source'])


Treating the itineraries df

In [82]:
# Explode the 'segment' column to create separate rows for each dictionary
exploded_df = itineraries_df.explode('segments')

# Split the dictionary into columns  
df2 = exploded_df['segments'].apply(pd.Series)

# Merge the new columns back to the original DataFrame
df = exploded_df.merge(df2, right_index=True, left_index=True, suffixes=('_original', '_segments'))

# Drop the original dictionary column
df = df.drop(columns=['segments']) 
df = df.rename(columns={'duration_original': 'duration_total'})

In [83]:
# Extract departure keys using apply
df['dept_airport'] = df['departure'].apply(lambda x: x['iataCode'])
df['dept_at'] = df['departure'].apply(lambda x: x['at'])
df['arrival_airport'] = df['arrival'].apply(lambda x: x['iataCode'])
df['arrival_at'] = df['arrival'].apply(lambda x: x['at'])
df['aircraft'] = df['aircraft'].apply(lambda x: x['code'])
df.head()
# Drop stops column if it exists
column_to_drop = 'stops'
if column_to_drop in df.columns:
    df.drop(column_to_drop,axis=1,inplace=True)
itineraries_df = df.drop(columns=['departure','arrival','operating','aircraft','blacklistedInEU','id_segments'])

itineraries_df = itineraries_df.drop_duplicates()
# Reset index
itineraries_df = itineraries_df.reset_index(drop=True)
itineraries_df.head()

Unnamed: 0,duration_total,id_original,source,carrierCode,number,duration_segments,numberOfStops,dept_airport,dept_at,arrival_airport,arrival_at
0,PT12H10M,1,GDS,AI,120,PT7H35M,0,FRA,2023-12-17T20:10:00,DEL,2023-12-18T08:15:00
1,PT12H10M,1,GDS,AI,429,PT2H55M,0,DEL,2023-12-18T09:55:00,MAA,2023-12-18T12:50:00
2,PT14H5M,2,GDS,AI,120,PT7H35M,0,FRA,2023-12-17T20:10:00,DEL,2023-12-18T08:15:00
3,PT14H5M,2,GDS,AI,554,PT2H45M,0,DEL,2023-12-18T12:00:00,MAA,2023-12-18T14:45:00
4,PT18H35M,3,GDS,AI,120,PT7H35M,0,FRA,2023-12-17T20:10:00,DEL,2023-12-18T08:15:00


Renaming, reordering and changing datatypes

In [84]:
# Rename the columns according to the SQL table schema
itineraries_df = itineraries_df.rename(columns={
    'id_original': 'id_original',
    'source': 'source',
    'duration_segments': 'duration_segments',
    'duration_total': 'duration_total',
    'carrierCode': 'carrier_code',
    'number': 'number',
    'duration_segments': 'duration_segments',
    'numberOfStops': 'number_of_stops',
    'dept_airport': 'dept_airport',
    'dept_at': 'dept_at',
    'arrival_airport': 'arrival_airport',
    'arrival_at': 'arrival_at'
})

# Ensure data types are set correctly
itineraries_df['id_original'] = itineraries_df['id_original'].astype(int)
itineraries_df['number_of_stops'] = itineraries_df['number_of_stops'].astype(int)
itineraries_df['dept_at'] = pd.to_datetime(itineraries_df['dept_at'])
itineraries_df['arrival_at'] = pd.to_datetime(itineraries_df['arrival_at'])

now = datetime.now()
itineraries_df['incremental_day']=now
itineraries_df['incremental_day'] = pd.to_datetime(itineraries_df['incremental_day'])


In [85]:
logger.info(f'Current date: {now}')

INFO-__main__:1-Current date: 2023-10-17 08:27:06.659224


Finding out which columns in a dataframe are lists 

In [86]:
# Assuming deduplicated_df is your DataFrame
# list_columns = []

# for column in final_df.columns:
#     if final_df[column].apply(lambda x: isinstance(x, list)).any():
#         list_columns.append(column)

# print("Columns with lists:", list_columns)

Treating traveler_pricing_df

In [87]:
# Explode the 'segment' column to create separate rows for each dictionary
exploded_df = traveler_pricing_df.explode('fareDetailsBySegment')

# Apply json_normalize to the 'segment' column to split dictionaries into separate columns
normalized_df = json_normalize(exploded_df['fareDetailsBySegment']).add_suffix('_fare')

# Join the exploded and normalized DataFrames
final_df = exploded_df.drop('fareDetailsBySegment', axis=1).join(normalized_df)

# Drop duplicates based on all columns to deduplicate the DataFrame
deduplicated_df = final_df.drop_duplicates()

# # If you want to reset the index of the deduplicated DataFrame
deduplicated_df.reset_index(drop=True, inplace=True)
# deduplicated_df=deduplicated_df.drop(columns=['id_segments'])

# # Remove the added '_segment' suffix from column names
deduplicated_df.columns = deduplicated_df.columns.str.replace('_fare$', '', regex=True)

# #replace . in column anmes by underscore
deduplicated_df.columns = deduplicated_df.columns.str.replace('.', '_', regex=False)

# #Assigning to itineraries df
traveler_pricing_df = deduplicated_df

# #drop unwanted columns 
traveler_pricing_df=traveler_pricing_df.drop(columns=['fareBasis','class','brandedFare'])



---------------------------------------------------

Renaming, reordering and changing datatypes

In [88]:
# Rename the columns according to the SQL table schema
traveler_pricing_df = traveler_pricing_df.rename(columns={
    'travelerId': 'traveler_id',
    'fareOption': 'fare_option',
    'travelerType': 'traveler_type',
    'price_currency': 'price_currency',
    'price_total': 'price_total',
    'price_base': 'price_base',
    'id': 'id',
    'source': 'source',
    'segmentId': 'segment_id',
    'cabin': 'cabin',
    'includedCheckedBags_quantity': 'included_checkedbags_quantity',
    'includedCheckedBags_weight': 'included_checkedbags_weight',
    'includedCheckedBags_weightUnit': 'included_checkedbags_unit',
    'incremental_day': 'incremental_day'
})

# Change the data types of relevant columns
traveler_pricing_df['price_total'] = traveler_pricing_df['price_total'].astype(float)
traveler_pricing_df['price_base'] = traveler_pricing_df['price_base'].astype(float)
traveler_pricing_df['id'] = traveler_pricing_df['id'].astype(int)
traveler_pricing_df['segment_id'] = traveler_pricing_df['segment_id'].astype(int)
traveler_pricing_df['included_checkedbags_quantity'] = traveler_pricing_df['included_checkedbags_quantity'].astype(float)


# Ensure data types are set correctly
traveler_pricing_df['incremental_day'] =now
traveler_pricing_df['incremental_day'] = pd.to_datetime(traveler_pricing_df['incremental_day'])


In [89]:
credentials=config.pg_credentials
db = DBManager(credentials=credentials)
db.create_table_from_df(df=itineraries_df,table_name='itineraries')
db.create_table_from_df(df=traveler_pricing_df,table_name='traveler_pricing')

INFO-db_manager:27-CONNECTED
INFO-db_manager:73-Dumped to DB
INFO-db_manager:73-Dumped to DB
