<a href="https://colab.research.google.com/github/TamizharasanG/Travel-and-Tourism-Clarity-TTS/blob/main/Synthetic_Travel_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [410]:
!pip install names
!pip install geopy

In [411]:
# Import packages
import names
import tqdm

import random
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta as td
from dateutil.relativedelta import relativedelta
from geopy.distance import geodesic

**Definitions**

In [308]:
dt.now() + relativedelta(years=- 2)

datetime.datetime(2021, 4, 22, 10, 9, 27, 644151)

In [329]:
#- Companies and Users
defGenders = ['male', 'female', 'none']
defAgesInterval = {'min': 23, 'max': 65}
defFlightsInterval = {'min': 0, 'max': 5}
defCompanies = {
    'MakeMyTrip': {'usersCount': 20175},
    'Trivago': {'usersCount': 21145},
    "Goibibo" : {'usersCount': 20023},
    "Yatra" : {'usersCount': 18524},
    "SOTC" : {'usersCount': 12543},
    "Expedia" : {'usersCount': 11425},
    "Cleartrip" : {'usersCount': 10245}
}

#- Flight Agencies
defFlightTypes = {
    'economic': {'price': 1.0},
    "delux" : {'price': 1.33},
    'premium': {'price': 1.95},
}
defAgenciesName = ['Vistara', 'Air India', 'IndiGo', 'SpiceJet', "Emmirates"]
defAgencies = dict()

#- Places
defPlacesName = ['Maldives', 'Sri Lanka', 'Chennai', 'Manali', "Delhi", "Mumbai", "Jaipur", "Kochi", "Madurai", "Bangkok", "Mysore", "Hydrabad", "Kolkata", "Nepal", "Ladakh", "Combodia", "Kanyakumari", 
                 "Kollam", "Goa", "Bengaluru", "Megalaya", "Varanasi", "Agra", "Jammu City", "Darjeeling", "Nagpur", "Pondicherry", "Rameswaram", "Varkala", "Thailand", "visakhapatnam"]
defPlaces = {name: dict() for name in defPlacesName}
defDistancesInterval = {'min': 500.0, 'max': 2000.0}
defPlaceTravelKmPerHour = 500.0 

#- Lodge (Accommodation)
defLodgesInterval = {'min': 1, 'max': 6}
defLodgesPrices   = {'min': 1000.0, 'max': 4000.0}
defLodgesPrex = 'Hotel'
defLodges = {name: list() for name in defPlacesName}

#- Travel
defTravels = list()
defTravelsDays = {'min': 3, 'max': 10}
defTravelsFlightPrices = {'init': 3000.0, 'interval': 900.0}
defTravelWithLodge = 1
defTravelDate = {'init': dt.now() + relativedelta(years=- 2), 'interval':{'min': 1, 'max': 10}}


**Companies and Users - Generator**

In [330]:
#- Functions
def funcUserGenerator(genders, agesInterval, flightsInterval, code):
    '''
    Generate random user, based on predefinitions.
    - genders: list
    - agesInterval {min, max}: user age
    - flightsInterval {min, max}: number of flights
    - code: user ID
    '''
    user = dict()
    user['code'] = code
    user['gender'] = genders[random.randint(0, len(genders)-1)]
    gender = user['gender'] if (user['gender'] != 'none') else False
    user['name'] = names.get_full_name(gender=gender)
    user['age'] = random.randint(agesInterval['min'], agesInterval['max'])
    user['flights'] = random.randint(flightsInterval['min'], flightsInterval['max'])
    return user

In [331]:
#- Fill Companies data
userId = 0
for company, data in defCompanies.items():
    users = list()
    for idx in range(data['usersCount']):
        user = funcUserGenerator(defGenders, defAgesInterval, defFlightsInterval, userId)
        users.append(user)
        userId += 1
    defCompanies[company]['users'] = users

In [332]:
#defCompanies#['SOTC']['users']

**Flight Agencies - Generator**

In [333]:
#- Functions
def funcAgencyGenerator(flightTypes):
    '''
    Generate random agency services, based on predefinitions.
    - flightTypes: types of flight
    '''
    agency = dict()
    types = list(flightTypes.copy().keys())
    random.shuffle(types)
    typesMany = random.randint(1, len(types))
    agency['types'] = [types[i] for i in range(typesMany)]
    return agency

In [334]:
for agency in defAgenciesName:
    defAgencies[agency] = funcAgencyGenerator(defFlightTypes)

Example - Flight Types of Agencies

In [335]:
defAgencies

{'Vistara': {'types': ['economic']},
 'Air India': {'types': ['economic', 'premium']},
 'IndiGo': {'types': ['delux']},
 'SpiceJet': {'types': ['delux', 'premium', 'economic']},
 'Emmirates': {'types': ['delux', 'economic', 'premium']}}

**Places - Generator**

In [336]:
#- Functions
def funcPlaceGenerator(i, j, distInterval, kmPerHour):
    '''
    Generate random place distances, based on predefinitions.
    - i: number of place
    - j: number of place
    - distInterval {min, max} values: distance range
    - kmPerHour: km per hour of the plain
    '''
    if i == j:
        return False, False, False
    distance = round(random.uniform(distInterval['min'], distInterval['max']), 2)
    time = round(distance/kmPerHour, 2)
    hours = int(time)
    minutes = (time*60) % 60
    timeMsg = '%d:%dh' % (hours, minutes)
    return (distance, time, timeMsg)

In [337]:
n = len(defPlacesName)
for i in range(n):
    for j in range(i, n):
        fromPlace = defPlacesName[i]
        toPlace = defPlacesName[j]
        distance, time, msg = funcPlaceGenerator(i, j, defDistancesInterval, defPlaceTravelKmPerHour)
        if distance and time:
            place = {'distance': distance, 'time': time, 'timeMsg': msg}
            defPlaces[fromPlace][toPlace] = place
            defPlaces[toPlace][fromPlace] = place

Example - Distances from a Place

In [338]:
defPlaces['Chennai']

{'Maldives': {'distance': 1245.73, 'time': 2.49, 'timeMsg': '2:29h'},
 'Srilanka': {'distance': 965.8, 'time': 1.93, 'timeMsg': '1:55h'},
 'Manali': {'distance': 1576.83, 'time': 3.15, 'timeMsg': '3:9h'},
 'Delhi': {'distance': 1841.47, 'time': 3.68, 'timeMsg': '3:40h'},
 'Mumbai': {'distance': 728.71, 'time': 1.46, 'timeMsg': '1:27h'},
 'Jaipur': {'distance': 1890.98, 'time': 3.78, 'timeMsg': '3:46h'},
 'Kochi': {'distance': 1413.9, 'time': 2.83, 'timeMsg': '2:49h'},
 'Madurai': {'distance': 1974.74, 'time': 3.95, 'timeMsg': '3:57h'},
 'Bangkok': {'distance': 1329.9, 'time': 2.66, 'timeMsg': '2:39h'},
 'Mysore': {'distance': 994.62, 'time': 1.99, 'timeMsg': '1:59h'},
 'Hydrabad': {'distance': 1087.65, 'time': 2.18, 'timeMsg': '2:10h'},
 'Kolkata': {'distance': 802.2, 'time': 1.6, 'timeMsg': '1:36h'},
 'Nepal': {'distance': 632.59, 'time': 1.27, 'timeMsg': '1:16h'},
 'Ladakh': {'distance': 632.96, 'time': 1.27, 'timeMsg': '1:16h'},
 'Combodia': {'distance': 780.55, 'time': 1.56, 'timeM

**Lodges - Generator**

In [339]:
#- Definitions
defName = 'A'


#- Functions
def getNextChar(text):
    '''
    Generate order alphabetic.
    - text: input text
    '''
    if len(text) == 0:
        return 'A'
    nextChar = chr(ord(text[-1]) + 1)
    if nextChar <= 'Z':
        text = text[:-1] + nextChar
    else:
        text = getNextChar(text[:-1]) + 'A'
    return text


def funcLodgesGenerator(lodgesInterval, lodgesPrices):
    '''
    Generate random lodges, based on predefinitions.
    - lodgesInterval {min, max} values: number of hotels
    - lodgesPrices {min, max} values: hotel range
    '''
    global defName
    lodges = list()
    n = random.randint(lodgesInterval['min'], lodgesInterval['max'])
    for i in range(n):
        lodgeName = '%s %s' % (defLodgesPrex, defName)
        price = round(random.uniform(lodgesPrices['min'], lodgesPrices['max']), 2)
        lodge = {'code': defName, 'name': lodgeName, 'price': price}
        lodges.append(lodge)
        defName = getNextChar(defName)
    return lodges

In [340]:
for name in defPlacesName:
    lodges = funcLodgesGenerator(defLodgesInterval, defLodgesPrices)
    defLodges[name] = lodges

Example - Hotels from a Place

In [341]:
defLodges['Maldives']

[{'code': 'A', 'name': 'Hotel A', 'price': 3084.12},
 {'code': 'B', 'name': 'Hotel B', 'price': 2082.38},
 {'code': 'C', 'name': 'Hotel C', 'price': 2239.89},
 {'code': 'D', 'name': 'Hotel D', 'price': 3116.07}]

**Travel Possibilities - Generator**

In [342]:
#- Functions
def funcCalculatePrice(priceMin, priceMax, weight):
    '''
    Calculate a random price for a travel.
    - priceMin: min price
    - priceMax: max price
    - weight: weight the price range
    '''
    priceMin = priceMin * weight
    priceMax = priceMax * weight
    price = round(random.uniform(priceMin, priceMax), 2)
    return price


def funcElaborateflight(fromPlace, toPlace, distance, agency, flightType, price, time, timeMsg):
    '''
    Elaborate a possible flight.
    - fromPlace: from
    - toPlace: to
    - distance: distance
    - agency: agency name
    - flightType: flight type
    - price: flight price
    - time: time in hours
    - timeMsg: time calculated
    '''
    flight = {'from': fromPlace, 'to': toPlace, 'distance': distance,
              'agency': agency, 'flightType': flightType, 'price': price,
              'time': time, 'timeMsg': timeMsg}
    return flight


def funcFlightsPossibilities(places, flightPrices, flightTypes, agencies):
    '''
    Elaborate a list of possible flights.
    - places: places data
    - flightPrices: flight prices
    - flightTypes: flight types
    - agencies: agencies data
    '''
    flightsPossibilities = list()
    for fromPlace, toPlaces in places.items():
        toPlacesSorted = sorted(toPlaces.items(), key=lambda x:x[1]['distance'], reverse=False)
        priceA, priceB = flightPrices['init'], \
                         flightPrices['init'] + flightPrices['interval']
        for (toPlace, placeData) in toPlacesSorted:
            for (agencyName, agencyData) in agencies.items():
                if len(agencyData['types']) > 1: # has more than 1 element
                    for typeA in agencyData['types']:
                        weight = flightTypes[typeA]['price']
                        price = funcCalculatePrice(priceA, priceB, weight)
                        flight = funcElaborateflight(fromPlace, toPlace, placeData['distance'], \
                                                     agencyName, typeA, price, placeData['time'], placeData['timeMsg'])
                        flightsPossibilities.append(flight)
                else:
                    typeA = agencyData['types'][0]
                    weight = flightTypes[typeA]['price']
                    price = funcCalculatePrice(priceA, priceB, weight)
                    flight = funcElaborateflight(fromPlace, toPlace, placeData['distance'], agencyName, \
                                                 typeA, price, placeData['time'], placeData['timeMsg'])
                    flightsPossibilities.append(flight)
            # Update prices for bigger distances
            priceA, priceB = priceB, priceB + flightPrices['interval']
    return flightsPossibilities


def funcLodgesPossibilities(placesName, lodges):
    '''
    Elaborate a list of possible hotels.
    - placesName: places names
    - lodges: lodges data
    '''
    lodgesPossibilities = list()
    for place in placesName:
        for lodge in lodges[place]:
            lodge = lodge.copy()
            lodge['place'] = place
            lodgesPossibilities.append(lodge)
    return lodgesPossibilities

Example - Fligts Possibilities (for each Place)

In [343]:
flightsPossibilities = funcFlightsPossibilities(defPlaces, defTravelsFlightPrices, defFlightTypes, defAgencies)
pd.DataFrame(flightsPossibilities).head(5)

Unnamed: 0,from,to,distance,agency,flightType,price,time,timeMsg
0,Maldives,Bangkok,569.3,Vistara,economic,3775.26,1.14,1:8h
1,Maldives,Bangkok,569.3,Air India,economic,3804.01,1.14,1:8h
2,Maldives,Bangkok,569.3,Air India,premium,6041.51,1.14,1:8h
3,Maldives,Bangkok,569.3,IndiGo,delux,4036.65,1.14,1:8h
4,Maldives,Bangkok,569.3,SpiceJet,delux,5074.45,1.14,1:8h


Example - Hotel Possibilities (for each Place)



In [344]:
lodgesPossibilities = funcLodgesPossibilities(defPlacesName, defLodges)


In [345]:
pd.DataFrame(lodgesPossibilities).head(5)


Unnamed: 0,code,name,price,place
0,A,Hotel A,3084.12,Maldives
1,B,Hotel B,2082.38,Maldives
2,C,Hotel C,2239.89,Maldives
3,D,Hotel D,3116.07,Maldives
4,E,Hotel E,1256.92,Srilanka


**Travel Dataset - Generator**


In [346]:
#- Definitions
travelCode = 0


#- Functions
def df2Dict(df):
    '''
    Convert dataframe into dict
    '''
    procDict = dict()
    tmp = df.to_dict('split')
    data = tmp['data'][0]
    for (i, column) in enumerate(tmp['columns']):
        procDict[column] = data[i]
    return procDict


def funcTravelsSimulated(companies, flightsPossibilities, lodgesPossibilities, travelDate, travelsDays, \
                         travelWithLodge, placesName):
    '''
    Elaborate random travels with flights and lodges, based on possibilities.
    - flightsPossibilities: possible flights
    - lodgesPossibilities: possible hotels
    '''
    global travelCode
    dfFlightsPos = pd.DataFrame(flightsPossibilities)
    dfLodgesPos = pd.DataFrame(lodgesPossibilities)
    flightsSimulated, lodgesSimulated = list(), list()
    for (companyName, companyData) in companies.items():
        for user in companyData['users']:
            date = travelDate['init']
            for i in range(user['flights']):
                # random - days, places, hotel?
                daysFlight = random.randint(travelsDays['min'], travelsDays['max'])
                daysNextTravel = random.randint(travelDate['interval']['min'], travelDate['interval']['min'])
                fromPlace, toPlace = random.sample(placesName, 2)
                chanceTravelWithLodge = (random.randrange(100) < travelWithLodge*100)
                # travels
                fromConditions = (dfFlightsPos['from']==fromPlace) & (dfFlightsPos['to']==toPlace)
                tmpFlightFrom  = df2Dict(dfFlightsPos[fromConditions].sample(n=1))
                toConditions = (dfFlightsPos['from']==toPlace) & (dfFlightsPos['to']==fromPlace) & \
                               (dfFlightsPos['agency']==tmpFlightFrom['agency']) & \
                               (dfFlightsPos['flightType']==tmpFlightFrom['flightType'])
                tmpFlightTo  = df2Dict(dfFlightsPos[toConditions])
                tmpFlightFrom['userCode'] = tmpFlightTo['userCode'] = user['code']
                tmpFlightFrom['travelCode'] = tmpFlightTo['travelCode'] = travelCode
                tmpFlightFrom['date'] = date
                tmpFlightTo['date']   = date + td(days=daysFlight)
                # lodge
                if chanceTravelWithLodge:
                    lodgeConditions = (dfLodgesPos['place']==toPlace)
                    tmpLodge = df2Dict(dfLodgesPos[lodgeConditions])
                    tmpLodge['userCode'] = user['code']
                    tmpLodge['date'] = date
                    tmpLodge['days'] = daysFlight
                    tmpLodge['total'] = round(tmpLodge['price'] * daysFlight, 2)
                    tmpLodge['travelCode'] = travelCode
                    lodgesSimulated.append(tmpLodge)
                # save and update data
                flightsSimulated.append(tmpFlightFrom)
                flightsSimulated.append(tmpFlightTo)
                travelCode += 1
                date = dt.now() + td(days=daysNextTravel)
    return flightsSimulated, lodgesSimulated

In [347]:
flightsSimulated, lodgesSimulated = \
        funcTravelsSimulated(defCompanies, flightsPossibilities, lodgesPossibilities, 
        defTravelDate, defTravelsDays, defTravelWithLodge, defPlacesName)


Example - Travel (From->To + To->From)



In [348]:
len(flightsSimulated)

571166

In [349]:
len(lodgesSimulated)

285583

In [350]:
from_data = []
to_data = []
for i in range(len(flightsSimulated)):
  if i % 2 == 0:
    from_data.append(flightsSimulated[i])
  elif i % 2 != 0:
    to_data.append(flightsSimulated[i])

In [None]:
from_data

In [351]:
len(to_data)

285583

In [None]:
flightsSimulated[:5]

**Converting the json to Data Frame - Travelling Data**

---



In [356]:
d1 = pd.DataFrame(from_data[0], index = [0])
for i in range(len(from_data)):
  dt = pd.DataFrame(from_data[i], index = [0])
  d1 = pd.concat([d1, dt], axis = 0, ignore_index = True)

In [357]:
d1.head()

Unnamed: 0,from,to,distance,agency,flightType,price,time,timeMsg,userCode,travelCode,date
0,Darjling,Nepal,1699.6,IndiGo,delux,30412.89,3.4,3:24h,0,0,2021-04-22 10:18:46.081627
1,Darjling,Nepal,1699.6,IndiGo,delux,30412.89,3.4,3:24h,0,0,2021-04-22 10:18:46.081627
2,Megalaya,Delhi,1054.97,SpiceJet,premium,22299.78,2.11,2:6h,0,1,2023-04-23 10:27:36.839039
3,Darjling,Bangkok,1929.79,Air India,economic,27977.91,3.86,3:51h,0,2,2023-04-23 10:27:36.845325
4,Mysore,Nepal,1916.56,SpiceJet,delux,37017.04,3.83,3:49h,0,3,2023-04-23 10:27:36.851590


In [358]:
d1.shape

(285584, 11)

**Finding Latitude and lonngitude for the synthetic data to find their origial distance**

In [363]:
all_locations_in_india = ['Chennai', 'Manali', "Delhi", "Mumbai", "Jaipur", "Kochi", "Madurai", "Mysore", "Hydrabad", "Kolkata", "Nepal", "Ladakh", "Kanyakumari", "Kollam", "Goa", "Bengaluru", 
                 "Megalaya", "Varanasi", "Agra", "Jammu City","Darjeeling",  "Nagpur", "Pondicherry", "Rameswaram", "Varkala",  "visakhapatnam"]

outside_india = ['Maldives', "Bangkok","Combodia", "Thailand", 'Sri Lanka']

lat_data = {}
lon_data = {}
from numpy import NaN
import geopy
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="google", timeout = 2)
for i in all_locations_in_india:
  try:
    country = "India"
    loc = geolocator.geocode(i + ", "+ country)
    lat_data[i] = loc.raw["lat"]
    lon_data[i] = loc.raw["lon"] 
  except AttributeError:
    lat_data[i] = NaN
    lon_data[i] = NaN

for i in outside_india:
  try:
    loc = geolocator.geocode(i)
    lat_data[i] = loc.raw["lat"]
    lon_data[i] = loc.raw["lon"] 
  except AttributeError:
    lat_data[i] = NaN
    lon_data[i] = NaN

In [391]:
lat_data

{'Chennai': '13.0836939',
 'Manali': '32.26309405',
 'Delhi': '28.6517178',
 'Mumbai': '19.0785451',
 'Jaipur': '26.9154576',
 'Kochi': '9.9674277',
 'Madurai': '9.9261153',
 'Mysore': '12.3051828',
 'Hydrabad': '17.360589',
 'Kolkata': '22.5726459',
 'Nepal': '26.8581869',
 'Ladakh': '33.9456407',
 'Kanyakumari': '8.079252',
 'Kollam': '8.8879509',
 'Goa': '15.3004543',
 'Bengaluru': '12.9767936',
 'Megalaya': '25.5379432',
 'Varanasi': '25.3356491',
 'Agra': '27.1752554',
 'Jammu City': '32.7347754',
 'Darjeeling': '27.0377554',
 'Nagpur': '21.1498134',
 'Pondicherry': '10.91564885',
 'Rameswaram': '9.2388621',
 'Varkala': '8.7342288',
 'visakhapatnam': '17.7231276',
 'Maldives': '3.7203503',
 'Bangkok': '13.7524938',
 'Combodia': '45.7646425',
 'Thailand': '14.8971921',
 'Sri Lanka': '7.877395849999999'}

In [392]:
lon_data

{'Chennai': '80.270186',
 'Manali': '77.18812183241408',
 'Delhi': '77.2219388',
 'Mumbai': '72.878176',
 'Jaipur': '75.8189817',
 'Kochi': '76.2454436',
 'Madurai': '78.1140983',
 'Mysore': '76.6553609',
 'Hydrabad': '78.4740613',
 'Kolkata': '88.3638953',
 'Nepal': '83.0462591',
 'Ladakh': '77.6568576',
 'Kanyakumari': '77.5499338',
 'Kollam': '76.5955013',
 'Goa': '74.0855134',
 'Bengaluru': '77.590082',
 'Megalaya': '91.2999102',
 'Varanasi': '83.0076292',
 'Agra': '78.0098161',
 'Jammu City': '74.8629589',
 'Darjeeling': '88.263176',
 'Nagpur': '79.0820556',
 'Pondicherry': '79.80694879844232',
 'Rameswaram': '79.33260795536674',
 'Varkala': '76.7257086',
 'visakhapatnam': '83.3012842',
 'Maldives': '73.2244152',
 'Bangkok': '100.4935089',
 'Combodia': '4.8279289',
 'Thailand': '100.83273',
 'Sri Lanka': '80.66247852355892'}

In [None]:
d1["from_lat"] = d1["from"].map(lat_data)
d1["from_lon"] = d1["from"].map(lon_data)
d1["to_lat"] = d1["to"].map(lat_data)
d1["to_lon"] = d1["to"].map(lon_data)

In [441]:
d1

Unnamed: 0,from,to,distance,agency,flightType,price,time,timeMsg,userCode,travelCode,date,from_lat,from_lon,to_lat,to_lon
1,Darjeeling,Nepal,1699.60,IndiGo,delux,30412.89,3.40,3:24h,0,0,2021-04-22 10:18:46.081627,27.0377554,88.263176,26.8581869,83.0462591
2,Megalaya,Delhi,1054.97,SpiceJet,premium,22299.78,2.11,2:6h,0,1,2023-04-23 10:27:36.839039,25.5379432,91.2999102,28.6517178,77.2219388
3,Darjeeling,Bangkok,1929.79,Air India,economic,27977.91,3.86,3:51h,0,2,2023-04-23 10:27:36.845325,27.0377554,88.263176,13.7524938,100.4935089
4,Mysore,Nepal,1916.56,SpiceJet,delux,37017.04,3.83,3:49h,0,3,2023-04-23 10:27:36.851590,12.3051828,76.6553609,26.8581869,83.0462591
5,Jammu City,Pondicherry,1525.18,Emmirates,premium,47370.63,3.05,3:3h,0,4,2023-04-23 10:27:36.857856,32.7347754,74.8629589,10.91564885,79.80694879844232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285579,Kolkata,Bangkok,1651.72,IndiGo,delux,33768.60,3.30,3:18h,114077,285578,2023-04-23 11:03:31.725881,22.5726459,88.3638953,13.7524938,100.4935089
285580,Goa,Mysore,959.96,Emmirates,premium,21482.16,1.92,1:55h,114077,285579,2023-04-23 11:03:31.732805,15.3004543,74.0855134,12.3051828,76.6553609
285581,Kochi,Pondicherry,1772.58,SpiceJet,premium,52326.41,3.55,3:33h,114078,285580,2021-04-22 10:18:46.081627,9.9674277,76.2454436,10.91564885,79.80694879844232
285582,Varanasi,Varkala,637.90,Emmirates,delux,7690.06,1.28,1:16h,114078,285581,2023-04-23 11:03:31.746677,25.3356491,83.0076292,8.7342288,76.7257086


In [None]:
d1[["from_lat", "from_lon", "to_lat", "to_lon"]] = d1[["from_lat", "from_lon", "to_lat", "to_lon"]].astype("float")

In [383]:
d1 = d1.iloc[1:, :]

**Finding original distance between two locations**

In [418]:
def real_dist(lst):
  from_ = (lst[0], lst[1])
  to = (lst[2], lst[3])
  return geodesic(from_, to).km

In [462]:
dist = []
for i in d1.iloc[:, -4 : ].values:
  dist.append(real_dist(i))

In [463]:
len(dist)

285583

**Finnding the cost for the original distance**

In [None]:
d1["Cost per km"] = d1["price"] / d1["distance"]
d1["Distance"] = dist
d1["Travelling time(In Minutes)"] = round((d1["Distance"] / 500) * 60, 0)
d1["Price"] = d1["Distance"] * d1["Cost per km"]

**Shedding off the unwanted features**

In [None]:
d1.drop(["distance", "price", "time", "timeMsg", "userCode", "travelCode", "date"], axis = 1, inplace = True)

**Structuring Data**

In [487]:
d1.columns = ["Boarding Point", "Destination", "Flight agency", "Class", "Boarding Latitude", "Boarding Longitude", "Destination Latitude", "Destination Longitude", "Cost prt Km", "Total Distance", "Travelling time(In Minutes)", "Travel Cost"]

In [489]:
d1.head()

Unnamed: 0,Boarding Point,Destination,Flight agency,Class,Boarding Latitude,Boarding Longitude,Destination Latitude,Destination Longitude,Cost prt Km,Total Distance,Travelling time(In Minutes),Travel Cost
1,Darjeeling,Nepal,IndiGo,delux,27.0377554,88.263176,26.8581869,83.0462591,17.894146,518.387432,62.0,9276.100229
2,Megalaya,Delhi,SpiceJet,premium,25.5379432,91.2999102,28.6517178,77.2219388,21.137833,1437.125211,172.0,30377.713143
3,Darjeeling,Bangkok,Air India,economic,27.0377554,88.263176,13.7524938,100.4935089,14.497904,1944.771603,233.0,28195.11184
4,Mysore,Nepal,SpiceJet,delux,12.3051828,76.6553609,26.8581869,83.0462591,19.314313,1744.072128,209.0,33685.555226
5,Jammu City,Pondicherry,Emmirates,premium,32.7347754,74.8629589,10.91564885,79.80694879844232,31.059042,2468.718848,296.0,76676.042918


**Converting the json to Data Frame - Hotel Data**

In [427]:
dh = pd.DataFrame(lodgesSimulated[0], index = [0])
for i in range(len(lodgesSimulated)):
  dth = pd.DataFrame(lodgesSimulated[i], index = [0])
  dh = pd.concat([dh, dth], axis = 0, ignore_index = True)

In [498]:
dh

Unnamed: 0,code,name,price,place,userCode,date,days,total,travelCode
1,AV,Hotel AV,3307.87,Nepal,0,2021-04-22 10:18:46.081627,5,16539.35,0
2,M,Hotel M,2140.44,Delhi,0,2023-04-23 10:27:36.839039,4,8561.76,1
3,AE,Hotel AE,3424.90,Bangkok,0,2023-04-23 10:27:36.845325,6,20549.40,2
4,AV,Hotel AV,3307.87,Nepal,0,2023-04-23 10:27:36.851590,4,13231.48,3
5,CN,Hotel CN,1583.78,Pondicherry,0,2023-04-23 10:27:36.857856,4,6335.12,4
...,...,...,...,...,...,...,...,...,...
285579,AE,Hotel AE,3424.90,Bangkok,114077,2023-04-23 11:03:31.725881,8,27399.20,285578
285580,AJ,Hotel AJ,2190.04,Mysore,114077,2023-04-23 11:03:31.732805,3,6570.12,285579
285581,CN,Hotel CN,1583.78,Pondicherry,114078,2021-04-22 10:18:46.081627,9,14254.02,285580
285582,CX,Hotel CX,1869.97,Varkala,114078,2023-04-23 11:03:31.746677,6,11219.82,285581


In [497]:
dh = dh.iloc[1:, :]

**Structuring Hotel Data**

In [499]:
dh_refined = dh.loc[:, ["name", "price", "place", "days", "total"]]
dh_refined.columns = ["Hotel Name", "Cost per day", "Hotel place", "Travel days", "Total stay cost"]

**Merging two data**

In [530]:
travel_data = d1.copy()
hotel_data = dh_refined.copy()

In [531]:
Tourism_data = pd.concat([travel_data, hotel_data], axis = 1, join = "outer")

In [532]:
Tourism_data

Unnamed: 0,Boarding Point,Destination,Flight agency,Class,Boarding Latitude,Boarding Longitude,Destination Latitude,Destination Longitude,Cost prt Km,Total Distance,Travelling time(In Minutes),Travel Cost,Hotel Name,Cost per day,Hotel place,Travel days,Total stay cost
1,Darjeeling,Nepal,IndiGo,delux,27.0377554,88.263176,26.8581869,83.0462591,17.894146,518.387432,62.0,9276.100229,Hotel AV,3307.87,Nepal,5,16539.35
2,Megalaya,Delhi,SpiceJet,premium,25.5379432,91.2999102,28.6517178,77.2219388,21.137833,1437.125211,172.0,30377.713143,Hotel M,2140.44,Delhi,4,8561.76
3,Darjeeling,Bangkok,Air India,economic,27.0377554,88.263176,13.7524938,100.4935089,14.497904,1944.771603,233.0,28195.111840,Hotel AE,3424.90,Bangkok,6,20549.40
4,Mysore,Nepal,SpiceJet,delux,12.3051828,76.6553609,26.8581869,83.0462591,19.314313,1744.072128,209.0,33685.555226,Hotel AV,3307.87,Nepal,4,13231.48
5,Jammu City,Pondicherry,Emmirates,premium,32.7347754,74.8629589,10.91564885,79.80694879844232,31.059042,2468.718848,296.0,76676.042918,Hotel CN,1583.78,Pondicherry,4,6335.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285579,Kolkata,Bangkok,IndiGo,delux,22.5726459,88.3638953,13.7524938,100.4935089,20.444506,1610.991341,193.0,32935.922665,Hotel AE,3424.90,Bangkok,8,27399.20
285580,Goa,Mysore,Emmirates,premium,15.3004543,74.0855134,12.3051828,76.6553609,22.378182,432.443572,52.0,9677.301145,Hotel AJ,2190.04,Mysore,3,6570.12
285581,Kochi,Pondicherry,SpiceJet,premium,9.9674277,76.2454436,10.91564885,79.80694879844232,29.519914,403.794848,48.0,11919.989364,Hotel CN,1583.78,Pondicherry,9,14254.02
285582,Varanasi,Varkala,Emmirates,delux,25.3356491,83.0076292,8.7342288,76.7257086,12.055275,1954.284189,235.0,23559.433563,Hotel CX,1869.97,Varkala,6,11219.82


**Function to add the travel agency name**

In [533]:
def travel_agent(x) :
  if x>= 0 and x <= 51405:
    return "MakeMyTrip"
  elif x > 51405 and x <= 102810:
    return "Trivago"
  elif x > 102810 and x <= 151359:
    return "Goibibo"
  elif x > 151359 and x <= 197052:
    return "Yatra"
  elif x > 197052 and x <= 228466:
    return "SOTC"
  elif x > 228466 and x <= 257025:
    return "Expedia"
  elif x > 257025 and x <= 285583:
    return "ClearTrip"

#creating the data for travel agency column
agency = []
for i in range(ddf.shape[0]):
  agency.append(travel_agent(i))

#Inserting the column
Tourism_data.insert(2, "Travel Agency", agency)

#Adding the final target columnn - Total Package cost
Tourism_data["Total Package"] = Tourism_data["Travel Cost"] + Tourism_data["Total stay cost"]

In [536]:
Tourism_data

Unnamed: 0,Boarding Point,Destination,Travel Agency,Flight agency,Class,Boarding Latitude,Boarding Longitude,Destination Latitude,Destination Longitude,Cost prt Km,Total Distance,Travelling time(In Minutes),Travel Cost,Hotel Name,Cost per day,Hotel place,Travel days,Total stay cost,Total Package
1,Darjeeling,Nepal,MakeMyTrip,IndiGo,delux,27.0377554,88.263176,26.8581869,83.0462591,17.894146,518.387432,62.0,9276.100229,Hotel AV,3307.87,Nepal,5,16539.35,25815.450229
2,Megalaya,Delhi,MakeMyTrip,SpiceJet,premium,25.5379432,91.2999102,28.6517178,77.2219388,21.137833,1437.125211,172.0,30377.713143,Hotel M,2140.44,Delhi,4,8561.76,38939.473143
3,Darjeeling,Bangkok,MakeMyTrip,Air India,economic,27.0377554,88.263176,13.7524938,100.4935089,14.497904,1944.771603,233.0,28195.111840,Hotel AE,3424.90,Bangkok,6,20549.40,48744.511840
4,Mysore,Nepal,MakeMyTrip,SpiceJet,delux,12.3051828,76.6553609,26.8581869,83.0462591,19.314313,1744.072128,209.0,33685.555226,Hotel AV,3307.87,Nepal,4,13231.48,46917.035226
5,Jammu City,Pondicherry,MakeMyTrip,Emmirates,premium,32.7347754,74.8629589,10.91564885,79.80694879844232,31.059042,2468.718848,296.0,76676.042918,Hotel CN,1583.78,Pondicherry,4,6335.12,83011.162918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285579,Kolkata,Bangkok,ClearTrip,IndiGo,delux,22.5726459,88.3638953,13.7524938,100.4935089,20.444506,1610.991341,193.0,32935.922665,Hotel AE,3424.90,Bangkok,8,27399.20,60335.122665
285580,Goa,Mysore,ClearTrip,Emmirates,premium,15.3004543,74.0855134,12.3051828,76.6553609,22.378182,432.443572,52.0,9677.301145,Hotel AJ,2190.04,Mysore,3,6570.12,16247.421145
285581,Kochi,Pondicherry,ClearTrip,SpiceJet,premium,9.9674277,76.2454436,10.91564885,79.80694879844232,29.519914,403.794848,48.0,11919.989364,Hotel CN,1583.78,Pondicherry,9,14254.02,26174.009364
285582,Varanasi,Varkala,ClearTrip,Emmirates,delux,25.3356491,83.0076292,8.7342288,76.7257086,12.055275,1954.284189,235.0,23559.433563,Hotel CX,1869.97,Varkala,6,11219.82,34779.253563


In [541]:
Tourism_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285583 entries, 1 to 285583
Data columns (total 19 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Boarding Point               285583 non-null  object 
 1   Destination                  285583 non-null  object 
 2   Travel Agency                285583 non-null  object 
 3   Flight agency                285583 non-null  object 
 4   Class                        285583 non-null  object 
 5   Boarding Latitude            285583 non-null  object 
 6   Boarding Longitude           285583 non-null  object 
 7   Destination Latitude         285583 non-null  object 
 8   Destination Longitude        285583 non-null  object 
 9   Cost prt Km                  285583 non-null  float64
 10  Total Distance               285583 non-null  float64
 11  Travelling time(In Minutes)  285583 non-null  int16  
 12  Travel Cost                  285583 non-null  float64
 13 

In [545]:
Tourism_data["Travelling time(In Minutes)"] = Tourism_data["Travelling time(In Minutes)"].astype("int16")
Tourism_data["Cost prt Km"] = Tourism_data["Cost prt Km"].astype("float32")
Tourism_data["Total Distance"] = Tourism_data["Total Distance"].astype("float32")
Tourism_data["Travel Cost"] = Tourism_data["Travel Cost"].astype("float32")
Tourism_data["Cost per day"] = Tourism_data["Cost per day"].astype("float32")
Tourism_data["Cost prt Km"] = Tourism_data["Cost prt Km"].astype("float32")
Tourism_data["Travel days"] = Tourism_data["Travel days"].astype("int8")
Tourism_data["Total stay cost"] = Tourism_data["Total stay cost"].astype("float32")
Tourism_data["Total Package"] = Tourism_data["Total Package"].astype("float32")
Tourism_data["Boarding Latitude"] = Tourism_data["Boarding Latitude"].astype("float32")
Tourism_data["Boarding Longitude"] = Tourism_data["Boarding Longitude"].astype("float32")
Tourism_data["Destination Latitude"] = Tourism_data["Destination Latitude"].astype("float32")
Tourism_data["Destination Longitude"] = Tourism_data["Destination Longitude"].astype("float32")

**Optimizing data types for better dataset size**

In [546]:
Tourism_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285583 entries, 1 to 285583
Data columns (total 19 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Boarding Point               285583 non-null  object 
 1   Destination                  285583 non-null  object 
 2   Travel Agency                285583 non-null  object 
 3   Flight agency                285583 non-null  object 
 4   Class                        285583 non-null  object 
 5   Boarding Latitude            285583 non-null  float32
 6   Boarding Longitude           285583 non-null  float32
 7   Destination Latitude         285583 non-null  float32
 8   Destination Longitude        285583 non-null  float32
 9   Cost prt Km                  285583 non-null  float32
 10  Total Distance               285583 non-null  float32
 11  Travelling time(In Minutes)  285583 non-null  int16  
 12  Travel Cost                  285583 non-null  float32
 13 

In [None]:
Tourism_data.head()

In [None]:
Tourism_data.to_csv("Travel Agency Data - Synthetic.csv")

# **Since The Data file is Uploaded in Git Hub in two parts**