<a href="https://colab.research.google.com/github/TamizharasanG/Travel-and-Tourism-Clarity-TTS/blob/main/Synthetic_Indian_Travel_Data_for_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install names
!pip install geopy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting names
  Downloading names-0.3.0.tar.gz (789 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m789.1/789.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: names
  Building wheel for names (setup.py) ... [?25l[?25hdone
  Created wheel for names: filename=names-0.3.0-py3-none-any.whl size=803698 sha256=46c8fb3f72d676c14f19f14998891a01dd916a2ea44fb743237a66e41975e52f
  Stored in directory: /root/.cache/pip/wheels/f1/bc/04/55ab9499ea02359ece8b02b4169ebb30aa52d82b84c13fc506
Successfully built names
Installing collected packages: names
Successfully installed names-0.3.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Import packages
import names
import tqdm

import random
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta as td
from dateutil.relativedelta import relativedelta
from geopy.distance import geodesic

**Definitions**

In [3]:
#- Companies and Users
defGenders = ['male', 'female', 'none']
defAgesInterval = {'min': 23, 'max': 65}
defFlightsInterval = {'min': 0, 'max': 5}
defCompanies = {
    'MakeMyTrip': {'usersCount': 20175},
    'Trivago': {'usersCount': 21145},
    "Goibibo" : {'usersCount': 20023},
    "Yatra" : {'usersCount': 18524},
    "SOTC" : {'usersCount': 12543},
    "Expedia" : {'usersCount': 11425},
    "Cleartrip" : {'usersCount': 10245}
}

#- Flight Agencies
defFlightTypes = {
    'economic': {'price': 0.92},
    "delux" : {'price': 1.33},
    'premium': {'price': 1.87},
}
defAgenciesName = ['Vistara', 'Air India', 'IndiGo', 'SpiceJet', "Emmirates"]
defAgencies = dict()

#- Places
defPlacesName = ['Rajasthan', 'Tamil Nadu', 'Kerala', 'Telangana', 'Gujarat',
       'Andhra Pradesh', 'Tripura', 'Lakshadweep', 'Goa', 'Uttar Pradesh',
       'Karnataka', 'Maharashtra', 'Uttarakhand', 'Mizoram', 'Ladakh',
       'West Bengal', 'Madhya Pradesh', 'Arunachal Pradesh', 'Haryana',
       'Chhattisgarh', 'Punjab', 'Jammu and Kashmir', 'Himachal Pradesh',
       'Odisha', 'Sikkim', 'Puducherry', 'Bihar', 'Jharkhand',
       'Andaman and Nicobar Islands', 'Assam', 'Meghalaya', 'Manipur',
       'Chandigarh', 'Dadra and Nagar Haveli and Daman and Diu', 'Delhi',
       'Nagaland', 'Gulf of Kutch']
defPlaces = {name: dict() for name in defPlacesName}
defDistancesInterval = {'min': 500.0, 'max': 2000.0}
defPlaceTravelKmPerHour = 500.0 

#- Lodge (Accommodation)
defLodgesInterval = {'min': 1, 'max': 6}
defLodgesPrices   = {'min': 1000.0, 'max': 4000.0}
defLodgesPrex = 'Hotel'
defLodges = {name: list() for name in defPlacesName}

#- Travel
defTravels = list()
defTravelsDays = {'min': 3, 'max': 10}
defTravelsFlightPrices = {'init': 3000.0, 'interval': 900.0}
defTravelWithLodge = 1
defTravelDate = {'init': dt.now() + relativedelta(years=- 2), 'interval':{'min': 1, 'max': 10}}


**Companies and Users - Generator**

In [4]:
#- Functions
def funcUserGenerator(genders, agesInterval, flightsInterval, code):
    '''
    Generate random user, based on predefinitions.
    - genders: list
    - agesInterval {min, max}: user age
    - flightsInterval {min, max}: number of flights
    - code: user ID
    '''
    user = dict()
    user['code'] = code
    user['gender'] = genders[random.randint(0, len(genders)-1)]
    gender = user['gender'] if (user['gender'] != 'none') else False
    user['name'] = names.get_full_name(gender=gender)
    user['age'] = random.randint(agesInterval['min'], agesInterval['max'])
    user['flights'] = random.randint(flightsInterval['min'], flightsInterval['max'])
    return user

In [5]:
#- Fill Companies data
userId = 0
for company, data in defCompanies.items():
    users = list()
    for idx in range(data['usersCount']):
        user = funcUserGenerator(defGenders, defAgesInterval, defFlightsInterval, userId)
        users.append(user)
        userId += 1
    defCompanies[company]['users'] = users

In [None]:
#defCompanies#['SOTC']['users']

**Flight Agencies - Generator**

In [6]:
#- Functions
def funcAgencyGenerator(flightTypes):
    '''
    Generate random agency services, based on predefinitions.
    - flightTypes: types of flight
    '''
    agency = dict()
    types = list(flightTypes.copy().keys())
    random.shuffle(types)
    typesMany = random.randint(1, len(types))
    agency['types'] = [types[i] for i in range(typesMany)]
    return agency

In [7]:
for agency in defAgenciesName:
    defAgencies[agency] = funcAgencyGenerator(defFlightTypes)

Example - Flight Types of Agencies

In [None]:
defAgencies

{'Vistara': {'types': ['economic']},
 'Air India': {'types': ['economic', 'premium']},
 'IndiGo': {'types': ['delux']},
 'SpiceJet': {'types': ['delux', 'premium', 'economic']},
 'Emmirates': {'types': ['delux', 'economic', 'premium']}}

**Places - Generator**

In [8]:
#- Functions
def funcPlaceGenerator(i, j, distInterval, kmPerHour):
    '''
    Generate random place distances, based on predefinitions.
    - i: number of place
    - j: number of place
    - distInterval {min, max} values: distance range
    - kmPerHour: km per hour of the plain
    '''
    if i == j:
        return False, False, False
    distance = round(random.uniform(distInterval['min'], distInterval['max']), 2)
    time = round(distance/kmPerHour, 2)
    hours = int(time)
    minutes = (time*60) % 60
    timeMsg = '%d:%dh' % (hours, minutes)
    return (distance, time, timeMsg)

In [9]:
n = len(defPlacesName)
for i in range(n):
    for j in range(i, n):
        fromPlace = defPlacesName[i]
        toPlace = defPlacesName[j]
        distance, time, msg = funcPlaceGenerator(i, j, defDistancesInterval, defPlaceTravelKmPerHour)
        if distance and time:
            place = {'distance': distance, 'time': time, 'timeMsg': msg}
            defPlaces[fromPlace][toPlace] = place
            defPlaces[toPlace][fromPlace] = place

Example - Distances from a Place

In [None]:
#defPlaces['Chennai']

**Lodges - Generator**

In [10]:
#- Definitions
defName = 'A'


#- Functions
def getNextChar(text):
    '''
    Generate order alphabetic.
    - text: input text
    '''
    if len(text) == 0:
        return 'A'
    nextChar = chr(ord(text[-1]) + 1)
    if nextChar <= 'Z':
        text = text[:-1] + nextChar
    else:
        text = getNextChar(text[:-1]) + 'A'
    return text


def funcLodgesGenerator(lodgesInterval, lodgesPrices):
    '''
    Generate random lodges, based on predefinitions.
    - lodgesInterval {min, max} values: number of hotels
    - lodgesPrices {min, max} values: hotel range
    '''
    global defName
    lodges = list()
    n = random.randint(lodgesInterval['min'], lodgesInterval['max'])
    for i in range(n):
        lodgeName = '%s %s' % (defLodgesPrex, defName)
        price = round(random.uniform(lodgesPrices['min'], lodgesPrices['max']), 2)
        lodge = {'code': defName, 'name': lodgeName, 'price': price}
        lodges.append(lodge)
        defName = getNextChar(defName)
    return lodges

In [11]:
for name in defPlacesName:
    lodges = funcLodgesGenerator(defLodgesInterval, defLodgesPrices)
    defLodges[name] = lodges

Example - Hotels from a Place

In [None]:
defLodges['Maldives']

[{'code': 'A', 'name': 'Hotel A', 'price': 3084.12},
 {'code': 'B', 'name': 'Hotel B', 'price': 2082.38},
 {'code': 'C', 'name': 'Hotel C', 'price': 2239.89},
 {'code': 'D', 'name': 'Hotel D', 'price': 3116.07}]

**Travel Possibilities - Generator**

In [12]:
#- Functions
def funcCalculatePrice(priceMin, priceMax, weight):
    '''
    Calculate a random price for a travel.
    - priceMin: min price
    - priceMax: max price
    - weight: weight the price range
    '''
    priceMin = priceMin * weight
    priceMax = priceMax * weight
    price = round(random.uniform(priceMin, priceMax), 2)
    return price


def funcElaborateflight(fromPlace, toPlace, distance, agency, flightType, price, time, timeMsg):
    '''
    Elaborate a possible flight.
    - fromPlace: from
    - toPlace: to
    - distance: distance
    - agency: agency name
    - flightType: flight type
    - price: flight price
    - time: time in hours
    - timeMsg: time calculated
    '''
    flight = {'from': fromPlace, 'to': toPlace, 'distance': distance,
              'agency': agency, 'flightType': flightType, 'price': price,
              'time': time, 'timeMsg': timeMsg}
    return flight


def funcFlightsPossibilities(places, flightPrices, flightTypes, agencies):
    '''
    Elaborate a list of possible flights.
    - places: places data
    - flightPrices: flight prices
    - flightTypes: flight types
    - agencies: agencies data
    '''
    flightsPossibilities = list()
    for fromPlace, toPlaces in places.items():
        toPlacesSorted = sorted(toPlaces.items(), key=lambda x:x[1]['distance'], reverse=False)
        priceA, priceB = flightPrices['init'], \
                         flightPrices['init'] + flightPrices['interval']
        for (toPlace, placeData) in toPlacesSorted:
            for (agencyName, agencyData) in agencies.items():
                if len(agencyData['types']) > 1: # has more than 1 element
                    for typeA in agencyData['types']:
                        weight = flightTypes[typeA]['price']
                        price = funcCalculatePrice(priceA, priceB, weight)
                        flight = funcElaborateflight(fromPlace, toPlace, placeData['distance'], \
                                                     agencyName, typeA, price, placeData['time'], placeData['timeMsg'])
                        flightsPossibilities.append(flight)
                else:
                    typeA = agencyData['types'][0]
                    weight = flightTypes[typeA]['price']
                    price = funcCalculatePrice(priceA, priceB, weight)
                    flight = funcElaborateflight(fromPlace, toPlace, placeData['distance'], agencyName, \
                                                 typeA, price, placeData['time'], placeData['timeMsg'])
                    flightsPossibilities.append(flight)
            # Update prices for bigger distances
            priceA, priceB = priceB, priceB + flightPrices['interval']
    return flightsPossibilities


def funcLodgesPossibilities(placesName, lodges):
    '''
    Elaborate a list of possible hotels.
    - placesName: places names
    - lodges: lodges data
    '''
    lodgesPossibilities = list()
    for place in placesName:
        for lodge in lodges[place]:
            lodge = lodge.copy()
            lodge['place'] = place
            lodgesPossibilities.append(lodge)
    return lodgesPossibilities

Example - Fligts Possibilities (for each Place)

In [15]:
flightsPossibilities = funcFlightsPossibilities(defPlaces, defTravelsFlightPrices, defFlightTypes, defAgencies)
pd.DataFrame(flightsPossibilities).head(5)

Unnamed: 0,from,to,distance,agency,flightType,price,time,timeMsg
0,Rajasthan,Ladakh,526.09,Vistara,delux,5167.44,1.05,1:3h
1,Rajasthan,Ladakh,526.09,Air India,premium,5805.91,1.05,1:3h
2,Rajasthan,Ladakh,526.09,IndiGo,economic,3560.42,1.05,1:3h
3,Rajasthan,Ladakh,526.09,SpiceJet,premium,7255.19,1.05,1:3h
4,Rajasthan,Ladakh,526.09,SpiceJet,delux,4092.41,1.05,1:3h


Example - Hotel Possibilities (for each Place)



In [16]:
lodgesPossibilities = funcLodgesPossibilities(defPlacesName, defLodges)


In [17]:
pd.DataFrame(lodgesPossibilities).head(5)


Unnamed: 0,code,name,price,place
0,A,Hotel A,1505.05,Rajasthan
1,B,Hotel B,1677.02,Rajasthan
2,C,Hotel C,1607.12,Rajasthan
3,D,Hotel D,2810.3,Rajasthan
4,E,Hotel E,1622.85,Rajasthan


**Travel Dataset - Generator**


In [18]:
#- Definitions
travelCode = 0


#- Functions
def df2Dict(df):
    '''
    Convert dataframe into dict
    '''
    procDict = dict()
    tmp = df.to_dict('split')
    data = tmp['data'][0]
    for (i, column) in enumerate(tmp['columns']):
        procDict[column] = data[i]
    return procDict


def funcTravelsSimulated(companies, flightsPossibilities, lodgesPossibilities, travelDate, travelsDays, \
                         travelWithLodge, placesName):
    '''
    Elaborate random travels with flights and lodges, based on possibilities.
    - flightsPossibilities: possible flights
    - lodgesPossibilities: possible hotels
    '''
    global travelCode
    dfFlightsPos = pd.DataFrame(flightsPossibilities)
    dfLodgesPos = pd.DataFrame(lodgesPossibilities)
    flightsSimulated, lodgesSimulated = list(), list()
    for (companyName, companyData) in companies.items():
        for user in companyData['users']:
            date = travelDate['init']
            for i in range(user['flights']):
                # random - days, places, hotel?
                daysFlight = random.randint(travelsDays['min'], travelsDays['max'])
                daysNextTravel = random.randint(travelDate['interval']['min'], travelDate['interval']['min'])
                fromPlace, toPlace = random.sample(placesName, 2)
                chanceTravelWithLodge = (random.randrange(100) < travelWithLodge*100)
                # travels
                fromConditions = (dfFlightsPos['from']==fromPlace) & (dfFlightsPos['to']==toPlace)
                tmpFlightFrom  = df2Dict(dfFlightsPos[fromConditions].sample(n=1))
                toConditions = (dfFlightsPos['from']==toPlace) & (dfFlightsPos['to']==fromPlace) & \
                               (dfFlightsPos['agency']==tmpFlightFrom['agency']) & \
                               (dfFlightsPos['flightType']==tmpFlightFrom['flightType'])
                tmpFlightTo  = df2Dict(dfFlightsPos[toConditions])
                tmpFlightFrom['userCode'] = tmpFlightTo['userCode'] = user['code']
                tmpFlightFrom['travelCode'] = tmpFlightTo['travelCode'] = travelCode
                tmpFlightFrom['date'] = date
                tmpFlightTo['date']   = date + td(days=daysFlight)
                # lodge
                if chanceTravelWithLodge:
                    lodgeConditions = (dfLodgesPos['place']==toPlace)
                    tmpLodge = df2Dict(dfLodgesPos[lodgeConditions])
                    tmpLodge['userCode'] = user['code']
                    tmpLodge['date'] = date
                    tmpLodge['days'] = daysFlight
                    tmpLodge['total'] = round(tmpLodge['price'] * daysFlight, 2)
                    tmpLodge['travelCode'] = travelCode
                    lodgesSimulated.append(tmpLodge)
                # save and update data
                flightsSimulated.append(tmpFlightFrom)
                flightsSimulated.append(tmpFlightTo)
                travelCode += 1
                date = dt.now() + td(days=daysNextTravel)
    return flightsSimulated, lodgesSimulated

In [22]:
flightsSimulated, lodgesSimulated = \
        funcTravelsSimulated(defCompanies, flightsPossibilities, lodgesPossibilities, 
        defTravelDate, defTravelsDays, defTravelWithLodge, defPlacesName)


Example - Travel (From->To + To->From)



In [24]:
len(flightsSimulated)


571156

In [None]:
len(lodgesSimulated)

In [25]:
from_data = []
to_data = []
for i in range(len(flightsSimulated)):
  if i % 2 == 0:
    from_data.append(flightsSimulated[i])
  elif i % 2 != 0:
    to_data.append(flightsSimulated[i])

In [None]:
from_data

In [26]:
len(to_data)

285578

In [None]:
flightsSimulated[:5]

**Converting the json to Data Frame - Travelling Data**

---



In [27]:
d1 = pd.DataFrame(from_data[0], index = [0])
for i in range(len(from_data)):
  dt = pd.DataFrame(from_data[i], index = [0])
  d1 = pd.concat([d1, dt], axis = 0, ignore_index = True)

In [29]:
d1.head()

Unnamed: 0,from,to,distance,agency,flightType,price,time,timeMsg,userCode,travelCode,date
0,Odisha,Manipur,731.57,SpiceJet,economic,6864.13,1.46,1:27h,0,218574,2021-04-23 08:55:41.421437
1,Odisha,Manipur,731.57,SpiceJet,economic,6864.13,1.46,1:27h,0,218574,2021-04-23 08:55:41.421437
2,Maharashtra,Dadra and Nagar Haveli and Daman and Diu,1697.26,IndiGo,economic,25694.85,3.39,3:23h,0,218575,2023-04-24 09:37:18.831615
3,Goa,Gulf of Kutch,1107.54,Air India,premium,25006.05,2.22,2:13h,0,218576,2023-04-24 09:37:18.847775
4,Ladakh,Punjab,681.15,Air India,premium,23810.09,1.36,1:21h,0,218577,2023-04-24 09:37:18.858922


In [28]:
d1.shape

(285579, 11)

**Finding Latitude and lonngitude for the synthetic data to find their origial distance**

In [30]:
all_locations_in_india = ['Rajasthan', 'Tamil Nadu', 'Kerala', 'Telangana', 'Gujarat',
       'Andhra Pradesh', 'Tripura', 'Lakshadweep', 'Goa', 'Uttar Pradesh',
       'Karnataka', 'Maharashtra', 'Uttarakhand', 'Mizoram', 'Ladakh',
       'West Bengal', 'Madhya Pradesh', 'Arunachal Pradesh', 'Haryana',
       'Chhattisgarh', 'Punjab', 'Jammu and Kashmir', 'Himachal Pradesh',
       'Odisha', 'Sikkim', 'Puducherry', 'Bihar', 'Jharkhand',
       'Andaman and Nicobar Islands', 'Assam', 'Meghalaya', 'Manipur',
       'Chandigarh', 'Dadra and Nagar Haveli and Daman and Diu', 'Delhi',
       'Nagaland', 'Gulf of Kutch']

lat_data = {}
lon_data = {}
from numpy import NaN
import geopy
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="google", timeout = 2)
for i in all_locations_in_india:
  try:
    country = "India"
    loc = geolocator.geocode(i + ", "+ country)
    lat_data[i] = loc.raw["lat"]
    lon_data[i] = loc.raw["lon"] 
  except AttributeError:
    lat_data[i] = NaN
    lon_data[i] = NaN


In [31]:
lat_data

{'Rajasthan': '26.8105777',
 'Tamil Nadu': '10.9094334',
 'Kerala': '10.3528744',
 'Telangana': '17.8495919',
 'Gujarat': '22.3850051',
 'Andhra Pradesh': '15.9240905',
 'Tripura': '23.7750823',
 'Lakshadweep': '10.8132489',
 'Goa': '15.3004543',
 'Uttar Pradesh': '27.1303344',
 'Karnataka': '14.5203896',
 'Maharashtra': '18.9068356',
 'Uttarakhand': '30.0417376',
 'Mizoram': '23.2146169',
 'Ladakh': '33.9456407',
 'West Bengal': '22.9964948',
 'Madhya Pradesh': '23.8143419',
 'Arunachal Pradesh': '28.0937702',
 'Haryana': '29',
 'Chhattisgarh': '21.6637359',
 'Punjab': '30.9293211',
 'Jammu and Kashmir': '33.6649297',
 'Himachal Pradesh': '31.81676015',
 'Odisha': '20.5431241',
 'Sikkim': '27.601029',
 'Puducherry': '10.91564885',
 'Bihar': '25.6440845',
 'Jharkhand': '23.4559809',
 'Andaman and Nicobar Islands': '12.61123865',
 'Assam': '26.4073841',
 'Meghalaya': '25.5379432',
 'Manipur': '24.7208818',
 'Chandigarh': '30.72984395',
 'Dadra and Nagar Haveli and Daman and Diu': '20.71

In [32]:
lon_data

{'Rajasthan': '73.7684549',
 'Tamil Nadu': '78.3665347',
 'Kerala': '76.5120396',
 'Telangana': '79.1151663',
 'Gujarat': '71.745261',
 'Andhra Pradesh': '80.1863809',
 'Tripura': '91.7025091',
 'Lakshadweep': '73.6804620941119',
 'Goa': '74.0855134',
 'Uttar Pradesh': '80.859666',
 'Karnataka': '75.7223521',
 'Maharashtra': '75.6741579',
 'Uttarakhand': '79.089691',
 'Mizoram': '92.8687612',
 'Ladakh': '77.6568576',
 'West Bengal': '87.6855882',
 'Madhya Pradesh': '77.5340719',
 'Arunachal Pradesh': '94.5921326',
 'Haryana': '76',
 'Chhattisgarh': '81.8406351',
 'Punjab': '75.5004841',
 'Jammu and Kashmir': '75.1629584',
 'Himachal Pradesh': '77.34932051968858',
 'Odisha': '84.6897321',
 'Sikkim': '88.45413638680145',
 'Puducherry': '79.80694879844232',
 'Bihar': '85.906508',
 'Jharkhand': '85.2557301',
 'Andaman and Nicobar Islands': '92.83165406414926',
 'Assam': '93.2551303',
 'Meghalaya': '91.2999102',
 'Manipur': '93.9229386',
 'Chandigarh': '76.78414567016054',
 'Dadra and Nagar

In [33]:
d1["from_lat"] = d1["from"].map(lat_data)
d1["from_lon"] = d1["from"].map(lon_data)
d1["to_lat"] = d1["to"].map(lat_data)
d1["to_lon"] = d1["to"].map(lon_data)

In [34]:
d1

Unnamed: 0,from,to,distance,agency,flightType,price,time,timeMsg,userCode,travelCode,date,from_lat,from_lon,to_lat,to_lon
0,Odisha,Manipur,731.57,SpiceJet,economic,6864.13,1.46,1:27h,0,218574,2021-04-23 08:55:41.421437,20.5431241,84.6897321,24.7208818,93.9229386
1,Odisha,Manipur,731.57,SpiceJet,economic,6864.13,1.46,1:27h,0,218574,2021-04-23 08:55:41.421437,20.5431241,84.6897321,24.7208818,93.9229386
2,Maharashtra,Dadra and Nagar Haveli and Daman and Diu,1697.26,IndiGo,economic,25694.85,3.39,3:23h,0,218575,2023-04-24 09:37:18.831615,18.9068356,75.6741579,20.718174949999998,70.93238341010638
3,Goa,Gulf of Kutch,1107.54,Air India,premium,25006.05,2.22,2:13h,0,218576,2023-04-24 09:37:18.847775,15.3004543,74.0855134,22.6612804,69.7625581
4,Ladakh,Punjab,681.15,Air India,premium,23810.09,1.36,1:21h,0,218577,2023-04-24 09:37:18.858922,33.9456407,77.6568576,30.9293211,75.5004841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285574,Jharkhand,Puducherry,1565.80,SpiceJet,economic,24385.24,3.13,3:7h,114078,504147,2023-04-24 10:16:19.886271,23.4559809,85.2557301,10.91564885,79.80694879844232
285575,Jammu and Kashmir,Sikkim,1516.93,IndiGo,economic,21497.96,3.03,3:1h,114078,504148,2023-04-24 10:16:19.893783,33.6649297,75.1629584,27.601029,88.45413638680145
285576,Sikkim,Ladakh,1444.91,Emmirates,premium,45977.49,2.89,2:53h,114078,504149,2023-04-24 10:16:19.901226,27.601029,88.45413638680145,33.9456407,77.6568576
285577,Chandigarh,Arunachal Pradesh,1304.28,Vistara,delux,31052.69,2.61,2:36h,114079,504150,2021-04-23 08:55:41.421437,30.72984395,76.78414567016054,28.0937702,94.5921326


In [35]:
d1[["from_lat", "from_lon", "to_lat", "to_lon"]] = d1[["from_lat", "from_lon", "to_lat", "to_lon"]].astype("float")

In [36]:
d1 = d1.iloc[1:, :]

**Finding original distance between two locations**

In [37]:
def real_dist(lst):
  from_ = (lst[0], lst[1])
  to = (lst[2], lst[3])
  return geodesic(from_, to).km

In [38]:
dist = []
for i in d1.iloc[:, -4 : ].values:
  dist.append(real_dist(i))

In [39]:
len(dist)

285578

**Finnding the cost for the original distance**

In [None]:
d1["Cost per km"] = d1["price"] / d1["distance"]
d1["Distance"] = dist
d1["Travelling time(In Minutes)"] = (d1["Distance"] // 500) * 60
d1["Price"] = d1["Distance"] * d1["Cost per km"]

**Shedding off the unwanted features**

In [42]:
d1.drop(["distance", "price", "time", "timeMsg", "userCode", "travelCode", "date"], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d1.drop(["distance", "price", "time", "timeMsg", "userCode", "travelCode", "date"], axis = 1, inplace = True)


**Structuring Data**

In [43]:
d1.columns = ["Boarding Point", "Destination", "Flight agency", "Class", "Boarding Latitude", "Boarding Longitude", "Destination Latitude", "Destination Longitude", "Cost per Km", "Total Distance", "Travelling time(In Minutes)", "Travel Cost"]

In [44]:
d1.head()

Unnamed: 0,Boarding Point,Destination,Flight agency,Class,Boarding Latitude,Boarding Longitude,Destination Latitude,Destination Longitude,Cost per Km,Total Distance,Travelling time(In Minutes),Travel Cost
1,Odisha,Manipur,SpiceJet,economic,20.543124,84.689732,24.720882,93.922939,9.382738,1055.482348,120.0,9903.314857
2,Maharashtra,Dadra and Nagar Haveli and Daman and Diu,IndiGo,economic,18.906836,75.674158,20.718175,70.932383,15.139018,535.696905,60.0,8109.925188
3,Goa,Gulf of Kutch,Air India,premium,15.300454,74.085513,22.66128,69.762558,22.578011,933.123909,60.0,21068.081636
4,Ladakh,Punjab,Air India,premium,33.945641,77.656858,30.929321,75.500484,34.955722,391.135452,0.0,13672.422105
5,Maharashtra,Punjab,SpiceJet,premium,18.906836,75.674158,30.929321,75.500484,23.149844,1331.897476,120.0,30833.218173


**Converting the json to Data Frame - Hotel Data**

In [46]:
dh = pd.DataFrame(lodgesSimulated[0], index = [0])
for i in range(len(lodgesSimulated)):
  dth = pd.DataFrame(lodgesSimulated[i], index = [0])
  dh = pd.concat([dh, dth], axis = 0, ignore_index = True)

In [47]:
dh

Unnamed: 0,code,name,price,place,userCode,date,days,total,travelCode
0,DG,Hotel DG,1678.17,Manipur,0,2021-04-23 08:55:41.421437,4,6712.68,218574
1,DG,Hotel DG,1678.17,Manipur,0,2021-04-23 08:55:41.421437,4,6712.68,218574
2,DK,Hotel DK,3054.56,Dadra and Nagar Haveli and Daman and Diu,0,2023-04-24 09:37:18.831615,4,12218.24,218575
3,DU,Hotel DU,3185.50,Gulf of Kutch,0,2023-04-24 09:37:18.847775,9,28669.50,218576
4,BQ,Hotel BQ,3226.92,Punjab,0,2023-04-24 09:37:18.858922,7,22588.44,218577
...,...,...,...,...,...,...,...,...,...
285574,CL,Hotel CL,3914.10,Puducherry,114078,2023-04-24 10:16:19.886271,5,19570.50,504147
285575,CH,Hotel CH,2854.13,Sikkim,114078,2023-04-24 10:16:19.893783,8,22833.04,504148
285576,AR,Hotel AR,1006.47,Ladakh,114078,2023-04-24 10:16:19.901226,5,5032.35,504149
285577,BC,Hotel BC,2284.41,Arunachal Pradesh,114079,2021-04-23 08:55:41.421437,7,15990.87,504150


In [48]:
dh = dh.iloc[1:, :]

**Structuring Hotel Data**

In [49]:
dh_refined = dh.loc[:, ["name", "price", "place", "days", "total"]]
dh_refined.columns = ["Hotel Name", "Cost per day", "Hotel place", "Travel days", "Total stay cost"]

**Merging two data**

In [50]:
travel_data = d1.copy()
hotel_data = dh_refined.copy()

In [51]:
Tourism_data = pd.concat([travel_data, hotel_data], axis = 1, join = "outer")

In [52]:
Tourism_data

Unnamed: 0,Boarding Point,Destination,Flight agency,Class,Boarding Latitude,Boarding Longitude,Destination Latitude,Destination Longitude,Cost per Km,Total Distance,Travelling time(In Minutes),Travel Cost,Hotel Name,Cost per day,Hotel place,Travel days,Total stay cost
1,Odisha,Manipur,SpiceJet,economic,20.543124,84.689732,24.720882,93.922939,9.382738,1055.482348,120.0,9903.314857,Hotel DG,1678.17,Manipur,4,6712.68
2,Maharashtra,Dadra and Nagar Haveli and Daman and Diu,IndiGo,economic,18.906836,75.674158,20.718175,70.932383,15.139018,535.696905,60.0,8109.925188,Hotel DK,3054.56,Dadra and Nagar Haveli and Daman and Diu,4,12218.24
3,Goa,Gulf of Kutch,Air India,premium,15.300454,74.085513,22.661280,69.762558,22.578011,933.123909,60.0,21068.081636,Hotel DU,3185.50,Gulf of Kutch,9,28669.50
4,Ladakh,Punjab,Air India,premium,33.945641,77.656858,30.929321,75.500484,34.955722,391.135452,0.0,13672.422105,Hotel BQ,3226.92,Punjab,7,22588.44
5,Maharashtra,Punjab,SpiceJet,premium,18.906836,75.674158,30.929321,75.500484,23.149844,1331.897476,120.0,30833.218173,Hotel BQ,3226.92,Punjab,7,22588.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285574,Jharkhand,Puducherry,SpiceJet,economic,23.455981,85.255730,10.915649,79.806949,15.573662,1503.498907,180.0,23414.983827,Hotel CL,3914.10,Puducherry,5,19570.50
285575,Jammu and Kashmir,Sikkim,IndiGo,economic,33.664930,75.162958,27.601029,88.454136,14.172018,1438.928552,120.0,20392.522031,Hotel CH,2854.13,Sikkim,8,22833.04
285576,Sikkim,Ladakh,Emmirates,premium,27.601029,88.454136,33.945641,77.656858,31.820314,1249.021172,120.0,39744.245980,Hotel AR,1006.47,Ladakh,5,5032.35
285577,Chandigarh,Arunachal Pradesh,Vistara,delux,30.729844,76.784146,28.093770,94.592133,23.808300,1750.833649,180.0,41684.373406,Hotel BC,2284.41,Arunachal Pradesh,7,15990.87


**Function to add the travel agency name**

In [54]:
def travel_agent(x) :
  if x>= 0 and x <= 51405:
    return "MakeMyTrip"
  elif x > 51405 and x <= 102810:
    return "Trivago"
  elif x > 102810 and x <= 151359:
    return "Goibibo"
  elif x > 151359 and x <= 197052:
    return "Yatra"
  elif x > 197052 and x <= 228466:
    return "SOTC"
  elif x > 228466 and x <= 257025:
    return "Expedia"
  elif x > 257025 and x <= 285583:
    return "ClearTrip"

#creating the data for travel agency column
agency = []
for i in range(Tourism_data.shape[0]):
  agency.append(travel_agent(i))

#Inserting the column
Tourism_data.insert(2, "Travel Agency", agency)

#Adding the final target columnn - Total Package cost
Tourism_data["Total Package"] = Tourism_data["Travel Cost"] + Tourism_data["Total stay cost"]

In [55]:
Tourism_data

Unnamed: 0,Boarding Point,Destination,Travel Agency,Flight agency,Class,Boarding Latitude,Boarding Longitude,Destination Latitude,Destination Longitude,Cost per Km,Total Distance,Travelling time(In Minutes),Travel Cost,Hotel Name,Cost per day,Hotel place,Travel days,Total stay cost,Total Package
1,Odisha,Manipur,MakeMyTrip,SpiceJet,economic,20.543124,84.689732,24.720882,93.922939,9.382738,1055.482348,120.0,9903.314857,Hotel DG,1678.17,Manipur,4,6712.68,16615.994857
2,Maharashtra,Dadra and Nagar Haveli and Daman and Diu,MakeMyTrip,IndiGo,economic,18.906836,75.674158,20.718175,70.932383,15.139018,535.696905,60.0,8109.925188,Hotel DK,3054.56,Dadra and Nagar Haveli and Daman and Diu,4,12218.24,20328.165188
3,Goa,Gulf of Kutch,MakeMyTrip,Air India,premium,15.300454,74.085513,22.661280,69.762558,22.578011,933.123909,60.0,21068.081636,Hotel DU,3185.50,Gulf of Kutch,9,28669.50,49737.581636
4,Ladakh,Punjab,MakeMyTrip,Air India,premium,33.945641,77.656858,30.929321,75.500484,34.955722,391.135452,0.0,13672.422105,Hotel BQ,3226.92,Punjab,7,22588.44,36260.862105
5,Maharashtra,Punjab,MakeMyTrip,SpiceJet,premium,18.906836,75.674158,30.929321,75.500484,23.149844,1331.897476,120.0,30833.218173,Hotel BQ,3226.92,Punjab,7,22588.44,53421.658173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285574,Jharkhand,Puducherry,ClearTrip,SpiceJet,economic,23.455981,85.255730,10.915649,79.806949,15.573662,1503.498907,180.0,23414.983827,Hotel CL,3914.10,Puducherry,5,19570.50,42985.483827
285575,Jammu and Kashmir,Sikkim,ClearTrip,IndiGo,economic,33.664930,75.162958,27.601029,88.454136,14.172018,1438.928552,120.0,20392.522031,Hotel CH,2854.13,Sikkim,8,22833.04,43225.562031
285576,Sikkim,Ladakh,ClearTrip,Emmirates,premium,27.601029,88.454136,33.945641,77.656858,31.820314,1249.021172,120.0,39744.245980,Hotel AR,1006.47,Ladakh,5,5032.35,44776.595980
285577,Chandigarh,Arunachal Pradesh,ClearTrip,Vistara,delux,30.729844,76.784146,28.093770,94.592133,23.808300,1750.833649,180.0,41684.373406,Hotel BC,2284.41,Arunachal Pradesh,7,15990.87,57675.243406


In [56]:
Tourism_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285578 entries, 1 to 285578
Data columns (total 19 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Boarding Point               285578 non-null  object 
 1   Destination                  285578 non-null  object 
 2   Travel Agency                285578 non-null  object 
 3   Flight agency                285578 non-null  object 
 4   Class                        285578 non-null  object 
 5   Boarding Latitude            285578 non-null  float64
 6   Boarding Longitude           285578 non-null  float64
 7   Destination Latitude         285578 non-null  float64
 8   Destination Longitude        285578 non-null  float64
 9   Cost per Km                  285578 non-null  float64
 10  Total Distance               285578 non-null  float64
 11  Travelling time(In Minutes)  285578 non-null  float64
 12  Travel Cost                  285578 non-null  float64
 13 

In [60]:
Tourism_data["Travelling time(In Minutes)"] = Tourism_data["Travelling time(In Minutes)"].astype("int16")
Tourism_data["Cost per Km"] = Tourism_data["Cost per Km"].astype("float32")
Tourism_data["Total Distance"] = Tourism_data["Total Distance"].astype("float32")
Tourism_data["Travel Cost"] = Tourism_data["Travel Cost"].astype("float32")
Tourism_data["Cost per day"] = Tourism_data["Cost per day"].astype("float32")
Tourism_data["Travel days"] = Tourism_data["Travel days"].astype("int8")
Tourism_data["Total stay cost"] = Tourism_data["Total stay cost"].astype("float32")
Tourism_data["Total Package"] = Tourism_data["Total Package"].astype("float32")
Tourism_data["Boarding Latitude"] = Tourism_data["Boarding Latitude"].astype("float32")
Tourism_data["Boarding Longitude"] = Tourism_data["Boarding Longitude"].astype("float32")
Tourism_data["Destination Latitude"] = Tourism_data["Destination Latitude"].astype("float32")
Tourism_data["Destination Longitude"] = Tourism_data["Destination Longitude"].astype("float32")

**Optimizing data types for better dataset size**

In [61]:
Tourism_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285578 entries, 1 to 285578
Data columns (total 19 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Boarding Point               285578 non-null  object 
 1   Destination                  285578 non-null  object 
 2   Travel Agency                285578 non-null  object 
 3   Flight agency                285578 non-null  object 
 4   Class                        285578 non-null  object 
 5   Boarding Latitude            285578 non-null  float32
 6   Boarding Longitude           285578 non-null  float32
 7   Destination Latitude         285578 non-null  float32
 8   Destination Longitude        285578 non-null  float32
 9   Cost per Km                  285578 non-null  float32
 10  Total Distance               285578 non-null  float32
 11  Travelling time(In Minutes)  285578 non-null  int16  
 12  Travel Cost                  285578 non-null  float32
 13 

In [63]:
Tourism_data.tail()

Unnamed: 0,Boarding Point,Destination,Travel Agency,Flight agency,Class,Boarding Latitude,Boarding Longitude,Destination Latitude,Destination Longitude,Cost per Km,Total Distance,Travelling time(In Minutes),Travel Cost,Hotel Name,Cost per day,Hotel place,Travel days,Total stay cost,Total Package
285574,Jharkhand,Puducherry,ClearTrip,SpiceJet,economic,23.45598,85.25573,10.915648,79.806946,15.573662,1503.498901,180,23414.984375,Hotel CL,3914.100098,Puducherry,5,19570.5,42985.484375
285575,Jammu and Kashmir,Sikkim,ClearTrip,IndiGo,economic,33.664928,75.162956,27.601028,88.45414,14.172018,1438.928589,120,20392.521484,Hotel CH,2854.129883,Sikkim,8,22833.039062,43225.5625
285576,Sikkim,Ladakh,ClearTrip,Emmirates,premium,27.601028,88.45414,33.945641,77.65686,31.820314,1249.021118,120,39744.246094,Hotel AR,1006.469971,Ladakh,5,5032.350098,44776.597656
285577,Chandigarh,Arunachal Pradesh,ClearTrip,Vistara,delux,30.729843,76.784149,28.093771,94.592133,23.8083,1750.833618,180,41684.375,Hotel BC,2284.409912,Arunachal Pradesh,7,15990.870117,57675.242188
285578,Gujarat,Karnataka,ClearTrip,Vistara,delux,22.385006,71.745262,14.52039,75.722351,18.187141,966.386108,60,17575.800781,Hotel AH,1658.630005,Karnataka,10,16586.300781,34162.101562


In [65]:
tour_data_refined = Tourism_data.loc[:, ["Boarding Point", "Destination", "Travel Agency", "Flight agency", "Class", "Boarding Latitude", "Boarding Longitude", "Destination Latitude", "Destination Longitude", "Total Distance", "Hotel place", "Travel days", "Total Package"]]

In [66]:
tour_data_refined

Unnamed: 0,Boarding Point,Destination,Travel Agency,Flight agency,Class,Boarding Latitude,Boarding Longitude,Destination Latitude,Destination Longitude,Total Distance,Hotel place,Travel days,Total Package
1,Odisha,Manipur,MakeMyTrip,SpiceJet,economic,20.543123,84.689735,24.720882,93.922935,1055.482300,Manipur,4,16615.994141
2,Maharashtra,Dadra and Nagar Haveli and Daman and Diu,MakeMyTrip,IndiGo,economic,18.906836,75.674156,20.718176,70.932381,535.696899,Dadra and Nagar Haveli and Daman and Diu,4,20328.166016
3,Goa,Gulf of Kutch,MakeMyTrip,Air India,premium,15.300454,74.085510,22.661280,69.762558,933.123901,Gulf of Kutch,9,49737.582031
4,Ladakh,Punjab,MakeMyTrip,Air India,premium,33.945641,77.656860,30.929321,75.500481,391.135437,Punjab,7,36260.863281
5,Maharashtra,Punjab,MakeMyTrip,SpiceJet,premium,18.906836,75.674156,30.929321,75.500481,1331.897461,Punjab,7,53421.656250
...,...,...,...,...,...,...,...,...,...,...,...,...,...
285574,Jharkhand,Puducherry,ClearTrip,SpiceJet,economic,23.455980,85.255730,10.915648,79.806946,1503.498901,Puducherry,5,42985.484375
285575,Jammu and Kashmir,Sikkim,ClearTrip,IndiGo,economic,33.664928,75.162956,27.601028,88.454140,1438.928589,Sikkim,8,43225.562500
285576,Sikkim,Ladakh,ClearTrip,Emmirates,premium,27.601028,88.454140,33.945641,77.656860,1249.021118,Ladakh,5,44776.597656
285577,Chandigarh,Arunachal Pradesh,ClearTrip,Vistara,delux,30.729843,76.784149,28.093771,94.592133,1750.833618,Arunachal Pradesh,7,57675.242188


In [67]:
tour_data_refined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285578 entries, 1 to 285578
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Boarding Point         285578 non-null  object 
 1   Destination            285578 non-null  object 
 2   Travel Agency          285578 non-null  object 
 3   Flight agency          285578 non-null  object 
 4   Class                  285578 non-null  object 
 5   Boarding Latitude      285578 non-null  float32
 6   Boarding Longitude     285578 non-null  float32
 7   Destination Latitude   285578 non-null  float32
 8   Destination Longitude  285578 non-null  float32
 9   Total Distance         285578 non-null  float32
 10  Hotel place            285578 non-null  object 
 11  Travel days            285578 non-null  int8   
 12  Total Package          285578 non-null  float32
dtypes: float32(6), int8(1), object(6)
memory usage: 19.9+ MB


In [73]:
#tour_data_refined.iloc[ :142789, : ].to_csv("Travel Agency Data - Synthetic- Part 1.csv")
tour_data_refined.iloc[142789: , : ].to_csv("Travel Agency Data - Synthetic- Part 2.csv")

# **Since The Data file is Uploaded in Git Hub in two parts**