# trustpilot_json_parser_glom.py
#
# Scrapes a Trustpilot review page through the Smartproxy Scraping API,
# extracts the embedded Next.js __NEXT_DATA__ JSON payload, and flattens
# it into page_info.json using glom dotted-path lookups.
import json
import requests
from bs4 import BeautifulSoup
from glom import glom
# Smartproxy Web Scraping API task endpoint (synchronous "universal" target).
url = "https://scrape.smartproxy.com/v1/tasks?universal="

# Scraping-task request body: fetch the Trustpilot review page as rendered
# HTML ("headless": "html") with a desktop device profile.
payload = {
    "target": "universal",
    "url": "https://www.trustpilot.com/review/panaceafinancial.com",
    "headless": "html",
    "device_type": "desktop"
}

# NOTE(review): "Basic AUTH" is a placeholder — substitute real base64-encoded
# credentials at run time; never commit live credentials to source control.
headers = {
    "accept": "application/json",
    "content-type": "application/json",
    "authorization": "Basic AUTH"
}
def _fetch_rendered_html():
    """POST the scraping task and return the rendered page HTML string.

    Raises:
        requests.HTTPError: on a non-2xx API response.
        KeyError / IndexError: if the API JSON lacks ``results[0].content``.
    """
    # timeout keeps a stalled request from hanging the script forever
    response = requests.post(url, json=payload, headers=headers, timeout=60)
    response.raise_for_status()  # fail fast instead of parsing an error body
    # response.json() parses the body directly; the original re-parsed
    # response.text with json.loads, which is redundant.
    return response.json()['results'][0]['content']


def _extract_next_data(html):
    """Return the parsed ``__NEXT_DATA__`` JSON payload embedded in *html*.

    Trustpilot is a Next.js site: the full page state is serialized into a
    ``<script id="__NEXT_DATA__">`` tag.

    Raises:
        ValueError: if the script tag is absent (layout change or error page),
            instead of the opaque AttributeError the original raised.
    """
    soup = BeautifulSoup(html, "html.parser")
    data_script = soup.select_one('script#__NEXT_DATA__')
    if data_script is None:
        raise ValueError("No __NEXT_DATA__ script tag found in page HTML")
    return json.loads(data_script.text)


def _build_page_info(data):
    """Flatten the Next.js page data into a single summary dict.

    Uses ``glom(..., default=None)`` so a missing optional field yields None
    rather than aborting the whole run with a PathAccessError.
    """
    def get(target, path):
        # tolerant dotted-path lookup
        return glom(target, path, default=None)

    businessUnit = glom(data, 'props.pageProps.businessUnit', default={})
    filters = glom(data, 'props.pageProps.filters', default={})

    return {
        'PageURL': get(data, 'props.pageProps.pageUrl'),
        'DisplayName': get(businessUnit, 'displayName'),
        'IdentifyingName': get(businessUnit, 'identifyingName'),
        'TrustScore': get(businessUnit, 'trustScore'),
        'WebsiteTitle': get(businessUnit, 'websiteTitle'),
        'ProfileImageURL': get(businessUnit, 'profileImageUrl'),
        'Stars': get(businessUnit, 'stars'),
        'CategoryName': get(businessUnit, 'categories.0.name'),
        'CategoryID': get(businessUnit, 'categories.0.id'),
        'CategoryRank': get(businessUnit, 'categories.0.rank'),
        'TopLevelDisplayName': get(businessUnit, 'breadcrumb.topLevelDisplayName'),
        'NumberOfLocations': get(businessUnit, 'locationsCount'),
        'IsClosed': get(businessUnit, 'isClosed'),
        'IsCollectingReviews': get(businessUnit, 'isCollectingReviews'),
        'VerifiedByGoogle': get(businessUnit, 'verification.verifiedByGoogle'),
        'VerifiedBusiness': get(businessUnit, 'verification.verifiedBusiness'),
        'VerifiedPaymentMethod': get(businessUnit, 'verification.verifiedPaymentMethod'),
        'HasCollectedIncentivizedReviews': get(businessUnit, 'hasCollectedIncentivisedReviews'),
        'Email': get(businessUnit, 'contactInfo.email'),
        'Address': get(businessUnit, 'contactInfo.address'),
        'City': get(businessUnit, 'contactInfo.city'),
        'Country': get(businessUnit, 'contactInfo.country'),
        'PhoneNumber': get(businessUnit, 'contactInfo.phone'),
        'ZIPCode': get(businessUnit, 'contactInfo.zipCode'),
        'IsUsingPaidFeatures': get(businessUnit, 'activity.isUsingPaidFeatures'),
        'HasSubscription': get(businessUnit, 'activity.hasSubscription'),
        'IsAskingForReviews': get(businessUnit, 'activity.isAskingForReviews'),
        'ClaimedDate': get(businessUnit, 'activity.claimedDate'),
        'AverageDaystoReply': get(businessUnit, 'activity.replyBehavior.averageDaysToReply'),
        'LastReplyToNegativeReview': get(businessUnit, 'activity.replyBehavior.lastReplyToNegativeReview'),
        'NegativeReviewsWithReplies': get(businessUnit, 'activity.replyBehavior.negativeReviewsWithRepliesCount'),
        'ReplyPercentage': get(businessUnit, 'activity.replyBehavior.replyPercentage'),
        'TotalNegativeReviews': get(businessUnit, 'activity.replyBehavior.totalNegativeReviewsCount'),
        'Reviews': get(data, 'props.pageProps.reviews'),
        'NumberOfReviews': get(filters, 'totalNumberOfReviews'),
        'NumberOfFilteredReviews': get(filters, 'totalNumberOfFilteredReviews'),
        'CurrentPage': get(filters, 'pagination.currentPage'),
        'ReviewsPerPage': get(filters, 'pagination.perPage'),
        'TotalReviewCount': get(filters, 'pagination.totalCount'),
        'TotalPages': get(filters, 'pagination.totalPages'),
        'RatingsCount': get(filters, 'reviewStatistics.ratings.total'),
        'OneStarRatingsCount': get(filters, 'reviewStatistics.ratings.one'),
        'TwoStarRatingsCount': get(filters, 'reviewStatistics.ratings.two'),
        'ThreeStarRatingsCount': get(filters, 'reviewStatistics.ratings.three'),
        'FourStarRatingsCount': get(filters, 'reviewStatistics.ratings.four'),
        'FiveStarRatingsCount': get(filters, 'reviewStatistics.ratings.five')
    }


def main():
    """Scrape the Trustpilot review page and dump a summary to page_info.json."""
    html = _fetch_rendered_html()
    data = _extract_next_data(html)
    page_info = _build_page_info(data)
    # utf-8 + ensure_ascii=False preserves non-ASCII names; indent makes the
    # output readable for humans.
    with open('page_info.json', 'w', encoding='utf-8') as outfile:
        json.dump(page_info, outfile, indent=2, ensure_ascii=False)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()