-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
143 lines (120 loc) · 4.56 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas as pd
from bs4 import BeautifulSoup
def data_in_fives(file):
    '''
    takes: scraped datafile - pickle of review cards as extracted by
    beautifulsoup. Data is scraped as it is loaded - in groups of 5 reviews.
    extracts: username; month of post; rating; flight details; title of
    review and the review text itself.
    returns: list with one entry per scraped page, each entry a list of
    (who, when, rating, flight, title, text) tuples.
    '''
    all_data_in_fives = []
    for n, line in enumerate(file):
        line = str(line)
        soup = BeautifulSoup(line, 'lxml')
        # reviewer and date share one div, joined by the literal phrase
        # ' wrote a review '
        who_when_list = soup.find_all('div', {'class': "_2fxQ4TOx"}, limit=5)
        who = []
        when = []
        try:
            for who_when in who_when_list:
                who_when_split = who_when.text.split(' wrote a review ')
                who.append(who_when_split[0])
                when.append(who_when_split[1])
        except IndexError:
            # separator phrase missing from this card - note the page
            # number and carry on best-effort (was a bare except)
            print(n)
        ratings_list = soup.find_all('div', {'data-test-target': 'review-rating'})
        ratings = []
        try:
            for rating in ratings_list:
                rate = str(rating)
                # NOTE(review): rating digit is read at a fixed offset from
                # the end of the tag's markup - brittle against any markup
                # change; confirm against current page HTML
                ratings.append(rate[-17])
        except IndexError:
            print(n, 'ratings')
        flight_type_list = soup.find_all('div', {'class': 'hpZJCN7D'})
        flights = []
        try:
            for flight_type in flight_type_list:
                flight_soup = BeautifulSoup(str(flight_type), 'lxml')
                three_elements = flight_soup.find_all('div', {'class': '_3tp-5a1G'})
                flight = [element.text for element in three_elements]
                flights.append(', '.join(flight))
        except AttributeError:
            print(n, 'flights')
        title_list = soup.find_all('div', {'data-test-target': 'review-title'})
        titles = [title.text for title in title_list]
        text_list = soup.find_all('q', {'class': 'IRsGHoPm'})
        texts = []
        try:
            for text in text_list:
                texts.append(text.text)
        except AttributeError:
            print(n, 'texts')
        # materialise the zip so the page's records can be iterated more
        # than once (the original stored a one-shot iterator)
        all_data = list(zip(who, when, ratings, flights, titles, texts))
        all_data_in_fives.append(all_data)
    return all_data_in_fives
def dict_builder(all_data_in_fives):
    '''
    takes the per-page lists of review details and flattens them into one
    large dict keyed by a running integer, effectively concatenating the
    data ready for dataframe construction.
    '''
    keys = ['reviewer', 'review_date', 'rating', 'flight', 'title', 'text']
    # flatten pages into a single stream of records, then number them
    records = (record for page in all_data_in_fives for record in page)
    return {idx: dict(zip(keys, record))
            for idx, record in enumerate(records)}
def dataframe_constructor(dict_out):
    '''
    takes dictionary of reviews and metadata and creates a DataFrame with
    one row per review; rows with a duplicate 'text' value are dropped.
    returns the dataframe.
    '''
    # dict keys become columns, so transpose to get one row per review
    frame = pd.DataFrame(dict_out).T
    return frame.drop_duplicates(subset='text')
def stanford_to_csv(stanford_pp):
    '''
    takes stanford output file and returns a list (one per doc) of nested
    lists of stanford output per token.
    '''
    processed_sentence_list = []
    for doc in stanford_pp:
        rows = []
        # the leading counter is the 1-based review number, not a
        # sentence number
        for review_num, sentence in enumerate(doc.sentences, start=1):
            for token in sentence.words:
                rows.append([review_num, token.id, token.text, token.lemma,
                             token.upos, token.xpos, token.head,
                             token.deprel])
        processed_sentence_list.append(rows)
    return processed_sentence_list
def add_pre_processed_col(stanford_pp, dataframe, basename):
    '''
    takes a dataframe of airline reviews and a pickled stanford doc of the
    text column.
    returns the modified dataframe with extra columns: 'stanford_lemma'
    (lower-cased lemmas, punctuation removed), 'stanford_ner' (list of
    (entity_text, entity_type) tuples) and 'stanford_pos' (list of upos
    tags, punctuation removed).
    note: basename is currently unused; kept for interface compatibility.
    '''
    stanford_ner = []
    stanford_lemmatized = []
    stanford_pos = []
    for doc in stanford_pp:
        # lower-cased lemmas with punctuation tokens stripped
        lemma_list = [word.lemma.lower()
                      for sent in doc.sentences for word in sent.words
                      if word.upos != 'PUNCT']
        stanford_lemmatized.append(' '.join(lemma_list))
        # named entities as (text, type) pairs
        ner_list = [(ent.text, ent.type) for sent in doc.sentences
                    for ent in sent.ents]
        stanford_ner.append(ner_list)
        # universal POS tags, punctuation stripped in step with the lemmas
        pos_list = [word.upos for sent in doc.sentences for word in sent.words
                    if word.upos != 'PUNCT']
        stanford_pos.append(pos_list)
    dataframe['stanford_lemma'] = stanford_lemmatized
    # bug fix: ner and pos were computed but never attached to the frame,
    # despite the docstring promising extra columns
    dataframe['stanford_ner'] = stanford_ner
    dataframe['stanford_pos'] = stanford_pos
    return dataframe