In [87]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span
from dataclasses import dataclass
from typing import List, Dict
import re

## Cleaning

In [71]:
#data = pd.read_csv('../../output/scraped-ns/greviewsandredditcorpus.txt').dropna().reset_index(drop=True)
data = pd.read_csv('../../output/scraped-ns/all-camps-google-reviews.csv').rename(columns = {'Content': 'content'}).dropna().reset_index(drop=True)
#data = pd.read_csv('../../output/scraped-ns/nationalservicesg_combineddata.csv').dropna().reset_index(drop=True)
#data['word_count'] = data['content'].str.count(' ') + 1
#data.to_csv('../../output/scraped-ns/temp.csv', index=False)
data = data[['content']].replace('\r|\n', '', regex=True)
data

Unnamed: 0,content
0,Very Professional People down there. Recommend...
1,"Unlike what the other reviews may suggest, peo..."
2,The place overall is fine. Good experience. Ju...
3,I went there for my NS Checkup today 29 August...
4,Don't bother showing up at the timing assigned...
...,...
73,Clear RT or IPPT or IPPT here.
74,The location is too ulu. Fifty years ago this ...
75,IPPT here every year~
76,whete you get rewarded for exercising


In [72]:
data['content'][26]

'No cookhouse. NSF need to spend $$$ out of their meagre pay.'

In [108]:
ls = list(data['content'])
corpus = ' '.join(map(str, ls))

## Examine and redefine entities

In [116]:
nlp = spacy.load('en_core_web_sm')
ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}).from_disk("patterns.jsonl")
print("Pipeline:", nlp.pipe_names)

Pipeline: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'entity_ruler']


In [117]:
displacy.render(nlp(corpus), jupyter=True, style='ent')

### First rule: "the people here were adequately friendly"
- the place is really clean and efficient
- the place overall is fine
- this place is a complete waste of space and time
- all staff are professional
- place is not well kept, people working there are unbelievably rude
- location is too ulu

In [115]:
# Examine lines
ex = 'location is too ulu'
displacy.render(nlp(ex), style='dep', jupyter = True, options = {'distance': 120})

In [111]:
# Storing matches

matcher = Matcher(nlp.vocab)

@dataclass
class MatchRule:
    name: str
    patterns: List
        
patterns1 = [
    [
        {'LOWER': {'IN': ['place', 'location', 'staff']}},
        {'LEMMA': 'be'},
        {'POS': 'ADJ'}
    ]
]

initial_rule = MatchRule('initial_rule', patterns1)

def add_matches(matcher: Matcher, doc: Doc, i: int, matches: List):
    match_id, start, end = matches[i]
    string_id = nlp.vocab.strings[match_id]
    mlist = doc._.get(string_id)
    
    if (start, end) not in mlist:
        mlist.append((start, end))
        doc._.set(string_id, mlist)

Doc.set_extension(initial_rule.name, default=[], force=True)
matcher.add(initial_rule.name, initial_rule.patterns, on_match=add_matches)

# Match a subset of reviews

subset_length = 78

subset = [d["reviewText"] for i, d in enumerate(ls) if i < subset_length]
parsed_subset = [doc for doc in nlp.pipe(subset)]

for _ in matcher.pipe(parsed_subset):
    pass  # leave it to the add_matches callback

TypeError: string indices must be integers

### Second rule: "very professional people"
- rude staff
- lousy service
- inconvenient location
- unfriendly tone
- extremely poor and rude customer service

## Unused

In [35]:
data['content'][183:188]

x = data['content'][184]
ls = re.split('\s|(?<!\d)[,.?!](?!\d)', x)
filtered = list(filter(None, ls))
print(filtered)
len(filtered)

183    This is really scummy, trying to get a "easy a...
184                                                  NaN
185    Actually being PES B1 doesn't mean you will ge...
186    Hey man I understand where you're coming from ...
187    What about people who pretend to require a dow...
Name: content, dtype: object