In [54]:
# imports

import pandas as pd
from collections import Counter

# spaCy: load the small English model; it is handed to DframCy further down
import spacy
nlp = spacy.load('en_core_web_sm')
from dframcy import DframCy

# Stanza: fetch the default English models (the cell output shows the download log)
import stanza
stanza.download('en')

# English stopword list from NLTK, used later to filter the word-frequency counts
from nltk.corpus import stopwords
StopWords = stopwords.words('english')

# ASCII punctuation characters, used later to filter the word-frequency counts
import string 
punct = string.punctuation 

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json:   0%|   …

2022-07-26 18:42:44 INFO: Downloading default packages for language: en (English)...


Downloading http://nlp.stanford.edu/software/stanza/1.2.2/en/default.zip:   0%|          | 0.00/412M [00:00<?,…

2022-07-26 18:52:11 INFO: Finished downloading models and saved to C:\Users\Shark\stanza_resources.


## Loading the preprocessed dataframe

In [17]:
# Read the preprocessed dataframe and discard the columns that are not
# needed for the statistics below.
path = "data/cleaned_dataframe.csv"
filtered_df = pd.read_csv(path).drop(columns=['sentences', 'Unnamed: 0'])

# Collect the raw sentences and glue them into one space-separated string,
# which is the form the NLP pipelines are fed with later on.
instances = filtered_df['raw_sentences'].tolist()
joined_instances = ' '.join(instances)

In [16]:
# preview the first ten raw sentences as a quick sanity check
instances[:10]

['@115712 Can you please send us a private message , so that I can gain further details about your account ? .',
 '@115715 Please send me a private message so that I can send you the link to access your account .',
 "@115714 whenever I contact customer support , they tell me I have shortcode enabled on my account , but I have never in the 4 years I 've tried https://t.co/0G98RtNxPK .",
 '@Ask_Spectrum Would you like me to email you a copy of one since Spectrum is not updating your training ? .',
 '@Ask_Spectrum',
 'The correct way to do it is via an OCS Account Takeover and email Consent Form it does not need to be done in a local office .',
 '@115716',
 'The information pertaining to the account assumption is correct .',
 'This does need to be done at a local outlet wit ... https://t.co/P7XCmTzPQj .',
 "actually that 's a broken link you sent me and incorrect information https://t.co/V4yfrHR8VI ."]

# Collecting data statistics 

Here I collect some exploratory statistics on the data and save them in a format that can easily be visualized.
The following statistics are extracted:
- 20 most common words (function words and punctuation excluded)
- Distribution of POS-tags
- Distribution of Named Entity tags
- Sentiment analysis distribution

In [18]:
# open text in Dframcy module 
# DframCy is given the spaCy pipeline; the whole corpus is processed as one text.
dframcy = DframCy(nlp)
doc = dframcy.nlp(joined_instances)

# create dataframe with dFramcy features
# (the displayed output shows one row per token with text, character offsets,
# POS / tag, dependency and entity-type columns)
dframcy_df = dframcy.to_dataframe(doc)

# display Dframcy dataframe
dframcy_df

Unnamed: 0,token_text,token_start,token_end,token_pos_,token_tag_,token_dep_,token_head,token_ent_type_
0,@115712,0,7,PROPN,NNP,npadvmod,send,
1,Can,8,11,AUX,MD,aux,send,
2,you,12,15,PRON,PRP,nsubj,send,
3,please,16,22,INTJ,UH,intj,send,
4,send,23,27,VERB,VB,ROOT,send,
...,...,...,...,...,...,...,...,...
59752,just,300549,300553,ADV,RB,advmod,'re,
59753,a,300554,300555,DET,DT,det,tweet,
59754,tweet,300556,300561,NOUN,NN,npadvmod,away,
59755,away,300562,300566,ADV,RB,advmod,'re,


In [20]:
# Pull the token text, POS tag and entity type columns out as plain lists.
tokens, pos, ents = (
    dframcy_df[column].tolist()
    for column in ('token_text', 'token_pos_', 'token_ent_type_')
)

# Tally how often each distinct value occurs in every list.
# NOTE(review): the displayed frame shows a blank entity type for tokens that
# are not part of an entity, so ent_freq presumably also contains a bucket for
# non-entity tokens - confirm whether that is wanted in the distribution.
word_freq = Counter(tokens)
pos_freq = Counter(pos)
ent_freq = Counter(ents)


In [45]:
# Remove function words and punctuation from the word-frequency counter, then
# keep the 20 most frequent remaining tokens.
#
# The stopword list is compared against the lowercased token: the counter keys
# keep their original casing, so an exact match would leave capitalised
# stopwords such as "I" or "The" in the counts. A token counts as punctuation
# when every one of its characters is a punctuation character, which also
# covers multi-character tokens such as "..." that a per-character lookup of
# the counter keys would miss.
stop_set = set(StopWords)
punct_set = set(punct)

# iterate over a copy of the keys because the counter is modified in the loop
for word in list(word_freq):
    is_stopword = word.lower() in stop_set
    is_punctuation = all(char in punct_set for char in word)
    if is_stopword or is_punctuation:
        del word_freq[word]

# keep only 20 most frequent words
most_common = word_freq.most_common(20)

# split the (word, count) pairs into two parallel lists; both stay empty when
# nothing is left after filtering
common_words = [word for word, count in most_common]
common_counts = [count for word, count in most_common]

In [51]:
# Turn every frequency count into a two-column dataframe (label, count).

# most common words, built from the two parallel lists
word_df = pd.DataFrame({'word': common_words, 'count': common_counts})

# part-of-speech tags
pos_df = pd.DataFrame({'pos': list(pos_freq), 'count': list(pos_freq.values())})

# named-entity types
ent_df = pd.DataFrame({'ent': list(ent_freq), 'count': list(ent_freq.values())})

In [52]:
# Write each frequency dataframe to its own Excel file for visualization.
# The column header row is left out of the files (header=False).
for frame, target in (
    (word_df, "data/word_freq.xlsx"),
    (pos_df, "data/pos_freq.xlsx"),
    (ent_df, "data/ent_freq.xlsx"),
):
    frame.to_excel(target, header=False)

## Sentiment Analysis with Stanza

In [56]:
# sentiment analysis

# Build the default English Stanza pipeline.
# NOTE(review): this rebinds `nlp` (the spaCy model until here) and, below,
# `doc` (the spaCy document until here). Re-running an earlier cell after this
# one would therefore pick up the Stanza objects - consider distinct names.
nlp = stanza.Pipeline('en')

# Run the pipeline over the joined corpus; Stanza does its own sentence
# splitting, so the sentences below need not line up with the original rows.
doc = nlp(joined_instances)

# Text and sentiment score of every sentence, as two parallel lists.
sentence = [sent.text for sent in doc.sentences]
sentiment = [sent.sentiment for sent in doc.sentences]

2022-07-26 19:03:04 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2022-07-26 19:03:04 INFO: Use device: cpu
2022-07-26 19:03:04 INFO: Loading: tokenize
2022-07-26 19:03:04 INFO: Loading: pos
2022-07-26 19:03:04 INFO: Loading: lemma
2022-07-26 19:03:04 INFO: Loading: depparse
2022-07-26 19:03:05 INFO: Loading: sentiment
2022-07-26 19:03:05 INFO: Loading: ner
2022-07-26 19:03:06 INFO: Done loading processors!


In [69]:
# put results in dataframe 

sentiment_freq = Counter(sentiment)

# Stanza encodes sentence sentiment as 0 = negative, 1 = neutral, 2 = positive.
# Each count is looked up by its code, so a label can never be paired with the
# count of another class (the counter's own order is just the order in which
# the codes were first seen), and a class that does not occur is reported as 0
# instead of breaking the dataframe construction.
# The row order (neutral, negative, positive) is kept as it was.
sentiment_labels = {1: 'neutral', 0: 'negative', 2: 'positive'}

keys = list(sentiment_labels.values())
vals = [sentiment_freq[code] for code in sentiment_labels]
d = {'sentiment': keys, 'count': vals}
sentiment_df = pd.DataFrame(d)

# save dataframe to file for visualization
sentiment_df.to_excel("data/sentiment_frequency.xlsx", header=False)

In [64]:
# Pair every Stanza sentence with its sentiment score and show the first ten
# rows for a manual sanity check.
df = pd.DataFrame({'sentence': sentence, 'sentiment': sentiment})
df.head(10)

Unnamed: 0,sentence,sentiment
0,@115712,1
1,"Can you please send us a private message , so ...",1
2,Please send me a private message so that I can...,0
3,Would you like me to email you a copy of one s...,0
4,. @Ask_Spectrum,1
5,The correct way to do it is via an OCS Account...,0
6,The information pertaining to the account assu...,1
7,This does need to be done at a local outlet wi...,1
8,https://t.co/P7XCmTzPQj . actually that 's a b...,1
9,My apologies for any frustrations or inconveni...,0
