In [2]:
# import necessary libraries
from pathlib import Path

import pandas as pd
import spacy

# Step 1: Read in the dataset and structure it in a way that is beneficial for further processing.

In [4]:
# read in data
# currently reading in full dataset; small batch of data also available
# The relative location of the CSV depends on the working directory the
# notebook is started from, so both known locations are listed instead of
# toggling a commented-out line by hand.
DATA_PATH_CANDIDATES = (
    Path("data/pnlp_data_en.csv"),        # path when running notebook via VS Code
    Path("../../data/pnlp_data_en.csv"),  # path when running notebook via jupyter
)
# Use the first candidate that exists. If none exists, fall back to the first
# one so that read_csv raises FileNotFoundError for a concrete path.
data_path = next(
    (candidate for candidate in DATA_PATH_CANDIDATES if candidate.is_file()),
    DATA_PATH_CANDIDATES[0],
)
df = pd.read_csv(data_path, sep=";")
df.head()

Unnamed: 0,Report Grouping,Question Text,Comments
0,Large Department,Please tell us what is working well.,"we do what our customers need, we communicate ..."
1,Large Department,Please tell us what is working well.,Customs business development continues to grow...
2,Large Department,Please tell us what is working well.,"I think the team work hard, are committed to c..."
3,Large Department,Please tell us what is working well.,Overall working towards a customer centric env...
4,Large Department,Please tell us what is working well.,Customer centricity is a growing culture in th...


In [68]:
# Partition the answers by department size, then split each partition by
# survey question so every subset can be processed on its own.
large_department = df.loc[df['Report Grouping'].eq('Large Department')]
ld_q1 = large_department.loc[large_department['Question Text'].eq('Please tell us what is working well.')]
ld_q2 = large_department.loc[large_department['Question Text'].eq('Please tell us what needs to be improved.')]

small_department = df.loc[df['Report Grouping'].eq('Small Department')]
sd_q1 = small_department.loc[small_department['Question Text'].eq('Please tell us what is working well.')]
sd_q2 = small_department.loc[small_department['Question Text'].eq('Please tell us what needs to be improved.')]

# Step 2: Extract features and enrich the corresponding dataframes with them.

In [6]:
# Load the spaCy "en_core_web_sm" pipeline once; the tagging code further down
# (pos_tag and the sample loop) reads it through the global name `nlp`.
nlp = spacy.load("en_core_web_sm")

In [83]:
# Preview the answer texts of the "Large Department" / "working well" subset.
print(ld_q1.loc[:, 'Comments'])

0       we do what our customers need, we communicate ...
1       Customs business development continues to grow...
2       I think the team work hard, are committed to c...
3       Overall working towards a customer centric env...
4       Customer centricity is a growing culture in th...
                              ...                        
8018    Nothing is working well. industry standards ar...
8019    I think the additional training has been posit...
8020    I believe we have a good core structure, despi...
8021    People are gaining more confidence within thei...
8022    i enjoy working with some of the team and can ...
Name: Comments, Length: 8023, dtype: object


In [84]:
def pos_tag(df, verbose=False, limit=-1, text_col=2):
    """
    Extracts POS tags for every answer (important: one answer may contain multiple sentences) in a given dataframe and returns them as a list of lists.
    Every answer is passed to the module-level spaCy pipeline `nlp`; the `pos_` attribute of each resulting token is collected.

    df: The dataframe containing the answers to be tagged.
    verbose: Should the function report progress? False by default.
    limit: If only the first n answers should be tagged, give n as limit. Any negative value (-1 by default) or None tags the entire df.
    text_col: Column holding the answer text, given either as an integer position or as a column label.
              Defaults to position 2, the column this function originally read. Pass a different
              position or label for dataframes with another structure.

    Returns: A list containing one list of POS tag strings per tagged answer, in row order.
             A cell that does not hold a string (e.g. a missing value) yields an empty list,
             so the result stays aligned with the rows of df. An empty df yields an empty list.
    """
    # select the answer column explicitly by position or by label; integer keys
    # on a label-indexed row Series rely on a deprecated positional fallback
    if isinstance(text_col, int):
        answers = df.iloc[:, text_col]
    else:
        answers = df[text_col]

    # set variable encoding amount of total answers to tag
    total = len(answers)
    if limit is not None and limit >= 0:
        total = min(limit, total)

    # list used to store the individual lists of POS tags
    pos_tags = []

    # iterate over the first `total` answers only
    for position, answer in enumerate(answers.iloc[:total], start=1):
        # defines output of verbose run of function
        if verbose:
            print("Tagging answer", position, "of", total)

        if isinstance(answer, str):
            pos_tags.append([token.pos_ for token in nlp(answer)])
        else:
            # spaCy only accepts text input; keep a placeholder so indices still match the rows
            pos_tags.append([])

    return pos_tags

In [82]:
# POS-tag the first 100 answers of ld_q1 as a quick check of the tagger output.
# The answer text is selected by its column label: indexing a row Series with
# an integer key (row[1][2]) relies on a deprecated positional fallback.
pos_tags = [
    [token.pos_ for token in nlp(answer)]
    for answer in ld_q1['Comments'].head(100)
]
print(pos_tags)

[['PRON', 'AUX', 'PRON', 'DET', 'NOUN', 'VERB', 'PUNCT', 'PRON', 'VERB', 'ADV', 'PUNCT'], ['PROPN', 'NOUN', 'NOUN', 'VERB', 'PART', 'VERB', 'CCONJ', 'VERB', 'PUNCT', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'PROPN', 'PROPN', 'CCONJ', 'DET', 'NOUN', 'ADP', 'NOUN', 'DET', 'AUX', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN'], ['PRON', 'VERB', 'DET', 'NOUN', 'VERB', 'ADV', 'PUNCT', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT'], ['ADV', 'VERB', 'ADP', 'DET', 'NOUN', 'ADJ', 'NOUN', 'AUX', 'VERB', 'ADV', 'CCONJ', 'ADV', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN'], ['NOUN', 'NOUN', 'AUX', 'DET', 'VERB', 'NOUN', 'ADP', 'DET', 'NOUN', 'VERB', 'DET', 'ADV', 'ADJ', 'NOUN', 'NOUN'], ['DET', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'CCONJ', 'VERB', 'DET', 'NOUN', 'ADP', 'NOUN', 'CCONJ', 'NOUN', 'SPACE', 'NUM', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'CCONJ', 'NOUN', 'PUNCT'], ['DET', 'NOUN', 'VERB', 'DET', 'NOUN', 'PUNCT'], ['PRON', 'AUX', 'ADV', 'VERB', 'NOUN', 'NO