In [None]:
# if you haven't installed all the nltk assets required for all functions used in this notebook,
# just uncomment all the lines in this cell and run it once
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

In [None]:
# import necessary libraries
import pandas as pd
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re, string, random

# Global list of English stopwords taken from the NLTK stopwords corpus.
# It is read by remove_noise() to filter out stopwords; loading it requires the
# 'stopwords' asset mentioned in the download cell at the top of the notebook.
stop_words = stopwords.words('english')

# Read in dataset and structure it in a way that is beneficial for further processing.

In [None]:
# read in data
# currently reading in full dataset; small batch of data also available
# The location of the data folder relative to the working directory depends on how the
# notebook is launched, so every known location is tried in order instead of having to
# comment lines in and out:
#   "../../data/..."  path when running notebook via jupyter
#   "data/..."        path when running notebook via VS Code
DATA_PATH_CANDIDATES = ("../../data/pnlp_data_en.csv", "data/pnlp_data_en.csv")

for data_path in DATA_PATH_CANDIDATES:
    try:
        df = pd.read_csv(data_path, sep=";")
        break
    except FileNotFoundError:
        # notebook was not launched from the directory this candidate assumes
        pass
else:
    # no candidate could be opened: fail loudly instead of continuing without df
    raise FileNotFoundError(
        "pnlp_data_en.csv not found; tried: " + ", ".join(DATA_PATH_CANDIDATES)
    )

df.head()

In [None]:
# Partition the answers by department size and, within each size group, by survey question,
# so that every subset can be processed on its own.
QUESTION_WORKING_WELL = 'Please tell us what is working well.'
QUESTION_NEEDS_IMPROVEMENT = 'Please tell us what needs to be improved.'

# answers given in large departments
is_large_department = df['Report Grouping'] == 'Large Department'
large_department = df.loc[is_large_department]
ld_q1 = large_department.loc[large_department['Question Text'] == QUESTION_WORKING_WELL]
ld_q2 = large_department.loc[large_department['Question Text'] == QUESTION_NEEDS_IMPROVEMENT]

# answers given in small departments
is_small_department = df['Report Grouping'] == 'Small Department'
small_department = df.loc[is_small_department]
sd_q1 = small_department.loc[small_department['Question Text'] == QUESTION_WORKING_WELL]
sd_q2 = small_department.loc[small_department['Question Text'] == QUESTION_NEEDS_IMPROVEMENT]

## A Function for Data-Cleaning

In [None]:
def remove_noise(tweet_tokens):
    """
    Removes irrelevant information. Furthermore lemmatizes tokens and performs stopword elimination.

    tweet_tokens: List of string tokens belonging to one text. The tokens are POS-tagged as a
                  sequence, so they should be passed in their original order.

    Returns a new list with the lower-cased lemma of every token that is not empty, does not
    consist of punctuation characters only and is not contained in the global stop_words.
    """
    # Imported locally under an alias on purpose: a later cell of this notebook defines its
    # own function called `pos_tag`, which replaces the NLTK tagger in the global namespace
    # as soon as that cell has been run.
    from nltk.tag import pos_tag as nltk_pos_tag

    # created once per call instead of once per token
    lemmatizer = WordNetLemmatizer()
    # sets give constant-time membership tests
    stop_word_set = set(stop_words)
    punctuation_chars = set(string.punctuation)

    cleaned_tokens = []

    for token, tag in nltk_pos_tag(tweet_tokens):
        # remove hyperlinks (specific to twitter dataset in case we want to do transfer learning)
        # token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
        #               '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        # remove mentions of users (also specific to twitter)
        # token = re.sub("(@[A-Za-z0-9_]+)","", token)

        # simplify POS tags for lemmatizer: nouns -> 'n', verbs -> 'v', everything else is
        # lemmatized as an adjective ('a')
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # skip empty tokens
        if not token:
            continue
        # punctuation removal: checked per character, because a substring test against
        # string.punctuation lets multi-character tokens such as "..." or "''" slip through
        if all(char in punctuation_chars for char in token):
            continue
        # stopword elimination (the stopword list is lower-case)
        lowered = token.lower()
        if lowered in stop_word_set:
            continue

        cleaned_tokens.append(lowered)

        # @TODO: implement spell correction?
    return cleaned_tokens

### A Function for POS Tagging
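This was written for an approach that has since been abandoned. The code is kept here in case it is needed again. Note: the function below redefines the name `pos_tag`, which shadows the `pos_tag` imported from NLTK at the top of the notebook, and it relies on an `nlp` object that is not created in any of the cells above.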

In [None]:
def pos_tag(df, verbose=False, limit=-1, text_col=2, pipeline=None):
    """
    Extracts POS tags for every answer (important: one answer may contain multiple sentences) in a given dataframe and returns them as a list of lists.
    Note: The defaults are specified for the df structure of the complete dataset provided by deepsight. Use text_col for dataframes with a different structure.

    df: The dataframe containing the sentences to be tagged.
    verbose: Should the function report progress? False by default.
    limit: If only the first n sents should be tagged, give n as limit. Tags entire df by default
           (any negative value or None). A limit of 0 tags nothing.
    text_col: Column holding the answer text, either as integer position or as column name (str).
              Defaults to position 2.
    pipeline: Callable that takes the answer text and returns an iterable of tokens exposing a
              `pos_` attribute. Defaults to the global `nlp` object (presumably a spaCy
              pipeline - it has to be created before calling this function).

    Returns a list with one list of POS tags per processed answer, in row order.
    """
    # Resolved lazily: the global `nlp` only has to exist when no pipeline is passed in.
    analyse = nlp if pipeline is None else pipeline

    # amount of answers that will actually be tagged (also used for the progress output);
    # capped at the size of the dataframe so the reported total is never too large
    total = len(df.index)
    if limit is not None and limit >= 0:
        total = min(limit, total)

    # list used to store the individual lists of POS tags
    pos_tags = []

    # iterate over the first `total` rows of the dataframe
    for i, (_, answer) in enumerate(df.iloc[:total].iterrows()):
        # explicit positional access; plain [] with an integer on a row Series is label-based
        # and only falls back to positions via a deprecated pandas code path
        text = answer[text_col] if isinstance(text_col, str) else answer.iloc[text_col]

        # collect the POS tags of all tokens of the given answer
        pos_tags.append([token.pos_ for token in analyse(text)])

        # defines output of verbose run of function
        if verbose:
            print("Tagging answer", i+1, "of", total)

    return pos_tags

In [None]:
# testing the POS tag function on the first ten answers
taglist = pos_tag(ld_q1, verbose=True, limit=10)
# ld_q1 is a filtered subset of df and keeps the row labels of df, so the first answer is
# selected by position (.iloc); the label 0 is not guaranteed to exist in the subset
print(ld_q1['Comments'].iloc[0])
print(taglist[0])

### A Function for Lemmatization
This was written for an approach that has since been abandoned. The code is kept here in case it is needed again. Note: the function below relies on an `nlp` object that is not created in any of the cells above.

In [None]:
def lemmatize(df, verbose=False, limit=-1, text_col=2, pipeline=None):
    """
    Extracts lemmas for every answer (important: one answer may contain multiple sentences) in a given dataframe and returns them as a list of lists.
    Note: The defaults are specified for the df structure of the complete dataset provided by deepsight. Use text_col for dataframes with a different structure.

    df: The dataframe containing the sentences to be lemmatized.
    verbose: Should the function report progress? False by default.
    limit: If only the first n sents should be lemmatized, give n as limit. Lemmatizes entire df
           by default (any negative value or None). A limit of 0 lemmatizes nothing.
    text_col: Column holding the answer text, either as integer position or as column name (str).
              Defaults to position 2.
    pipeline: Callable that takes the answer text and returns an iterable of tokens exposing a
              `lemma_` attribute. Defaults to the global `nlp` object (presumably a spaCy
              pipeline - it has to be created before calling this function).

    Returns a list with one list of lemmas per processed answer, in row order.
    """
    # Resolved lazily: the global `nlp` only has to exist when no pipeline is passed in.
    analyse = nlp if pipeline is None else pipeline

    # amount of answers that will actually be lemmatized (also used for the progress output);
    # capped at the size of the dataframe so the reported total is never too large
    total = len(df.index)
    if limit is not None and limit >= 0:
        total = min(limit, total)

    # list used to store the individual lists of lemmas
    lemmas = []

    # iterate over the first `total` rows of the dataframe
    for i, (_, answer) in enumerate(df.iloc[:total].iterrows()):
        # explicit positional access; plain [] with an integer on a row Series is label-based
        # and only falls back to positions via a deprecated pandas code path
        text = answer[text_col] if isinstance(text_col, str) else answer.iloc[text_col]

        # collect the lemmas of all tokens of the given answer
        lemmas.append([token.lemma_ for token in analyse(text)])

        # defines output of verbose run of function
        if verbose:
            print("Lemmatizing answer", i+1, "of", total)

    return lemmas

In [None]:
# testing the lemmatization function on the first ten answers
lemmalist = lemmatize(ld_q1, verbose=True, limit=10)
# ld_q1 is a filtered subset of df and keeps the row labels of df, so the first answer is
# selected by position (.iloc); the label 0 is not guaranteed to exist in the subset
print(ld_q1['Comments'].iloc[0])
print(lemmalist[0])