In [1]:
# import necessary libraries
import pandas as pd
import spacy

# Read in dataset and structure it in a way that is beneficial for further processing.

In [3]:
# read in data
# NOTE: this loads the full dataset; a smaller batch of the data is also available.
# The relative path differs depending on how the notebook is started (see both variants below).
# Path when running the notebook via VS Code:
#df = pd.read_csv("data/pnlp_data_en.csv", sep=";")

# Path when running the notebook via Jupyter:
df = pd.read_csv("../../data/pnlp_data_en.csv", sep=";")
df.head()

Unnamed: 0,Report Grouping,Question Text,Comments
0,Large Department,Please tell us what is working well.,"we do what our customers need, we communicate ..."
1,Large Department,Please tell us what is working well.,Customs business development continues to grow...
2,Large Department,Please tell us what is working well.,"I think the team work hard, are committed to c..."
3,Large Department,Please tell us what is working well.,Overall working towards a customer centric env...
4,Large Department,Please tell us what is working well.,Customer centricity is a growing culture in th...


In [4]:
# Partition the survey answers for further processing:
# first by department size, then by the question that was answered.
large_department = df.loc[df['Report Grouping'].eq('Large Department')]
ld_q1 = large_department.loc[
    large_department['Question Text'].eq('Please tell us what is working well.')
]
ld_q2 = large_department.loc[
    large_department['Question Text'].eq('Please tell us what needs to be improved.')
]

small_department = df.loc[df['Report Grouping'].eq('Small Department')]
sd_q1 = small_department.loc[
    small_department['Question Text'].eq('Please tell us what is working well.')
]
sd_q2 = small_department.loc[
    small_department['Question Text'].eq('Please tell us what needs to be improved.')
]

# Evaluate whether Twitter-based datasets are viable as training data

Idea: from an initial evaluation of the data, it seemed that many answers were written in an almost stream-of-consciousness style, resembling tweets. I will attempt to compare our dataset to the publicly available Sentiment140 dataset (http://help.sentiment140.com/for-students). To help interpret the data, here is a description of its fields from the site mentioned above:

0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

1 - the id of the tweet (2087)

2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)

3 - the query (lyx). If there is no query, then this value is NO_QUERY.

4 - the user that tweeted (robotickilldozr)

5 - the text of the tweet (Lyx is cool)

In [8]:
# load the Sentiment140 training set
# header=None: no row is used as a header, so pandas labels the columns 0-5
#              (matching the field numbers in the description above)
# encoding="latin1": decode the file as Latin-1 instead of the default UTF-8
s140_train = pd.read_csv("../../data/sentiment140_training.csv", sep=",", encoding="latin1", header=None)
s140_train.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Extract features and enrich corresponding dataframes with them.

In [4]:
# Load spaCy's "en_core_web_sm" pipeline once; pos_tag() and lemmatize() below
# read this notebook-level `nlp` object, so this cell must run before they are called.
nlp = spacy.load("en_core_web_sm")

In [20]:
# preview the large-department answers to "Please tell us what is working well."
print(ld_q1['Comments'])

0       we do what our customers need, we communicate ...
1       Customs business development continues to grow...
2       I think the team work hard, are committed to c...
3       Overall working towards a customer centric env...
4       Customer centricity is a growing culture in th...
                              ...                        
8018    Nothing is working well. industry standards ar...
8019    I think the additional training has been posit...
8020    I believe we have a good core structure, despi...
8021    People are gaining more confidence within thei...
8022    i enjoy working with some of the team and can ...
Name: Comments, Length: 8023, dtype: object


### A Function for POS Tagging

In [12]:
def pos_tag(df, verbose=False, limit=-1, column=2, pipeline=None):
    """
    Extracts POS tags for every answer (important: one answer may contain multiple sentences) in a given dataframe and returns them as a list of lists.

    df: The dataframe containing the answers to be tagged.
    verbose: Should the function report progress? False by default.
    limit: If only the first n answers should be tagged, give n as limit. A negative value (default: -1) or None tags the entire df. A limit of 0 tags nothing; a limit larger than the df is clamped to the df length.
    column: Where to find the answer text. An int is interpreted as a column position, anything else as a column label. Defaults to position 2, which holds the answer text in the survey dataset this notebook loads; pass a different position/label for other dfs.
    pipeline: Callable that takes a string and returns an iterable of tokens exposing a `pos_` attribute. Defaults to the notebook-level spaCy pipeline `nlp`.

    Returns: A list with one list of POS tag strings per processed answer, in row order. Missing answers (None / NaN) yield an empty list so the result stays aligned with the rows.
    """
    # nothing to tag; also avoids a column lookup on a frame without rows/columns
    if df.empty:
        return []

    # select the text column explicitly rather than indexing each row Series with a
    # bare integer, which relies on deprecated positional fallback for labelled Series
    if isinstance(column, int):
        answers = df.iloc[:, column]
    else:
        answers = df[column]

    # restrict to the first `limit` answers; slicing clamps to the available rows
    if limit is not None and limit >= 0:
        answers = answers.iloc[:limit]

    # fall back to the pipeline loaded at notebook level
    if pipeline is None:
        pipeline = nlp

    # total number of answers that will actually be tagged (used for progress output)
    end = len(answers)

    # list used to store the individual lists of POS tags
    pos_tags = []

    # iterate over the selected answers
    for i, text in enumerate(answers, start=1):
        # `text != text` is only true for NaN, which is how pandas marks missing text
        if text is None or (isinstance(text, float) and text != text):
            tags = []
        else:
            tags = [token.pos_ for token in pipeline(text)]
        pos_tags.append(tags)

        # defines output of verbose run of function
        if verbose:
            print("Tagging answer", i, "of", end)

    return pos_tags

In [19]:
# testing the POS tag function on the first ten answers
taglist = pos_tag(ld_q1, verbose=True, limit=10)
# taglist is ordered by row position, so fetch the matching answer by position as well;
# a label lookup ([0]) only works while the subset happens to contain index label 0
print(ld_q1['Comments'].iloc[0])
print(taglist[0])

Tagging answer 1 of 10
Tagging answer 2 of 10
Tagging answer 3 of 10
Tagging answer 4 of 10
Tagging answer 5 of 10
Tagging answer 6 of 10
Tagging answer 7 of 10
Tagging answer 8 of 10
Tagging answer 9 of 10
Tagging answer 10 of 10
we do what our customers need, we communicate aperiodically.
['PRON', 'AUX', 'PRON', 'DET', 'NOUN', 'VERB', 'PUNCT', 'PRON', 'VERB', 'ADV', 'PUNCT']


### A Function for Lemmatization

In [23]:
def lemmatize(df, verbose=False, limit=-1, column=2, pipeline=None):
    """
    Extracts lemmas for every answer (important: one answer may contain multiple sentences) in a given dataframe and returns them as a list of lists.

    df: The dataframe containing the answers to be lemmatized.
    verbose: Should the function report progress? False by default.
    limit: If only the first n answers should be lemmatized, give n as limit. A negative value (default: -1) or None lemmatizes the entire df. A limit of 0 lemmatizes nothing; a limit larger than the df is clamped to the df length.
    column: Where to find the answer text. An int is interpreted as a column position, anything else as a column label. Defaults to position 2, which holds the answer text in the survey dataset this notebook loads; pass a different position/label for other dfs.
    pipeline: Callable that takes a string and returns an iterable of tokens exposing a `lemma_` attribute. Defaults to the notebook-level spaCy pipeline `nlp`.

    Returns: A list with one list of lemma strings per processed answer, in row order. Missing answers (None / NaN) yield an empty list so the result stays aligned with the rows.
    """
    # nothing to lemmatize; also avoids a column lookup on a frame without rows/columns
    if df.empty:
        return []

    # select the text column explicitly rather than indexing each row Series with a
    # bare integer, which relies on deprecated positional fallback for labelled Series
    if isinstance(column, int):
        answers = df.iloc[:, column]
    else:
        answers = df[column]

    # restrict to the first `limit` answers; slicing clamps to the available rows
    if limit is not None and limit >= 0:
        answers = answers.iloc[:limit]

    # fall back to the pipeline loaded at notebook level
    if pipeline is None:
        pipeline = nlp

    # total number of answers that will actually be lemmatized (used for progress output)
    end = len(answers)

    # list used to store the individual lists of lemmas
    lemmas = []

    # iterate over the selected answers
    for i, text in enumerate(answers, start=1):
        # `text != text` is only true for NaN, which is how pandas marks missing text
        if text is None or (isinstance(text, float) and text != text):
            answer_lemmas = []
        else:
            answer_lemmas = [token.lemma_ for token in pipeline(text)]
        lemmas.append(answer_lemmas)

        # defines output of verbose run of function
        if verbose:
            print("Lemmatizing answer", i, "of", end)

    return lemmas

In [24]:
# testing the lemmatization function on the first ten answers
lemmalist = lemmatize(ld_q1, verbose=True, limit=10)
# lemmalist is ordered by row position, so fetch the matching answer by position as well;
# a label lookup ([0]) only works while the subset happens to contain index label 0
print(ld_q1['Comments'].iloc[0])
print(lemmalist[0])

Lemmatizing answer 1 of 10
Lemmatizing answer 2 of 10
Lemmatizing answer 3 of 10
Lemmatizing answer 4 of 10
Lemmatizing answer 5 of 10
Lemmatizing answer 6 of 10
Lemmatizing answer 7 of 10
Lemmatizing answer 8 of 10
Lemmatizing answer 9 of 10
Lemmatizing answer 10 of 10
we do what our customers need, we communicate aperiodically.
['-PRON-', 'do', 'what', '-PRON-', 'customer', 'need', ',', '-PRON-', 'communicate', 'aperiodically', '.']
