In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
# Download every NLTK resource this notebook relies on so that it runs from a
# fresh environment: the English word list, the tokenizer models used by
# word_tokenize, the tagger models used by pos_tag, WordNet for the
# lemmatizer, and the stopword list.  Both the older and the newer resource
# names for the tokenizer and tagger are requested because the required name
# depends on the installed NLTK version; already-present packages are skipped.
for resource in (
    'words',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
    'stopwords',
):
    nltk.download(resource)

[nltk_data] Downloading package words to C:\Users\Jordan
[nltk_data]     Bowman\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
# Load the scraped reviews from disk into a DataFrame.
raw_reviews = pd.read_csv(
    "reviews.csv",
    low_memory=False,
)
# Preview the first few rows.
raw_reviews.head()

Unnamed: 0.1,Unnamed: 0,__typename,adminReviewedAt,attendanceMandatory,clarityRating,class,comment,createdByUser,date,difficultyRating,...,isForCredit,isForOnlineClass,legacyId,ratingTags,teacherNote,textbookUse,thumbs,thumbsDownTotal,thumbsUpTotal,wouldTakeAgain
0,0,Rating,2011-01-26 17:30:48 +0000 UTC,,5,HIS194,He is amazing! I had lots of fun in his class....,False,2011-01-26 17:21:05 +0000 UTC,1,...,False,False,18184368,,,5.0,[],0,0,
1,1,Rating,2010-04-06 11:26:35 +0000 UTC,,2,HIS194A,"Bad prof, simply put. Thankfully, Borgen is ap...",False,2010-01-11 21:59:48 +0000 UTC,2,...,False,False,16712359,,,5.0,[],0,0,
2,2,Rating,2010-04-01 21:59:55 +0000 UTC,,2,HIS194A,"Pretentious, disorganized, rants, and goes off...",False,2009-12-22 00:52:25 +0000 UTC,2,...,False,False,16620461,,,5.0,[],1,0,
3,3,Rating,2010-04-01 00:55:36 +0000 UTC,,2,HIS194A,"Very disorganized in lecture, does not go into...",False,2009-12-20 04:13:29 +0000 UTC,3,...,False,False,16608094,,,3.0,[],0,0,
4,4,Rating,2010-03-09 21:34:28 +0000 UTC,,3,JPN101,"Pretentious, scatter-brained, and enjoys belit...",False,2009-11-12 23:29:20 +0000 UTC,2,...,False,False,16361756,,,4.0,[],0,0,


In [3]:
# Subset to relevant information from the raw data: the three rating columns
# and the free-text comment.  The explicit .copy() makes `reviews` an
# independent frame, so later cells that add or overwrite columns never act
# on data shared with `raw_reviews`.
reviews = raw_reviews[['clarityRating', 'helpfulRating', 'difficultyRating', 'comment']].copy()
reviews

Unnamed: 0,clarityRating,helpfulRating,difficultyRating,comment
0,5,5,1,He is amazing! I had lots of fun in his class....
1,2,1,2,"Bad prof, simply put. Thankfully, Borgen is ap..."
2,2,1,2,"Pretentious, disorganized, rants, and goes off..."
3,2,2,3,"Very disorganized in lecture, does not go into..."
4,3,2,2,"Pretentious, scatter-brained, and enjoys belit..."
...,...,...,...,...
65888,2,2,4,Probably one of the worst classes I've taken h...
65889,2,2,4,Reading memos are time consuming and there is ...
65890,4,4,4,This class was great and the professor is real...
65891,4,4,4,Professor Clerge gave us good readings and add...


Although RateMyProfessors gives one score for each review, the score is actually composed of an average of two internal values for clarity and helpfulness. So we will create a new column of `qualityRating` that reflects the quality score actually presented on the website with each review.

In [4]:
# Recreates the "qualityRating" score from the mean of "clarityRating" and "helpfulRating".
# assign() returns a new frame (and overwrites the column if it already
# exists), so this cell can be re-run safely; the in-place insert() it
# replaces raises ValueError on a second run.  The trailing column selection
# keeps "qualityRating" in the same position insert(3, ...) placed it.
reviews = reviews.assign(
    qualityRating=(reviews.clarityRating + reviews.helpfulRating) / 2
)[['clarityRating', 'helpfulRating', 'difficultyRating', 'qualityRating', 'comment']]
reviews

Unnamed: 0,clarityRating,helpfulRating,difficultyRating,qualityRating,comment
0,5,5,1,5.0,He is amazing! I had lots of fun in his class....
1,2,1,2,1.5,"Bad prof, simply put. Thankfully, Borgen is ap..."
2,2,1,2,1.5,"Pretentious, disorganized, rants, and goes off..."
3,2,2,3,2.0,"Very disorganized in lecture, does not go into..."
4,3,2,2,2.5,"Pretentious, scatter-brained, and enjoys belit..."
...,...,...,...,...,...
65888,2,2,4,2.0,Probably one of the worst classes I've taken h...
65889,2,2,4,2.0,Reading memos are time consuming and there is ...
65890,4,4,4,4.0,This class was great and the professor is real...
65891,4,4,4,4.0,Professor Clerge gave us good readings and add...


Now remove all rows containing NAs.

In [5]:
# Discard every review (row) that has a missing value in any column.
reviews = reviews.dropna(axis='index', how='any')
reviews

Unnamed: 0,clarityRating,helpfulRating,difficultyRating,qualityRating,comment
0,5,5,1,5.0,He is amazing! I had lots of fun in his class....
1,2,1,2,1.5,"Bad prof, simply put. Thankfully, Borgen is ap..."
2,2,1,2,1.5,"Pretentious, disorganized, rants, and goes off..."
3,2,2,3,2.0,"Very disorganized in lecture, does not go into..."
4,3,2,2,2.5,"Pretentious, scatter-brained, and enjoys belit..."
...,...,...,...,...,...
65888,2,2,4,2.0,Probably one of the worst classes I've taken h...
65889,2,2,4,2.0,Reading memos are time consuming and there is ...
65890,4,4,4,4.0,This class was great and the professor is real...
65891,4,4,4,4.0,Professor Clerge gave us good readings and add...


In addition to the rows containing NAs, there are many rows containing only "No Comments" in their comment column. This is presumably a default response of the database in any case where the user did not input a comment. However, it may be interesting to examine the distribution of the `qualityRating` scores for these uncommented reviews.

In [None]:
# Subset the rows with "No Comments".
no_comment = reviews[reviews.comment == "No Comments"]
# Create a simple graph of the qualityRating of these rows: one bar per
# distinct score, ordered by score, with the number of reviews as its height.
fig, ax = plt.subplots()
no_comment.qualityRating.value_counts().sort_index().plot.bar(ax=ax)
ax.set(
    title='Quality ratings of reviews left without a comment',
    xlabel='qualityRating',
    ylabel='Number of reviews',
)
plt.show()


In [6]:
# Keep only the reviews whose comment differs from the "No Comments" placeholder.
reviews = reviews.loc[reviews.comment.ne("No Comments")]

Now, we will begin to process the actual comment strings themselves.

In [7]:
# Cast the comment column to the pandas "string" dtype; other columns are untouched.
reviews = reviews.assign(comment=reviews.comment.astype('string'))
reviews

Unnamed: 0,clarityRating,helpfulRating,difficultyRating,qualityRating,comment
0,5,5,1,5.0,He is amazing! I had lots of fun in his class....
1,2,1,2,1.5,"Bad prof, simply put. Thankfully, Borgen is ap..."
2,2,1,2,1.5,"Pretentious, disorganized, rants, and goes off..."
3,2,2,3,2.0,"Very disorganized in lecture, does not go into..."
4,3,2,2,2.5,"Pretentious, scatter-brained, and enjoys belit..."
...,...,...,...,...,...
65888,2,2,4,2.0,Probably one of the worst classes I've taken h...
65889,2,2,4,2.0,Reading memos are time consuming and there is ...
65890,4,4,4,4.0,This class was great and the professor is real...
65891,4,4,4,4.0,Professor Clerge gave us good readings and add...


In [8]:
# Convert "/" and "-" (used to join descriptions, e.g. "scatter-brained")
# into spaces so that each part is tokenized as its own word.
# The vectorized .str.replace calls do this without two Python-level loops
# over every comment, and assign() returns a new frame instead of writing
# into the existing one.  regex=False treats both characters literally.
reviews = reviews.assign(
    comment=reviews.comment
    .str.replace('/', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
)

In [9]:
# Split every comment into a list of word tokens (one list per comment).
comment_words = list(map(nltk.word_tokenize, reviews.comment))

In [10]:
# Lowercase every token, keeping the one-list-per-comment structure.
comment_words = [
    [token.lower() for token in tokens]
    for tokens in comment_words
]

We next need to lemmatize the words.

In [11]:
# Creates a list of lists of (word, part of speech) tuples, one inner list per comment.
# pos_tag_sents is NLTK's batch interface for tagging many token lists at
# once; it yields the same tags as calling pos_tag on each comment but sets
# up the tagger only once for the whole batch.
comment_words_pos = nltk.pos_tag_sents(comment_words)

In [12]:
# Function to convert Treebank part of speech tags into WordNet tags.
def wordnet_pos(treebank_pos):
    '''Map a Treebank part-of-speech tag onto the matching WordNet tag.

    The first letter of the Treebank tag decides the result: 'J' -> ADJ,
    'V' -> VERB, 'N' -> NOUN, 'R' -> ADV.  Any other tag returns None.
    '''
    # (Treebank prefix, name of the WordNet constant).  The constant is only
    # looked up on a match, so unmatched tags never touch the WordNet corpus.
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_pos.startswith(prefix):
            return getattr(nltk.corpus.wordnet, constant_name)
    return None

In [13]:
# Swap each Treebank tag for its WordNet equivalent (or None), keeping the words.
comment_words_pos = [
    [(token, wordnet_pos(tag)) for token, tag in tagged_comment]
    for tagged_comment in comment_words_pos
]

In [14]:
# Function to lemmatize a word with nltk.WordNetLemmatizer.
def wordnet_lemmatize(word, pos):
    '''Lemmatize a single word with an nltk.WordNetLemmatizer.

    word: the word to lemmatize.
    pos:  the word's part of speech as a WordNet tag, or None.  When None,
          the lemmatizer is called without a part of speech and falls back
          on its own default.

    Returns the lemmatized word.
    '''
    lemmatizer = nltk.WordNetLemmatizer()
    # No part of speech known: let the lemmatizer use its default.
    if pos is None:
        return lemmatizer.lemmatize(word)
    return lemmatizer.lemmatize(word, pos)

In [15]:
# Replace every (word, WordNet tag) pair with the word's lemma.
comment_words = [
    [wordnet_lemmatize(token, tag) for token, tag in tagged_comment]
    for tagged_comment in comment_words_pos
]

Now we will remove punctuation and stopwords.

In [16]:
# Punctuation tokens that should be dropped from every comment.
punct = ['.', ',', ';', ':', '-', '--', '(', ')', '[', ']', '...', '&', '!', '@', '$']
# Keep only the tokens that are not one of those punctuation marks.
comment_words = [
    [token for token in tokens if token not in punct]
    for tokens in comment_words
]

In [17]:
# English stopwords from the NLTK corpus.
stopwords = nltk.corpus.stopwords.words('english')
# 'not' is part of that list, but it carries sentiment negation, so keep it
# in the text by taking it out of the stopword list.
stopwords.remove('not')
# Drop every remaining stopword from each comment's tokens.
comment_words = [
    [token for token in tokens if token not in stopwords]
    for tokens in comment_words
]

There is some remaining non-English text that must be removed. This is a computational problem because the NLTK corpus of English words that we intend to use contains 236736 words. (This corpus does not contain conjugations, which is why we needed to lemmatize first.) Directly checking whether each word in our dataset is within that corpus would take a very long time. A much faster approach is to first create a set of all the unique words and then compare only each unique word against the corpus. From there, we can work backwards to eliminate all of the non-English words from the original list of words.

In [18]:
# Load NLTK's "words" corpus, the reference vocabulary used below to decide
# which tokens count as English.
eng_words = nltk.corpus.words.words()

In [19]:
# Flatten the per-comment token lists into one long list of tokens...
words_list = [token for tokens in comment_words for token in tokens]
# ...and collapse it to the distinct tokens.
unique_words = [*set(words_list)]

In [20]:
# Create a list of non-English words in our list of words.
# Membership is tested against a set built once from the corpus: a lookup in
# a set takes constant time on average, whereas `in` on the original list
# scans up to every corpus entry for each unique word.
eng_word_set = set(eng_words)
non_eng_words = [word for word in unique_words if word not in eng_word_set]

In [21]:
# Remove non-English words from list of words.
# This check runs once per token in the whole dataset, so the non-English
# words are put in a set first; testing against the list would rescan it
# for every token.
non_eng_word_set = set(non_eng_words)
comment_words = [[word for word in comment if word not in non_eng_word_set] for comment in comment_words]

Convert list of lists of words into a list of strings of words separated by spaces. We must do this because CSVs don't store datatypes.

In [22]:
# Collapse each comment's token list into one space-separated string.
comment_words = list(map(" ".join, comment_words))

Concatenate this final list back onto the dataframe.

In [24]:
# Add the cleaned text as a new "commentWords" column.  At this point each
# entry of comment_words is a single space-separated string (not a list),
# one per remaining review.
reviews = reviews.assign(commentWords=comment_words)
reviews.head()

Unnamed: 0,clarityRating,helpfulRating,difficultyRating,qualityRating,comment,commentWords
0,5,5,1,5.0,He is amazing! I had lots of fun in his class....,amazing lot fun class get use may seem little ...
1,2,1,2,1.5,"Bad prof, simply put. Thankfully, Borgen is ap...",bad prof simply put thankfully apparently reti...
2,2,1,2,1.5,"Pretentious, disorganized, rants, and goes off...",pretentious disorganize rant go tangent readin...
3,2,2,3,2.0,"Very disorganized in lecture, does not go into...",disorganize lecture not go detail explain term...
4,3,2,2,2.5,"Pretentious, scatter brained, and enjoys belit...",pretentious scatter brain enjoy student stupid...


In [25]:
# Save the cleaned reviews.
# NOTE(review): the index is written as an unnamed first column; reading the
# file back without index_col=0 will surface it as "Unnamed: 0" (as seen in
# the raw input above).  Consider index=False if no consumer needs it.
reviews.to_csv('reviews_cleaned.csv')