In [38]:
import pandas as pd
import os
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import random

[nltk_data] Downloading package stopwords to C:\Users\Will
[nltk_data]     Boyd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Will
[nltk_data]     Boyd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
# Load in the dictionary dataset into a pandas df
data_path = os.path.join(os.getcwd(), 'data', 'dictionary.txt')
dic = pd.read_csv(data_path, sep='|', header=None)

# ...and do the same with the sentiment_labels data
data_path = os.path.join(os.getcwd(), 'data', 'sentiment_labels.txt')
cents = pd.read_csv(data_path, sep='|')

dic = dic.rename(columns={0: "feature", 1: "ID"})# We rename the columns
# dic = dic.rename(index={range(239232)})
dic = dic.sort_values(by="ID") # We change the order of the rows to be sorted by ID number
movie_data = dic[["ID", "feature"]] # We change the order of the columns and change the name of the the df

# We make an array of the sentiments (already in the right order) and add it to our df
y = np.array(cents.iloc[:, -1])
movie_data.insert(2, "sentiment", y)

phrase_data = movie_data.sort_index()
phrase_data.drop("ID",axis=1,inplace=True)
phrase_data.insert(2,"label",0)

print(phrase_data.head())

       feature  sentiment  label
0            !    0.50000      0
1          ! '    0.52778      0
2         ! ''    0.50000      0
3       ! Alas    0.44444      0
4  ! Brilliant    0.86111      0


In [46]:
### We create a smaller subsample of the dataset - to speed up the computation when working on our code

# specify a smaller number of reviews
small_N = 2390

# choose small_N random and distinct integers between 0 and 239231
rand = random.sample(range(239231), small_N)

# find these indices in the original dictionary - and make a new array of them
rand_sample = phrase_data.iloc[rand]

rand_sample.head()

Unnamed: 0,feature,sentiment,label
36093,"Its compelling mix of trial movie , escape mov...",0.83333,0
142154,it 's a bargain-basement European pickup .,0.31944,0
62476,a ` very sneaky ' butler who excels in the art...,0.55556,0
23264,Charming and funny ( but ultimately silly ) mo...,0.56944,0
73120,acrid test,0.47222,0


In [47]:
phrases = rand_sample

In [48]:
### add correct labels based on sentiment column (uses qualities of numpy for efficiency)

np_phrase = np.array(phrases) # make it a numpy array

# create a series of boolean masks
vpos = (0.8 < np_phrase[:, 1]).astype(int)
pos =  (0.6 < np_phrase[:, 1]).astype(int)
ntrl = (0.4 < np_phrase[:, 1]).astype(int)
neg = (0.2 < np_phrase[:, 1]).astype(int)
vneg = (0 <= np_phrase[:, 1]).astype(int)

# add the masks together to get the correct label numbers for each review based on sentiment value
labels = vneg + neg + ntrl + pos + vpos - 1

# make a binary label class
binary_labels = (0.5 >= np_phrase[:, 1]).astype(int)

# update the array with our new values
np_phrase[:, 2] = labels
# np_phrase[:, 2] = binary_labels

# change back to a pandas
phrases = pd.DataFrame(np_phrase)

In [49]:
print(len(binary_labels[binary_labels == 0])/len(binary_labels[binary_labels == 1]))

0.7586460632818248


In [50]:
import pickle

# Feature importance

filler_words = set(stopwords.words('english'))
lemmatize = WordNetLemmatizer()

#values =[[0, 0.2], [0.2, 0.4], [0.4, 0.6], [0.6, 0.8], [0.8, 1.0]]
labels = ['very negative', 'negative', 'neutral', 'positive', 'very positive']
to_drop = []

phrases_list = list(phrases.iloc[:, 0])

for i in range(len(phrases_list)):
    # 'clean' phrases: remove numbers, punctuation and filler words
    phrase = phrases_list[i]
    phrase = re.sub(r'[^\w]', " ", phrase) #remove all special characters 
    cleaned = re.sub(r'[\d]', " ", phrase)  #remove all numbers

    if (cleaned.replace(" ","")==""):
        to_drop.append(i)
        cleaned=""
    else:
        cleaned = word_tokenize(cleaned.lower()) #tokenise for bag of words
        cleaned = [w for w in cleaned if w not in filler_words] # #remove all filler words
        cleaned = [lemmatize.lemmatize(word) for word in cleaned]
    
    phrases_list[i] = cleaned
    
phrases.iloc[:, 0] = phrases_list

# remove unnecessary data
phrases.drop(1,axis=1,inplace=True) 
phrases.drop(to_drop,axis=0,inplace=True)
df = phrases[~phrases.astype(str).duplicated()]
df.reset_index(inplace=True)
df.drop("index",axis=1,inplace=True)
print(df.head())
print(df.shape)

#save this as file
df.to_pickle("clean_doc.pkl", protocol=4)

                                                   0  2
0  [compelling, mix, trial, movie, escape, movie,...  4
1              [bargain, basement, european, pickup]  1
2  [sneaky, butler, excels, art, impossible, disa...  2
3        [charming, funny, ultimately, silly, movie]  2
4                                      [acrid, test]  2
(2341, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop("index",axis=1,inplace=True)


In [61]:
import pickle
### we are going to train our embedding model on the original review corpus
### in the hope to capture more semantic information
lemmatize = WordNetLemmatizer()
# opening the file in read mode and reading the file
orig_snip = open("original_rt_snippets.txt")
orig_snip = orig_snip.read()
orig_snip = orig_snip.split("\n")

In [62]:
# looping over each review
for i in range(len(orig_snip)):
    
    orig_snip[i] = orig_snip[i].lower() # lower case
    
    orig_snip[i] = re.sub(r'[^\w]', " ", orig_snip[i]) # remove all special characters
    orig_snip[i] = re.sub(r'[\d]', " ", orig_snip[i]) # ...and numbers
    
    orig_snip[i] = nltk.word_tokenize(orig_snip[i]) # make each word an individual string, thus each review is a sublist
    
    orig_snip[i] = [word for word in orig_snip[i] if word not in stopwords.words("english")] # removing stop-words
    orig_snip[i] = [lemmatize.lemmatize(word) for word in orig_snip[i]] # lemmatizing (converting to the grammatical root)

with open("clean_snippets.pkl", "wb") as file:
    pickle.dump(orig_snip, file)
    file.close()

In [None]:
clean_snippets = pd.read_pickle("clean_snippets.pkl").iloc[:-1, :][0]

In [None]:
print(clean_snippets)