<center style="color:maroon"> <h1> Part II NLP Coursework 1 </h1> </center>
<center> <h3> Sentiment classifier for movie reviews from the BBC archive of film reviews </h3> </center>


GOAL: develop a sentiment classifier for movie reviews trained on a DIY corpus built from the BBC archive of film reviews

In [28]:
# IMPORT LIBRARIES
import nltk
from urllib import request
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import random
from nltk.classify import scikitlearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from sklearn.naive_bayes import BernoulliNB
from tqdm import tqdm_notebook as tqdm
from nltk import word_tokenize
import operator
from functools import reduce

<font color='maroon'> <h2> 1. Obtaining the Corpus </font> 

The **goal** of this section is the one of **obtaining the corpus**, composed by the list of reviews  for each movie and the grade, which goes from 1 to 5. 

In [29]:
# [ Movies are ordered alphabetically + a page for movies which starts with a digit ]
# OBTAIN A LIST WITH LINKS (link_az) . Each link is the page containing movies for the specific initial letter.

# 1. Get list of letters and '0-9'
a_z = [chr(i) for i in range(ord('a'),ord('z')+1)] # stackoverflow.com/questions/16060899/alphabet-range-python
a_z.append('0-9')

# 2. Get the list of link_az
link_az = []
for x in a_z:
    link =  'http://www.bbc.co.uk/films/gateways/az/review/cinema/'+str(x)+ '.shtml'
    link_az.append(link)

In [30]:
# The page for each link leads to the movie review, through a new link.
# So, I Obtain a new list (all_links) with all links to be used for scraping the reviews

all_links = []
for link in link_az:
    url = link
    html = request.urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(html,'lxml')
    for i in soup.findAll('a'):          # Would be better to narrow the analysis in the div class = 'content' portion
        all_links.append(i.get('href')) 

In [31]:
# Overcome the issue of having not only links of the review, but every link found in the page

all_links = [link for link in all_links if link is not None ]    # avoid the None value
all_links = [link for link in all_links if link.startswith('/films/')]   # all useful link start with this string
all_links = [link for link in all_links if link[7].isdigit()]   # only the useful link have this characteristics

In [32]:
# Define function to get the review, in the later scraping 
def get_review(soup):
    review = []
    for p in soup.findAll("div", {"class": "content"}):
        review.append(p.text)
        return(review)

In [33]:
# Define function to get the score for each review, when scraping
def get_score(soup):
    score = []
    for td in soup.find_all('td', 'film-ratings'):
        for img in td.find_all('img', alt=True):
            score.append((img['alt'][0]))
            return(score)

In [34]:
# Scrape the review and their label 

links = all_links[:] # Prepare this line in order to eventually modify the size later
reviews = []
scores = []
for link in tqdm(links):
    try:
        url = 'http://www.bbc.co.uk' + str(link)
        html = request.urlopen(url).read().decode('utf8')
        soup = BeautifulSoup(html,'lxml')
        r = get_review(soup)
        s = get_score(soup)
        reviews.append(r)
        scores.append(s)
    except:
        pass

HBox(children=(IntProgress(value=0, max=3591), HTML(value='')))

In [35]:
# Turn the collection of reviews and scores into a dataframe

df = pd.DataFrame({'rev' : reviews ,
                  'sc' : scores},
                  columns=['rev','sc'])

In [36]:
# eliminate None/Nan values
df = df.replace(to_replace=[None], value = np.nan)
df = df.dropna(axis=0, how = 'any')

In [37]:
# I have few issues:
# 1. Need to convert each review in a string
# 2. Need to remove the escape sequence

review = []          
for i in df.rev:
    a = re.sub(r'<.*?>' ,'', i[0])
    a = a.split('\n')
    a = ''.join(a).strip()
    review.append(a)
    
df['review'] = review
df = df.drop(['rev'], axis=1)



In [38]:
# Trasform the Score variable to 'int' variables

score = []
for x in df.sc:
    b = x[0]
    score.append(b)
    
df['score'] = score
df = df.drop(['sc'], axis = 1)
df['score'] = df.score.astype('int')

In [39]:
# Create the label sentiment with pos and neg, and label = 0 for values = 3.
# In this way, I can eventually drop this label if the output pos and neg are unbalanced.

sentiment = []
for x in df.score:
    if x > 3:
        a = 'pos'
        sentiment.append(a)
    elif x < 3:
        a = 'neg'
        sentiment.append(a)
    else:
        a = 0 # this is going to be dropped to balance the output
        sentiment.append(a)
        
df['sentiment'] = sentiment

In [43]:
# Verify the balance between pos and neg
df.sentiment.value_counts()

pos    1168
neg     965
Name: sentiment, dtype: int64

In [41]:
# Balance the dataset 
df = df[df.sentiment != 0]

In [42]:
# Save the dataset in order to avoid re-running all the code in case of failure of my kernel
df.to_csv('reviews.csv')

---

<font color='maroon'> <h2> 2. Prepare the Corpus for the Sentiment Analysis </font> 

In [44]:
# Load the dataset obtained 
df = pd.read_csv('reviews.csv')
df = df.drop(['Unnamed: 0'], axis = 1)

In [45]:
# Obtain a list of tuples containing the words per review and the score 
documents = [(word_tokenize(review), cat) for review, cat in zip(df.review,df.sentiment)]
random.shuffle(documents)
len(documents)

2133

In [46]:
# Obtain a list containing all the tokenized reviews
rev_words = []
for r in df.review:
    res = word_tokenize(r)
    rev_words.append(res)

In [47]:
# Unnest the nested list ( http://book.pythontips.com/en/latest/map_filter.html )
rev_words = reduce(operator.add, rev_words )

In [48]:
# Frequency distribution of all the words included in the rev_words list 
all_words = nltk.FreqDist(w.lower() for w in rev_words)
all_words

FreqDist({'the': 32940, ',': 32797, '.': 19657, 'a': 19478, 'of': 17566, 'and': 16014, 'to': 13758, "'s": 11772, 'in': 9993, 'is': 9849, ...})

In [49]:
# Creating a list of the x most common words included in the all_words list
# In this case, x = 500
word_features = [w for (w,ct) in all_words.most_common(500)]

In [50]:
def document_features(document): 
    document_words = set(document) 
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [51]:
featuresets = [(document_features(d), c) for (d,c) in documents]
len(featuresets)

2133

In [52]:
# Split Train and Test
train_set, test_set = featuresets[500:], featuresets[:500]

---

<font color='maroon'> <h2> 3. Apply ML classifier  </font> 
    
The following section aims to lead the analysis by applying different classifiers:

- Naive Bayes
- Decision Tree
- Multinomial Naive Bayes
- Bernoulli Naive Bayes
- Logistic Regression 

In [53]:
# Naive Bayes
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [54]:
# Decision Tree
DT_classifier = nltk.DecisionTreeClassifier.train(train_set)

In [55]:
# Multinomial NB
classifierMNB = SklearnClassifier(MultinomialNB())
classifierMNB.train(train_set)

<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>

In [56]:
# Bernoulli NB
classifierBNB=SklearnClassifier(BernoulliNB())
classifierBNB.train(train_set)

<SklearnClassifier(BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))>

In [57]:
# Logistic Regression 
classifierLR = SklearnClassifier(LogisticRegression())
classifierLR.train(train_set)



<SklearnClassifier(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))>

In [58]:
print('Decision Tree:', nltk.classify.accuracy(DT_classifier, test_set))
print('Multinomial NB:',nltk.classify.accuracy( classifierMNB,test_set))
print('Bernoulli NB:',nltk.classify.accuracy(classifierBNB, test_set))
print('Logistic Regression:', nltk.classify.accuracy(classifierLR,test_set))
print('Naive Bayes:', nltk.classify.accuracy(classifier, test_set))

Decision Tree: 0.626
Multinomial NB: 0.78
Bernoulli NB: 0.768
Logistic Regression: 0.758
Naive Bayes: 0.77


In [59]:
# Check most informative words for Naive Bayes
print(classifier.show_most_informative_features(5))

Most Informative Features
 contains(unfortunately) = True              neg : pos    =      6.6 : 1.0
        contains(moving) = True              pos : neg    =      6.6 : 1.0
         contains(sadly) = True              neg : pos    =      4.7 : 1.0
   contains(documentary) = True              pos : neg    =      3.2 : 1.0
        contains(review) = True              pos : neg    =      3.1 : 1.0
None


<font color='maroon'> <h2> 4. Attempts to improve accuracy  </font> 
    
In the following section I will perform different attempts in order to improve the accuracy.

**NOTE:** Regression Tree will not be tested anymore, due to excessive sloweness of the evaluation of the code. Moreover, its accuracy seems to be the weakest so far.

### a- Remove Punctuation 

In [396]:
# Obtain a list containing all the tokenized reviews

rev_words = []
for r in df.review:
    res = re.findall(r'\w+', r)
    rev_words.append(res)

# Unnest the nested list ( http://book.pythontips.com/en/latest/map_filter.html )
rev_words = reduce(operator.add, rev_words )

In [397]:
# Frequency distribution of all the words included in the rev_words list 
all_words = nltk.FreqDist(w.lower() for w in rev_words)
all_words

FreqDist({'the': 33737, 'a': 19644, 'of': 17666, 'and': 16153, 'to': 13863, 's': 11978, 'in': 10522, 'is': 9611, 'with': 6229, 'it': 5848, ...})

In [398]:
# Creating a list of the x most common words included in the all_words list
# In this case, x = 3000
word_features = [w for (w,ct) in all_words.most_common(3000)]

In [399]:
# Obtain Feature sets
featuresets = [(document_features(d), c) for (d,c) in documents]
# Split Train and Test
train_set, test_set = featuresets[500:], featuresets[:500]

In [400]:
# Create a function to test all the model quicker

def tryall_models():
    classifier = nltk.NaiveBayesClassifier.train(train_set) # Naive Bayes
    classifierMNB = SklearnClassifier(MultinomialNB()) # Multinomial NB
    classifierMNB.train(train_set)
    classifierBNB=SklearnClassifier(BernoulliNB()) # Bernoulli NB
    classifierBNB.train(train_set)
    classifierLR = SklearnClassifier(LogisticRegression()) #Logistic Regression 
    classifierLR.train(train_set)
    
    print('Multinomial NB:',nltk.classify.accuracy( classifierMNB,test_set))
    print('Bernoulli NB:',nltk.classify.accuracy(classifierBNB, test_set))
    print('Logistic Regression:', nltk.classify.accuracy(classifierLR,test_set))
    print('Naive Bayes:', nltk.classify.accuracy(classifier, test_set))

In [401]:
tryall_models()



Multinomial NB: 0.844
Bernoulli NB: 0.842
Logistic Regression: 0.836
Naive Bayes: 0.844


### b- Vary the size of the Feature set

In [158]:
# Define a function with number of feature sets as parameter

def attempts(numb):
    word_features = [w for (w,ct) in all_words.most_common(numb)]
    
    def document_features(document): 
        document_words = set(document) 
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = (word in document_words)
        return features
    
    featuresets = [(document_features(d), c) for (d,c) in documents]
    global train_set,test_set
    train_set, test_set = featuresets[500:], featuresets[:500]

In [407]:
for x in tqdm(range(1000,8000, 1000)):
    attempts(x)
    print('Attempt with',x,'Common words:')
    tryall_models()
    print('\n\n')

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

Attempt with 1000 Common words:




Multinomial NB: 0.786
Bernoulli NB: 0.798
Logistic Regression: 0.764
Naive Bayes: 0.798



Attempt with 2000 Common words:




Multinomial NB: 0.82
Bernoulli NB: 0.82
Logistic Regression: 0.828
Naive Bayes: 0.818



Attempt with 3000 Common words:




Multinomial NB: 0.844
Bernoulli NB: 0.842
Logistic Regression: 0.836
Naive Bayes: 0.844



Attempt with 4000 Common words:




Multinomial NB: 0.848
Bernoulli NB: 0.848
Logistic Regression: 0.842
Naive Bayes: 0.854



Attempt with 5000 Common words:




Multinomial NB: 0.848
Bernoulli NB: 0.854
Logistic Regression: 0.834
Naive Bayes: 0.848



Attempt with 6000 Common words:




Multinomial NB: 0.852
Bernoulli NB: 0.856
Logistic Regression: 0.842
Naive Bayes: 0.854



Attempt with 7000 Common words:




Multinomial NB: 0.852
Bernoulli NB: 0.866
Logistic Regression: 0.842
Naive Bayes: 0.854





---

### c. Eliminate stopwords from the set 

In [61]:
# With stopwords the feature size will change.
# Define a function to attempt the different sizes of the feature sets out of the stopwords

def attempts_stopwords(numb):
    stop = stopwords.words('english')
    word_features = [w for (w,ct) in all_words.most_common(numb)]
    for word in word_features:
        if word in stop:
            word_features.remove(word)
    
    
    def document_features(document): 
        document_words = set(document) 
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = (word in document_words)
        return features
    
    featuresets = [(document_features(d), c) for (d,c) in documents]
    global train_set,test_set
    train_set, test_set = featuresets[500:], featuresets[:500]

In [413]:
for x in tqdm(range(2000,8000, 1000)):
    attempts_stopwords(x)
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('accuracy with',x,'most common words = ', nltk.classify.accuracy(classifier, test_set))   

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

accuracy with 2000 most common words =  0.816
accuracy with 3000 most common words =  0.84
accuracy with 4000 most common words =  0.852
accuracy with 5000 most common words =  0.842
accuracy with 6000 most common words =  0.848


In [62]:
for x in tqdm(range(4000,12000, 1000)):
    attempts_stopwords(x)
    classifierBNB=SklearnClassifier(BernoulliNB())
    classifierBNB.train(train_set)
    print('accuracy with',x,'most common words = ', nltk.classify.accuracy(classifierBNB, test_set))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

accuracy with 4000 most common words =  0.856
accuracy with 5000 most common words =  0.84
accuracy with 6000 most common words =  0.846
accuracy with 7000 most common words =  0.852
accuracy with 8000 most common words =  0.862
accuracy with 9000 most common words =  0.86
accuracy with 10000 most common words =  0.854
accuracy with 11000 most common words =  0.85


### d. Modify the size of the words

In [454]:
# Words lower than 7
word = [w for (w,ct) in all_words.most_common(7000)]
word_features = []
for word in word:
    if len(word) < 7:
        word_features.append(word)

In [455]:
featuresets = [(document_features(d), c) for (d,c) in documents]
global train_set,test_set
train_set, test_set = featuresets[500:], featuresets[:500]

In [456]:
classifierBNB=SklearnClassifier(BernoulliNB())
classifierBNB.train(train_set)
print('Bernoulli NB:',nltk.classify.accuracy(classifierBNB, test_set))

Bernoulli NB: 0.82


In [67]:
# Higher than 4
word = [w for (w,ct) in all_words.most_common(7000)]
word_features = []
for word in word:
    if len(word) > 3:
        word_features.append(word)

In [68]:
featuresets = [(document_features(d), c) for (d,c) in documents]
global train_set,test_set
train_set, test_set = featuresets[500:], featuresets[:500]

In [69]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.856

In [70]:
print(classifier.show_most_informative_features(5))

Most Informative Features
       contains(unfunny) = True              neg : pos    =     22.1 : 1.0
       contains(creates) = True              pos : neg    =     17.9 : 1.0
      contains(physical) = True              pos : neg    =     17.3 : 1.0
     contains(affecting) = True              pos : neg    =     13.1 : 1.0
  contains(unconvincing) = True              neg : pos    =     12.9 : 1.0
None


In [475]:
classifierBNB=SklearnClassifier(BernoulliNB())
classifierBNB.train(train_set)
print('Bernoulli NB:',nltk.classify.accuracy(classifierBNB, test_set))

Bernoulli NB: 0.856


---