## 統計碩二 108354021 柳瑞俞

## Import package

In [1]:
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
nltk.download("movie_reviews")
nltk.download("punkt")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
X, Y = build_dataset()

## Build model and Evaluation

In [2]:
def build_dataset():
  labels = []
  instances = []
  for label in movie_reviews.categories():
    for fileid in movie_reviews.fileids(label):
      instances.append(movie_reviews.raw(fileid))
      if label == 'pos':
        labels.append(1)
      else:
        labels.append(0)
  return instances, labels

In [25]:
from collections import Counter, defaultdict
import math

class NaiveBayesClassifier:
  def __init__(self):
    self.k = 0.01  # Smoothing factor
    self.feature_table = set()
    self.y_counts = Counter()
    self.x_y_counts = defaultdict(lambda: Counter())
    self.num_instances = 0

  def extract_features(self, instance) -> set:
    return set(word_tokenize(instance))

  def train(self, instances, labels):
    for instance, label in zip(instances, labels):
      self.y_counts[label] += 1
      for word in self.extract_features(instance):
        self.x_y_counts[word][label] += 1
        self.feature_table.add(word)
    self.num_instances = len(instances)
    print("Number of features: %d" % len(self.feature_table))

  def smooth_prob(self, word, label):
    return (self.x_y_counts[word][label] + self.k) / (self.y_counts[label] + 2 * self.k)

  def predict(self, instance):
    y_probs = Counter()
    features = self.extract_features(instance)
    for y in self.y_counts:
      y_probs[y] = math.log(self.y_counts[y] / self.num_instances)
      for word in self.feature_table:
        prob = self.smooth_prob(word, y)
        if word in features:
          y_probs[y] += math.log(prob)          
        else:
          y_probs[y] += math.log(1.0 - prob)
    return y_probs.most_common(1)[0][0]

In [4]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

def cross_validation(instances, labels, k, train_pred_func):
  golden_labels = []
  pred_labels = []
  for fold in range(k):
    training_instances = []
    training_labels = []
    test_instances = []
    test_labels = []
    for i in range(len(instances)):
      if i % k == fold:
        test_instances.append(instances[i])
        test_labels.append(labels[i])
      else:
        training_instances.append(instances[i])
        training_labels.append(labels[i])
    pred_labels += train_pred_func(training_instances, training_labels, test_instances)
    golden_labels += test_labels
    #print(pred_labels)
    #print(golden_labels)
  print("Accuracy: %.4f\nPrecision: %.4f\nRecall: %.4f\nF-score: %.4f" % (
      accuracy_score(golden_labels, pred_labels), 
      precision_score(golden_labels, pred_labels), 
      recall_score(golden_labels, pred_labels), 
      f1_score(golden_labels, pred_labels)))
  print(confusion_matrix(golden_labels, pred_labels))

## Features

In [6]:
nltk.download('averaged_perceptron_tagger')
print(nltk.pos_tag(word_tokenize("My black dog ate my cake yesterday.")))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[('My', 'PRP$'), ('black', 'JJ'), ('dog', 'NN'), ('ate', 'VB'), ('my', 'PRP$'), ('cake', 'NN'), ('yesterday', 'NN'), ('.', '.')]


In [7]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize('opened',pos='v'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
open


In [8]:
instance = "This is a good movie to see"
words = word_tokenize(instance)
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopword_list = stopwords.words('english')
def remove_stopwords(tokens):
 tokens_clean = []
 for tok in tokens:
  if tok not in stopword_list:
   tokens_clean.append(tok)
 return tokens_clean
cc = remove_stopwords(words)
cc

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['This', 'good', 'movie', 'see']

In [9]:
import string
print(string.punctuation) 
def remove_punctuation_marks(tokens):
 clean_tokens = []
 for tok in tokens:
  if tok not in string.punctuation:
    clean_tokens.append(tok)
 return clean_tokens

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [10]:
from nltk.stem.snowball import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")
print(snowball_stemmer.stem('opened'))


open


## Final Model

In [28]:
from itertools import combinations

class MyNBC3(NaiveBayesClassifier):
  '''Define your own feature extraction function'''
  def extract_features(self, instance) -> set:
    words = word_tokenize(instance)
    #words = remove_stopwords(words) 
    words = remove_punctuation_marks(words)
    tags = nltk.pos_tag(words)
    features = set()
    bigram_list = set()
    speech_dict = ['J', 'V', 'N', 'R']
    bigram_speech_list = list(map(list,combinations(speech_dict,2)))
    q,p = [],[]

    for bigram in bigram_speech_list:
      locals()[f"{bigram[0]}_{bigram[1]}_word"] = []

    for w, t in tags:
      # Focus on only adjectives (J), adverbs (R), verbs (V), and nouns (N).
      if t[0] in {'J', 'R', 'V', 'N'}:    
        features.add(w)
      for bigram_speech in bigram_speech_list:
        if t[0] in bigram_speech:
          locals()[f"{bigram_speech[0]}_{bigram_speech[1]}_word"].append(w)

      q.append(t)
      p.append(w)
      for i in range(len(p) - 1):
        if (q[i][0],q[i+1][0]) == ('J', 'N'):
          features.add("%s_%s" % (p[i],p[i+1]))
        elif (q[i][0],q[i+1][0]) == ('R', 'V'):
          features.add("%s_%s" % (p[i],p[i+1]))
        elif (q[i][0],q[i+1][0]) == ('V', 'R'):
          features.add("%s_%s" % (p[i],p[i+1]))
        elif (q[i][0],q[i+1][0]) == ('V', 'N'):
          features.add("%s_%s" % (p[i],p[i+1]))
        elif (q[i][0],q[i+1][0]) == ('V', 'I'):
          features.add("%s_%s" % (p[i],p[i+1]))
        elif (q[i][0],q[i+1][0]) == ('J', 'R'):
          features.add("%s_%s" % (p[i],p[i+1]))


    for bigram_speech in bigram_speech_list:
      bigram_list = locals()[f"{bigram_speech[0]}_{bigram_speech[1]}_word"]

      for k in range(len(bigram_list)-1):
        features.add("%s_%s" % (bigram_list[k], bigram_list[k+1]))
        features.add("%s_%s" % (bigram_list[k+1], bigram_list[k]))

    for i in range(len(words) - 1):
      features.add("bigram_%s_%s" % (words[i], words[i+1]))

    for i in range(len(words) - 2):
      features.add("trigram_%s_%s_%s" % (words[i], words[i+1], words[i+2]))
    #for i in range(len(words) - 3):
     # features.add("fourgram_%s_%s_%s_%s" % (words[i], words[i+1], words[i+2], words[i+3]))
    #for i in range(len(words) - 4):
     # features.add("fifgram_%s_%s_%s_%s_%s" % (words[i], words[i+1], words[i+2], words[i+3], words[i+4]))

    return features

  def train(self, instances, labels):
    for instance, label in zip(instances, labels):
      self.y_counts[label] += 1
      for f in self.extract_features(instance):
        self.x_y_counts[f][label] += 1
        self.feature_table.add(f)
    self.num_instances = len(instances)

    # Reduce less frequent features with a threshold of 5 occurrences.

    for f in self.x_y_counts:
      if sum(self.x_y_counts[f].values()) < 3:
        self.feature_table.remove(f)
    print("Number of features: %d" % len(self.feature_table))

clf = MyNBC3()
print(clf.extract_features("This is my first movie review. I love this movie so much!"))

{'movie_so', 'love_review', 'movie_first', 'trigram_first_movie_review', 'first_love', 'bigram_so_much', 'review_love', 'bigram_movie_so', 'trigram_my_first_movie', 'love', 'so', 'bigram_first_movie', 'bigram_This_is', 'trigram_movie_so_much', 'bigram_my_first', 'bigram_is_my', 'bigram_review_I', 'first_movie', 'review_movie', 'trigram_movie_review_I', 'is_first', 'love_so', 'much_love', 'love_movie', 'much', 'movie_love', 'so_love', 'is_movie', 'trigram_this_movie_so', 'bigram_movie_review', 'love_is', 'trigram_love_this_movie', 'much_so', 'so_movie', 'first_is', 'love_much', 'trigram_This_is_my', 'trigram_I_love_this', 'so_first', 'review', 'so_much', 'movie', 'movie_review', 'first', 'bigram_love_this', 'first_so', 'bigram_this_movie', 'trigram_is_my_first', 'is_love', 'bigram_I_love', 'love_first', 'movie_much', 'trigram_review_I_love', 'much_movie', 'is', 'movie_is'}


In [29]:
def train_then_predict_mynbc(training_instances, training_labels, test_instances):
  clf3 = MyNBC3()
  clf3.train(training_instances, training_labels)
  pred = []
  for test_instance in test_instances:
    pred.append(clf3.predict(test_instance))
  return pred

cross_validation(X, Y, 5, train_then_predict_mynbc)

Number of features: 220695
Number of features: 215789
Number of features: 219053
Number of features: 218186
Number of features: 220793
Accuracy: 0.8575
Precision: 0.8607
Recall: 0.8530
F-score: 0.8569
[[862 138]
 [147 853]]
