**Team Brooklyn:**

*   Jiayi Bao, jb2578
*   Siqi Dai, sd854
*   Guanyunbo Yang, gy92

# Opinion Spam Classification using Language Model

In [0]:
from collections import Counter
import pandas as pd
import math

### <font color='navy'> Mounting Google Drive and Retrieving the Dataset </font>

In [0]:
# Mount Google Drive under /content/gdrive so the dataset files can be opened below.
from google.colab import drive
drive.mount("/content/gdrive")
# Dataset root. Later cells append "train/...", "validation/..." and "test/..." to it
# by plain string concatenation, so the trailing slash is required.
path = "/content/gdrive/My Drive/P1/DATASET/"  # path that stores the dataset folder

### <font color='navy'> Data Preprocessing </font>

In [0]:
# preprocessing the corpus
def preprocess(fn):
  """Read a review file and return it as a single normalised corpus string.

  Every line of the file is treated as one review. Each review is prefixed
  with the start marker ``<s>``, the reviews are joined with single spaces,
  punctuation is deleted, the text is lower-cased, and a fixed list of stop
  words (plus '/') is replaced by a space.

  fn: path of the text file, one review per line
  returns: the whole file as one whitespace-separated string
  """
  with open(fn) as f:
    content = f.readlines()
  content = ["<s> " + x.strip() for x in content]  # remove '\n' at end and add <s> to indicate the start of a review
  # Join with a space. Joining with '' glued the last word of every review to
  # the next review's marker ("great<s>"), so neither token was counted correctly.
  content = " ".join(content)
  for char in ['-', '.', ',', ';', '!', '?', ':', '(', ')', '[', ']', '{', '}',]:  # remove punctuations
    content = content.replace(char,'')
  corpus = content.lower()  # normalization
  # Stop words are matched with their surrounding spaces, so only whole words
  # in the middle of the text are removed.
  for char in [' a ', ' an ', ' the ', ' that ', ' this ', ' these ', ' those ', ' and ', ' but ', ' or ', ' at ', ' in ', ' on ', ' be ', ' is ', ' are ', ' am ', ' were ', ' was ', ' to ', ' with ', ' of ', '/']:  # remove stop words
    corpus = corpus.replace(char, ' ')
  return corpus


# build the cleaned training corpus of each class (truthful first, then deceptive)
tru_train = preprocess(path + "train/truthful.txt")
decp_train = preprocess(path + "train/deceptive.txt")

### <font color='navy'> Unsmoothed Unigram and Bigram </font>

In [0]:
# build a unigram table over the corpus
def unigram_table(content):
  """Count every word of a corpus and return the counts as a DataFrame.

  content: whitespace-separated corpus string
  returns: DataFrame with columns
    word      - the token
    count     - number of occurrences in the corpus
    frequency - count divided by the total number of tokens
  Rows are sorted by descending count. An empty corpus gives an empty frame
  with the same three columns.
  """
  word_list = Counter(content.split())  # count every single word
  if not word_list:
    return pd.DataFrame(columns=["word", "count", "frequency"])
  word_df = pd.DataFrame({"word": list(word_list.keys()), "count": list(word_list.values())})
  # Relative frequency: divide by the number of tokens. The previous divisor was
  # the number of distinct words, which produced values that do not sum to 1.
  word_df["frequency"] = word_df["count"] / word_df["count"].sum()
  # mergesort is stable, so words with equal counts keep first-seen order and the
  # "k most common" prefix of this table is reproducible between runs
  word_df = word_df.sort_values(by='count', ascending=False, kind='mergesort')
  return word_df


# build a bigram table over the corpus
def bigram_table(content):
  """Count every pair of adjacent words and return the counts as a DataFrame.

  content: whitespace-separated corpus string
  returns: DataFrame with columns
    word      - the pair as a tuple (previous_word, word)
    count     - number of occurrences of the pair
    frequency - count divided by the total number of counted pairs
  Rows are sorted by descending count. A corpus with no countable pair gives
  an empty frame with the same three columns.
  """
  tokens = content.split()  # tokenise once and reuse for both sides of the pair
  # <s> can only be the start of a review, so pairs ('xxx', '<s>') that straddle
  # two reviews are not counted
  word_list2 = Counter(pair for pair in zip(tokens, tokens[1:]) if pair[1] != '<s>')
  if not word_list2:
    return pd.DataFrame(columns=["word", "count", "frequency"])
  word_df2 = pd.DataFrame({"word": list(word_list2.keys()), "count": list(word_list2.values())})
  # Relative frequency: divide by the number of counted pairs, not by the
  # number of distinct pairs.
  word_df2["frequency"] = word_df2["count"] / word_df2["count"].sum()
  word_df2 = word_df2.sort_values(by='count', ascending=False, kind='mergesort')
  return word_df2


# compute P(word) (unsmoothed)
def unigram(corpus, table, word):
  """Unsmoothed unigram probability P(word) = count(word) / number of tokens.

  corpus: the training corpus, either the whitespace-separated string or an
          already tokenised sequence
  table:  unigram table with 'word' and 'count' columns
  word:   the word to score
  returns: the probability, or 0 when the word is not in the table or the
           corpus is empty
  """
  # N must be the number of tokens. len() of the corpus string counts
  # characters, which made every probability too small.
  tokens = corpus.split() if isinstance(corpus, str) else corpus
  N = len(tokens)
  matches = table.loc[table['word'] == word, 'count']
  if N == 0 or len(matches) == 0:
    return 0
  return matches.iloc[0] / N

  
# compute P(word1|word2) (unsmoothed)
def bigram(uni_table, bi_table, word1, word2):
  """Unsmoothed bigram probability of word1 given that word2 precedes it.

  uni_table: unigram table with 'word' and 'count' columns
  bi_table:  bigram table whose 'word' column holds (previous, current) tuples
  word1:     the word being predicted
  word2:     the preceding word (the context)
  returns: count(word2 word1) / count(word2), or 0 for a pair not in bi_table
  """
  pair = (word2, word1)
  if pair not in list(bi_table.word.values):
    return 0
  pair_counts = bi_table.loc[bi_table['word'] == pair, 'count']
  context_counts = uni_table.loc[uni_table['word'] == word2, 'count']
  return pair_counts.iloc[0] / context_counts.iloc[0]

In [0]:
# unigram count tables of the raw (unsmoothed) training corpora
tru_train_uni = unigram_table(tru_train)
decp_train_uni = unigram_table(decp_train)
# bigram count tables of the same corpora
tru_train_bi = bigram_table(tru_train)
decp_train_bi = bigram_table(decp_train)

### <font color='navy'> Smoothing and Unknown Word Handling </font>

In [0]:
# method 1 - handle unknown words by replacing the first occurrence of each word type by <unk>
def handle_unk1(corpus):
  """Replace the first occurrence of every word type by '<unk>'.

  The review start marker '<s>' is never replaced. Later occurrences of a
  word are kept unchanged.

  corpus: whitespace-separated corpus string
  returns: the corpus with first occurrences replaced, joined by single spaces
  """
  seen = set()  # a real set: O(1) membership, and no shadowing of the builtin `set`
  tokens = corpus.split()
  for i, token in enumerate(tokens):
    if token == '<s>':
      continue
    if token not in seen:
      seen.add(token)
      tokens[i] = "<unk>"
  return ' '.join(tokens)


# method 2 - handle unknown words: decide k most common terms in advance on the vocab and replace others as '<unk>'
def handle_unk2(corpus, uni_table, k):
  """Keep only the k most common terms; replace every other word by '<unk>'.

  corpus:    whitespace-separated corpus string
  uni_table: unigram table whose rows are already sorted by descending count;
             its first k 'word' entries form the kept vocabulary
  k:         number of table rows to keep
  returns: the corpus with all other words replaced, joined by single spaces

  The start marker '<s>' is always kept. Note that if '<s>' is among the first
  k rows of the table it occupies one of the k slots.
  """
  # a set gives O(1) lookups instead of scanning a k-element list for every token
  kMostFreq = set(list(uni_table.word.values)[:k])
  tokens = [
      token if (token == '<s>' or token in kMostFreq) else "<unk>"
      for token in corpus.split()
  ]
  return ' '.join(tokens)


# compute P(word) (smoothed)
def unigram_smooth(corpus, table, word, addK):
  """Add-k smoothed unigram probability.

  P(word) = (count(word) + addK) / (N + addK * V)

  corpus: the training corpus (whitespace-separated string, or a token
          sequence); only its token count N is used
  table:  unigram table with 'word' and 'count' columns; V is its row count
  word:   the word to score
  addK:   smoothing constant k (0 disables smoothing)
  returns: the probability; 0 when the denominator is 0

  A word missing from the table is scored as '<unk>' when the table has an
  '<unk>' row (i.e. it was built from a corpus processed by handle_unk1/2).
  Without such a row the word is scored with count 0.
  """
  # N is the number of tokens; len() of the corpus string would count characters
  tokens = corpus.split() if isinstance(corpus, str) else corpus
  N = len(tokens)
  V = len(table)
  vocabulary = set(table['word'])
  if word not in vocabulary and '<unk>' in vocabulary:
    word = '<unk>'
  matches = table.loc[table['word'] == word, 'count']
  count = matches.iloc[0] if len(matches) else 0
  denominator = N + addK * V
  if denominator == 0:
    return 0
  # the same formula covers seen and unseen words (count is 0 for unseen ones),
  # so the smoothing constant is honoured in both cases
  return (count + addK) / denominator
  

# compute P(word1|word2) (smoothed)
def bigram_smooth(uni_table, bi_table, word1, word2, addK):
  """Add-k smoothed bigram probability of word1 given that word2 precedes it.

  P(word1 | word2) = (count(word2 word1) + addK) / (count(word2) + addK * V)

  uni_table: unigram table with 'word' and 'count' columns; V is its row count
  bi_table:  bigram table whose 'word' column holds (previous, current) tuples
  word1:     the word being predicted
  word2:     the preceding word (the context)
  addK:      smoothing constant k (0 disables smoothing)
  returns: the probability; 0 when the denominator is 0

  Words missing from uni_table are scored as '<unk>' when the table has an
  '<unk>' row. Without such a row they contribute a count of 0.
  """
  V = len(uni_table)
  vocabulary = set(uni_table['word'])
  if '<unk>' in vocabulary:
    if word1 not in vocabulary:
      word1 = '<unk>'
    if word2 not in vocabulary:
      word2 = '<unk>'
  context_counts = uni_table.loc[uni_table['word'] == word2, 'count']
  count_w2 = context_counts.iloc[0] if len(context_counts) else 0  # unseen context: 0, not 1
  pair_counts = bi_table.loc[bi_table['word'] == (word2, word1), 'count']
  count_w1w2 = pair_counts.iloc[0] if len(pair_counts) else 0
  denominator = count_w2 + addK * V
  if denominator == 0:
    return 0
  # One formula for seen and unseen pairs. The unseen case used to return
  # addK / (addK * V), which ignored how often the context word occurs.
  return (count_w1w2 + addK) / denominator

In [0]:
# model hyper-parameters
K = 500   # vocabulary size: only the K most common terms are kept
addK = 1  # constant k of add-k smoothing

# map rare words of each training corpus to '<unk>' (method 2: keep the K most common terms)
# alternative, currently disabled (method 1: first occurrence of each word becomes '<unk>'):
# tru_train_smooth = handle_unk1(tru_train)
# decp_train_smooth = handle_unk1(decp_train)
tru_train_smooth = handle_unk2(tru_train, tru_train_uni, K)
decp_train_smooth = handle_unk2(decp_train, decp_train_uni, K)

# rebuild the count tables from the corpora that now contain '<unk>'
tru_train_uni_smooth = unigram_table(tru_train_smooth)
decp_train_uni_smooth = unigram_table(decp_train_smooth)
tru_train_bi_smooth = bigram_table(tru_train_smooth)
decp_train_bi_smooth = bigram_table(decp_train_smooth)

### <font color='navy'> Perplexity </font>

In [0]:
def plexity_unigram(train_corpus, val_corpus, uni_table, addK):
  """
  Perplexity of a text under the smoothed unigram model:
  exp( (1/N) * sum_i -ln P(w_i) ).

  train_corpus: train corpus (passed through to unigram_smooth)
  val_corpus: validation corpus after preprocessing (whitespace-separated string)
  uni_table: unigram table
  addK: add-k smoothing constant
  returns: the perplexity; 1.0 for a text without tokens
  """
  tokens = val_corpus.split()  # tokenise once instead of once per scored word
  N = len(tokens)
  if N == 0:
    return 1.0  # nothing to score; also avoids dividing by zero
  s = 0
  for token in tokens:
    P_wi = unigram_smooth(train_corpus, uni_table, token, addK)
    # Natural logarithm, so that it is the inverse of math.exp below (mixing
    # log10 with exp does not yield a perplexity). A zero probability adds a
    # fixed penalty of 100 instead of an infinite term.
    s += 100 if P_wi == 0 else -math.log(P_wi)
  ans = math.exp(s / N)
  return ans


def plexity_bigram(train_corpus, val_corpus, uni_table, bi_table, addK):
  """
  Perplexity of a text under the smoothed bigram model:
  exp( (1/M) * sum_i -ln P(w_i | w_{i-1}) ), where M is the number of scored words.

  train_corpus: train corpus (unused; kept so the signature matches plexity_unigram)
  val_corpus: validation corpus after preprocessing (whitespace-separated string)
  uni_table: unigram table
  bi_table: bigram table
  addK: add-k smoothing constant
  returns: the perplexity; 1.0 when no word can be scored

  The first token has no predecessor and is not scored. The start marker '<s>'
  is never predicted, only used as context.
  """
  tokens = val_corpus.split()  # tokenise once instead of twice per scored word
  s = 0
  scored = 0
  for prev, cur in zip(tokens, tokens[1:]):
    # Compare the token. Indexing the corpus string yields a single character,
    # which can never equal '<s>', so the skip was dead code before.
    if cur == '<s>':
      continue
    P_wi = bigram_smooth(uni_table, bi_table, cur, prev, addK)  # P(wi|wi-1)
    # natural logarithm to match math.exp below; zero probability adds a fixed penalty of 100
    s += 100 if P_wi == 0 else -math.log(P_wi)
    scored += 1
  if scored == 0:
    return 1.0
  # average over the words that were actually scored
  ans = math.exp(s / scored)
  return ans

### <font color='navy'> LM-based Classification </font>

In [0]:
# Standard way: measure the perplexity of a given review under the "truthful" and the "deceptive" language models,
# then return the class (truthful, deceptive) whose model produces the lower perplexity score
import csv

def preprocess_test_corpus(fn):
  """Read a review file and return one cleaned string per review.

  Each line is stripped, prefixed with the start marker '<s>', stripped of
  punctuation, lower-cased, and has a fixed list of stop words (plus '/')
  replaced by a space.

  fn: path of the text file, one review per line
  returns: list of cleaned review strings, in file order
  """
  punctuation = ['-', '.', ',', ';', '!', '?', ':', '(', ')', '[', ']', '{', '}',]
  stop_words = [' a ', ' an ', ' the ', ' that ', ' this ', ' these ', ' those ', ' and ', ' but ', ' or ', ' at ', ' in ', ' on ', ' be ', ' is ', ' are ', ' am ', ' were ', ' was ', ' to ', ' with ', ' of ', '/']

  def _clean(line):
    # drop the trailing '\n' and mark the start of the review
    text = "<s> " + line.strip()
    for mark in punctuation:
      text = text.replace(mark, '')
    text = text.lower()  # normalization
    for stop in stop_words:
      text = text.replace(stop, ' ')
    return text

  with open(fn) as handle:
    raw_lines = handle.readlines()
  return [_clean(line) for line in raw_lines]

def perplexity_classify(test_corpus, real_label, file, ngram):
  '''
  Label every review with the class whose language model gives the lower
  perplexity, and write the labels to a CSV file with header Id,Prediction.

  test_corpus: the corpus to classify, one preprocessed string per review
  real_label: the real label of the corpus (0 truthful, 1 deceptive);
              pass -1 when it is unknown, then no accuracy is printed
  file: path of the CSV file to write
  ngram: 1 to use the unigram models, 2 to use the bigram models

  Reads the module-level training corpora, smoothed tables and addK.
  Raises ValueError for any other ngram value; previously both perplexities
  stayed 0 in that case and every review was silently labelled 1.
  '''
  if ngram not in (1, 2):
    raise ValueError("ngram must be 1 (unigram) or 2 (bigram), got %r" % (ngram,))

  # newline='' lets the csv module control the line endings itself
  with open(file, 'w', encoding='utf-8-sig', newline='') as fileout:
    writer = csv.writer(fileout, delimiter=',')
    writer.writerow(['Id','Prediction'])

    N = len(test_corpus)
    num_wrong_label = 0
    for i, review in enumerate(test_corpus):
      if ngram == 1:
        perp_tru = plexity_unigram(tru_train, review, tru_train_uni_smooth, addK)
        perp_decp = plexity_unigram(decp_train, review, decp_train_uni_smooth, addK)
      else:
        perp_tru = plexity_bigram(tru_train, review, tru_train_uni_smooth, tru_train_bi_smooth, addK)
        perp_decp = plexity_bigram(decp_train, review, decp_train_uni_smooth, decp_train_bi_smooth, addK)

      # ties go to the deceptive class (1)
      label = 0 if perp_tru < perp_decp else 1
      writer.writerow([i, label])
      if real_label != -1:
        num_wrong_label += abs(label - real_label)
    # skip the accuracy for an empty corpus instead of dividing by zero
    if real_label != -1 and N > 0:
      print("Accuracy = ", 1 - num_wrong_label / N)

  print("Classification Done")
  
# cleaned validation reviews of each class and the unlabelled test reviews
tru_val = preprocess_test_corpus(path+"validation/truthful.txt")
decp_val = preprocess_test_corpus(path + "validation/deceptive.txt")
test = preprocess_test_corpus(path + "test/test.txt")

# classify both validation sets with the unigram models; the known label lets
# perplexity_classify print the accuracy on each set
print('Classifying tru_val')
perplexity_classify(test_corpus=tru_val, real_label=0, file="truthful_out.csv", ngram=1)
print('Classifying decp_val')
perplexity_classify(test_corpus=decp_val, real_label=1, file="deceptive_out.csv", ngram=1)
# test-set run, currently disabled:
#print('Classifying test')
#perplexity_classify(test, -1, "test_out.csv", 1)

# Opinion Spam Classification using Naive Bayes

### <font color='navy'> Read Data </font>

In [0]:
def read_data(fn): 
    """Return the lines of a text file as a list and print how many were read.

    fn: path of the text file
    returns: list of lines, each still ending with its newline character
    """
    with open(fn) as handle:
        samples = list(handle)
    print(fn + " corpus has",len(samples),"samples")
    return samples
  
# Training reviews of each class. The deceptive set must come from deceptive.txt:
# it was read from truthful.txt, so both classes were trained on identical reviews.
xtrain_true = read_data(path + "train/truthful.txt")
xtrain_false = read_data(path + "train/deceptive.txt")
xval_true, xval_false =  read_data(path + "validation/truthful.txt"), read_data(path + "validation/deceptive.txt")
# truthful reviews first, deceptive reviews second; the label lists below follow the same order
xtrain_raw = xtrain_true + xtrain_false
xval_raw = xval_true + xval_false
ytrain = [0] * len(xtrain_true) + [1] * len(xtrain_false)  # truthful -> 0, deceptive -> 1
yval = [0] * len(xval_true) + [1] * len(xval_false)

### <font color='navy'> Data Preprocessing </font>

In [0]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import nltk
from nltk.corpus import stopwords

# preprocess raw data
def preprocess(data_raw):
    """Clean and stem a list of raw reviews for the Naive Bayes pipeline.

    Every character that is not an ASCII letter is replaced by a space, the
    text is tokenised with word_tokenize, and each token is stemmed with a
    PorterStemmer.

    data_raw: iterable of review strings
    returns: list with one space-joined string of stems per review

    NOTE: this reuses the name `preprocess` of the language-model section
    (which takes a file path), so from this cell on that function is replaced.
    """
    stemmer = PorterStemmer()
    cleaned = []
    for review in data_raw:
        # keep letters only: punctuation, digits and newlines become spaces
        letters_only = re.sub('[^A-Za-z]', ' ', review)
        stems = [stemmer.stem(token) for token in word_tokenize(letters_only)]
        cleaned.append(" ".join(stems))  # group the stems back into one string
    return cleaned

# stem the training reviews first, then the validation reviews
xtrain, xval = preprocess(xtrain_raw), preprocess(xval_raw)

### <font color='navy'> Feature generation by vectorizing samples with a Bag-of-Words representation </font>

In [0]:
from sklearn.feature_extraction.text import CountVectorizer


def feature_extraction(train, df_min, df_max, ngram_min, ngram_max):
    """Fit a CountVectorizer on the training texts and return their count matrix.

    train:     list of preprocessed training texts
    df_min:    passed to CountVectorizer as min_df
    df_max:    passed to CountVectorizer as max_df
    ngram_min: lower bound of the n-gram range
    ngram_max: upper bound of the n-gram range
    returns: (dense count array with one row per text, the fitted vectorizer)

    Also prints the number of features in the fitted vocabulary.
    """
    vectorizer = CountVectorizer(min_df = df_min, max_df = df_max,ngram_range = (ngram_min, ngram_max))
    # toarray() returns a plain ndarray. todense() returns np.matrix, which
    # recent scikit-learn estimators refuse as input.
    xtrain = vectorizer.fit_transform(train).toarray()
    print("#features for ngram ("+str(ngram_min)+" , "+str(ngram_max)+") = "+str(len(vectorizer.vocabulary_ )))
    return xtrain, vectorizer

# one feature set per n-gram range, all with min_df=4 and max_df=0.75
xtrain1, vectorizer = feature_extraction(xtrain, df_min=4, df_max=0.75, ngram_min=1, ngram_max=1)          # unigrams
xtrain2, bigram_vectorizer = feature_extraction(xtrain, df_min=4, df_max=0.75, ngram_min=2, ngram_max=2)   # bigrams
xtrain3, trigram_vectorizer = feature_extraction(xtrain, df_min=4, df_max=0.75, ngram_min=3, ngram_max=3)  # trigrams
xtrain4, mixgram_vectorizer = feature_extraction(xtrain, df_min=4, df_max=0.75, ngram_min=1, ngram_max=2)  # unigrams + bigrams

### <font color='navy'> Train a Naive Bayes classifier

  </font>

In [0]:
# Train a Naive Bayes classifier
import numpy as np
from sklearn.naive_bayes import MultinomialNB
# train a naive bayes classifier and store acc result in an result array 
def model_training(alf, xtrain, ytrain, xval, yval, vectorizer):
    """Train a multinomial Naive Bayes classifier and score it.

    alf:        smoothing parameter, passed to MultinomialNB as alpha
    xtrain:     training feature matrix
    ytrain:     training labels
    xval:       preprocessed validation texts (not yet vectorized)
    yval:       validation labels
    vectorizer: fitted vectorizer used to turn xval into features
    returns: (model, [training accuracy, validation accuracy], validation predictions)
    """
    model = MultinomialNB(alpha=alf, class_prior=None, fit_prior=True)
    model.fit(xtrain, ytrain)

    # toarray() returns a plain ndarray. todense() returns np.matrix, which
    # recent scikit-learn estimators refuse as input.
    xval = vectorizer.transform(xval).toarray()
    train_pred = model.predict(xtrain)
    val_pred = model.predict(xval)
    result = [np.mean(train_pred == ytrain), np.mean(val_pred == yval)]
    return model, result, val_pred

In [0]:
# Train one classifier per feature set (alpha = 1 throughout) and report its
# training and validation accuracy.

# unigram
uni_model, uni_result, uni_pred = model_training(
    alf=1, xtrain=xtrain1, ytrain=ytrain, xval=xval, yval=yval, vectorizer=vectorizer)
print("[unigram features] Naive Bayes training acc:", uni_result[0])
print("[unigram features] Naive Bayes validation acc:", uni_result[1])

# bigram
bi_model, bi_result, bi_pred = model_training(
    alf=1, xtrain=xtrain2, ytrain=ytrain, xval=xval, yval=yval, vectorizer=bigram_vectorizer)
print("\n[bigram features] Naive Bayes training acc:", bi_result[0])
print("[bigram features] Naive Bayes validation acc:", bi_result[1])

# trigram
tri_model, tri_result, tri_pred = model_training(
    alf=1, xtrain=xtrain3, ytrain=ytrain, xval=xval, yval=yval, vectorizer=trigram_vectorizer)
print("\n[trigram features] Naive Bayes training acc:", tri_result[0])
print("[trigram features] Naive Bayes validation acc:", tri_result[1])

# mixgram (1 and 2 gram)
mix_model, mix_result, mix_pred = model_training(
    alf=1, xtrain=xtrain4, ytrain=ytrain, xval=xval, yval=yval, vectorizer=mixgram_vectorizer)
print("\n[mixgram features] Naive Bayes training acc:", mix_result[0])
print("[mixgram features] Naive Bayes validation acc:", mix_result[1])

### <font color='navy'> Visualization

  </font>

In [0]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline

# roc_curve() needs a continuous score per review. The hard 0/1 labels returned by
# predict() collapse every curve to a single operating point, so each validation
# review is scored with the predicted probability of the deceptive class (label 1).
def _deceptive_proba(model, fitted_vectorizer):
    """Return the predicted probability of class 1 for every review in xval."""
    return model.predict_proba(fitted_vectorizer.transform(xval))[:, 1]

uni_fpr, uni_tpr, uni_thresholds = roc_curve(yval, _deceptive_proba(uni_model, vectorizer))
bi_fpr, bi_tpr, bi_thresholds = roc_curve(yval, _deceptive_proba(bi_model, bigram_vectorizer))
tri_fpr, tri_tpr, tri_thresholds = roc_curve(yval, _deceptive_proba(tri_model, trigram_vectorizer))
mix_fpr, mix_tpr, mix_thresholds = roc_curve(yval, _deceptive_proba(mix_model, mixgram_vectorizer))

# create plot
plt.plot(uni_fpr, uni_tpr, label='[unigram] ROC curve')
plt.plot(bi_fpr, bi_tpr, label='[bigram] ROC curve')
plt.plot(tri_fpr, tri_tpr, label='[trigram] ROC curve')
plt.plot(mix_fpr, mix_tpr, label='[mixgram] ROC curve')
plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
# assigning to _ keeps the returned matplotlib objects out of the cell output
_ = plt.xlabel('False Positive Rate')
_ = plt.ylabel('True Positive Rate')
_ = plt.title('ROC Curve For N-gram models')
_ = plt.xlim([-0.02, 1])
_ = plt.ylim([0, 1.02])
_ = plt.legend(loc="lower right")

In [0]:
# acc bar chart
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

model_names = ['unigram', 'bigram', 'trigram', 'mixgram']
# Validation accuracy is element 1 of each result list returned by model_training.
# The names uni_acc, bi_acc, tri_acc and mix_acc were never defined in this
# notebook, so the cell raised NameError on a fresh kernel.
val_accuracy = [uni_result[1], bi_result[1], tri_result[1], mix_result[1]]
x_pos = list(range(len(model_names)))
plt.ylim([0.7,1])
plt.bar(x_pos, val_accuracy, color=(0.2, 0.4, 0.6, 0.6))
plt.xlabel("N-gram models")
plt.ylabel("Accuracy")
plt.title("Naive Bayes Opinion Spam Classification Model")
plt.xticks(x_pos, model_names)
plt.show()

### <font color='navy'> Most Informative Features
 </font>

In [0]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=10):
    """Print the n features that most strongly indicate each of the two classes.

    vectorizer: fitted vectorizer that provides the feature names
    classifier: fitted binary classifier exposing classes_ and either
                feature_log_prob_ (Naive Bayes) or coef_ (linear models)
    n:          number of features to print per class

    Each feature gets one score. For Naive Bayes models the score is the
    log-odds log P(f | class 1) - log P(f | class 0); for other models it is
    coef_[0]. The n lowest scores are printed for the first class, then the n
    highest scores (highest first) for the second class.
    """
    class_labels = classifier.classes_
    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; use whichever the installed version provides
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    if hasattr(classifier, "feature_log_prob_"):
        # MultinomialNB no longer exposes coef_ in newer scikit-learn releases.
        # The log-odds between the classes ranks features by how much they
        # favour one class; the class-1 log-probability alone only ranks them
        # by how common they are in class 1.
        log_prob = classifier.feature_log_prob_
        scores = log_prob[1] - log_prob[0]
    else:
        scores = classifier.coef_[0]

    ranked = sorted(zip(scores, feature_names))  # sort once, slice both ends
    topn_class1 = ranked[:n]
    topn_class2 = ranked[-n:] if n > 0 else []

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)

    print("\n")

    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)


# inspect the mixed unigram + bigram model, using the default of 10 features per class
most_informative_feature_for_binary_classification(vectorizer=mixgram_vectorizer, classifier=mix_model)

### <font color='navy'> Predict on Test Data 
</font>

In [0]:
import numpy as np
import csv
# Read the unlabelled test reviews from the dataset folder, the same file the
# language-model section loads. A bare "test.txt" only resolved if the file had
# been copied into the working directory by hand.
with open(path + "test/test.txt") as f:
    xtest_raw = f.readlines()
print("Test set has",len(xtest_raw),"samples")

x = preprocess(xtest_raw)
# toarray() returns a plain ndarray; todense() returns np.matrix, which recent
# scikit-learn estimators refuse as input
xtest = mixgram_vectorizer.transform(x).toarray()
print(xtest.shape)
test_pred = mix_model.predict(xtest)
a = np.array(test_pred)

# one row per test review: its position in the file and the predicted label
# (0 truthful, 1 deceptive); newline='' lets the csv module control line endings
with open('output.csv', 'w', encoding='utf-8-sig', newline='') as fh:
    writer = csv.writer(fh, delimiter=',')
    writer.writerow(['Id','Prediction'])
    writer.writerows(enumerate(a))