### Sentiment Analysis

#### WEI Chen, Pierre-Yves Casanova

## Baseline 1
## SentiWordNet

Load the IMDb training and test splits used to evaluate the baselines.

In [None]:
import pandas as pd       
# Training split: the "sentence" column holds the text, "label" the target.
data = pd.read_csv("imdb_data/train_binary_sent.csv", header=0, delimiter=",")
X_train, y_train = data["sentence"], data["label"]

# Test split, same column layout. `data` is rebound here, so after this cell
# it refers to the test frame.
data = pd.read_csv("imdb_data/test_binary_sent.csv", header=0, delimiter=",")
X_test, y_test = data["sentence"], data["label"]

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
# from nltk.sentiment.util import mark_negation
 
 
lemmatizer = WordNetLemmatizer()

import re  # needed here: the only other `import re` lives in a later cell

# Negation cues: a closed list of whole-token words, or any token that
# contains the "n't" clitic. Matching is case-sensitive.
_NEGATION_RE = re.compile(
    r"""
        (?:
            ^(?:never|no|nothing|nowhere|noone|none|not|
                havent|hasnt|hadnt|cant|couldnt|shouldnt|
                wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint|
                neither|nor|hardly|seldom|fail
            )$
        )
        |
        n't""",
    re.VERBOSE,
)

# A token consisting of a single clause-level punctuation mark closes the
# current negation scope.
_CLAUSE_PUNCT_RE = re.compile(r'^[,.:;!?]$')


def mark_negation(document, double_neg_flip=False, shallow=False):
    """
    Append ``_NEG`` to every token that falls inside a negation scope.

    A scope opens on a negation cue (the cue itself is left unmarked) and
    closes at the next clause punctuation token (one of ``, . : ; ! ?``).

    :param document: list of ``(token, pos_tag)`` pairs, e.g. the output of
        ``nltk.pos_tag``. The list is modified in place.
    :param double_neg_flip: if True, a negation cue met inside an open scope
        closes the scope instead of being marked. If False (default), such a
        cue is marked with ``_NEG`` like any other token in scope.
    :param shallow: accepted for signature compatibility only; this
        implementation always modifies ``document`` in place.
    :return: the same ``document`` list, with marked tokens replaced by
        ``(token + '_NEG', pos_tag)`` tuples.
    """
    neg_scope = False
    for i, item in enumerate(document):
        token = item[0]
        if _NEGATION_RE.search(token):
            if not neg_scope or double_neg_flip:
                # The cue toggles the scope and is never marked itself.
                neg_scope = not neg_scope
                continue
            document[i] = (token + '_NEG', item[1])
        elif neg_scope:
            if _CLAUSE_PUNCT_RE.search(token):
                # Punctuation ends the scope and stays unmarked.
                neg_scope = False
            else:
                document[i] = (token + '_NEG', item[1])

    return document

def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    Only the first letter of ``tag`` matters: J -> adjective, N -> noun,
    R -> adverb, V -> verb. Any other tag yields ``None``.
    """
    # Attribute names are resolved on `wn` only when a prefix matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None
 
def swn_polarity(text):
    """
    Classify ``text`` with SentiWordNet: 0 = negative, 1 = positive.

    The text is split into sentences and tokens, POS-tagged, and passed
    through ``mark_negation``. For every noun, adjective, adverb or verb that
    has a WordNet entry, the first synset's (positive - negative)
    SentiWordNet score is added to a running total; the sign is flipped for
    tokens carrying the ``_NEG`` marker.

    :param text: raw document string.
    :return: 1 if at least one token was scored and the total is >= 0,
        otherwise 0 (including when no token could be scored).
    """
    neg_suffix = "_NEG"
    sentiment = 0.0
    tokens_count = 0

    for raw_sentence in sent_tokenize(text):
        # Tokenize, POS-tag, then flag tokens inside a negation scope.
        tagged_sentence = mark_negation(pos_tag(word_tokenize(raw_sentence)))
        for word, tag in tagged_sentence:
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
                continue

            # Strip only the exact trailing marker. Splitting on "_" would
            # truncate tokens that contain their own underscores and lose
            # the negation flag on tokens such as "a_b_NEG".
            negated = word.endswith(neg_suffix)
            if negated:
                word = word[:-len(neg_suffix)]

            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue

            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue

            # Use the first synset returned for the lemma.
            swn_synset = swn.senti_synset(synsets[0].name())
            score = swn_synset.pos_score() - swn_synset.neg_score()
            sentiment += -score if negated else score
            tokens_count += 1

    # Judgment call: a document with no scorable token defaults to negative.
    if not tokens_count:
        return 0

    # A total of exactly zero counts as positive.
    return 1 if sentiment >= 0 else 0

In [None]:
from sklearn.metrics import accuracy_score
# Run the SentiWordNet baseline on every test document, then report the
# fraction of predictions that match the gold labels.
pred_y = list(map(swn_polarity, X_test.values))

print(accuracy_score(y_test.values, pred_y))

## Baseline 2
## n-gram + sklearn LinearSVC

In [None]:
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.base import TransformerMixin 
from sklearn.model_selection import train_test_split, cross_val_score
import re

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
def _tokenize_with_negation(text):
    """Tokenize ``text`` with ``word_tokenize`` and pass it through ``mark_negation``."""
    return mark_negation(word_tokenize(text))


# Unigram + bigram counts over the negation-marked tokens. To drop negation
# marking, pass tokenizer=word_tokenize instead.
_ngram_vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=_tokenize_with_negation,
    ngram_range=(1, 2),
    stop_words='english',
    max_features=None,
)

clf = Pipeline([
    ('vectorizer', _ngram_vectorizer),
    ('classifier', LinearSVC()),
])

# Fit on the training split; the final expression is the test-set accuracy.
clf.fit(X_train.values, y_train.values)
clf.score(X_test.values, y_test.values)

# print(np.mean(cross_val_score(clf, trX, trY, cv=4)))