## Importing Libraries

In [1]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
import nltk
from nltk.tokenize import MWETokenizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn import metrics
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import RegexpTokenizer
import gensim
import logging
from gensim.models.doc2vec import LabeledSentence
from gensim.models import word2vec
from tqdm import tqdm
from nltk.tokenize import MWETokenizer

In [None]:
nltk.download('wordnet')

In [2]:
lab_data = pd.read_csv('./Training Dataset-20191010/labeled_data.csv')
# unlabeled_data = pd.read_csv('./Training Dataset-20191010/unlabeled_data.csv')

In [None]:
# lab_data.head()['text']

In [3]:
lab_data['text'][1]

'Flirted with giving this two stars, but that\'s a pretty damning rating for what might have just been an off night...\r\n\r\nNew to the East side, and so we don\'t know many of these hidden gems, but me and the fiance met her friend for drinks here and ended up getting some things to nibble. \r\n\r\nFirst off, service was pretty slow, which was unusual because the restaurant is pretty small and galley style. You would think it would be easy for servers to routinely hit up tables as you pass by. \r\n\r\nThe fiance ordered the Quinoa Salad, and said it was pretty good, but dry. I wasn\'t too hungry and so I simply ordered the Bruchetta 3-way which came with burnt crostinis. And I ordered a side of fries, which were either hard or chewy.\r\n\r\nThe friend ordered the macaroni & cheese, and added chicken and bacon (her usual order) and liked it.  \r\n\r\nCan\'t remember the last time I thought to myself- "Huh... they failed at fries..." So, like I said- two stars. But, the decor was good,

In [4]:
def remove_extra_characters(raw_text):
    """Strip line breaks and digits from a raw review string.

    Every run of carriage returns / newlines is replaced by a single space so
    that the words on either side of a line break stay separate tokens
    (deleting the break outright would glue them together). Every run of
    digits is then removed.

    Parameters
    ----------
    raw_text : str
        The review text to clean.

    Returns
    -------
    str
        The cleaned text. Apostrophes are left untouched.
    """
    # Replace line breaks with a space rather than nothing: "good\nfood"
    # must become "good food", not "goodfood".
    processed_text = re.sub(r'[\r\n]+', ' ', raw_text)
    # Drop all digit runs.
    processed_text = re.sub(r'\d+', '', processed_text)
    return processed_text

In [5]:
lab_data['text'] = lab_data.apply(lambda row: remove_extra_characters(row['text'].strip()), axis=1)

## Case Normalisation, Tokenization and Stop words removal

In [16]:
# Read the stop-word file (one entry per line) into a set; the bigram filter
# further down checks membership against it.
with open('./stopwords_en.txt') as stopword_file:
    stopwords = set(stopword_file.read().splitlines())

In [17]:
stopwords

{'a',
 "a's",
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren't",
 'around',
 'as',
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'b',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'c',
 "c'mon",
 "c's",
 'came',
 'can',
 "can't",
 'cannot',
 'cant',
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 'co',
 'com',
 'come',
 'c

In [6]:
def lemmatization(token_list):
    """Lemmatize every token with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    token_list : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(tok) for tok in token_list]

In [None]:
import contractions
def fix_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded_text = contractions.fix(text)
    return expanded_text

In [7]:
# Raw string literal: "\w" inside a normal string is an invalid escape sequence
# (a warning on current Python versions). The pattern matches a run of word
# characters optionally followed by an apostrophe and more word characters,
# so contractions such as "don't" stay a single token.
tokenizer = RegexpTokenizer(r"\w+(?:[']\w+)?")

In [11]:
def token(raw_data):
    """Lower-case ``raw_data``, tokenize it and re-join the tokens with spaces.

    Tokenization uses the module-level ``tokenizer``; the result is a single
    string in which tokens are separated by exactly one space.
    """
    lowered = raw_data.lower()
    return ' '.join(tokenizer.tokenize(lowered))

In [12]:
lab_data['text'] = lab_data.apply(lambda row: token(row['text'].strip()), axis=1)

In [13]:
lab_data['text'][1]

"flirted with giving this two stars but that's a pretty damning rating for what might have just been an off night new to the east side and so we don't know many of these hidden gems but me and the fiance met her friend for drinks here and ended up getting some things to nibble first off service was pretty slow which was unusual because the restaurant is pretty small and galley style you would think it would be easy for servers to routinely hit up tables as you pass by the fiance ordered the quinoa salad and said it was pretty good but dry i wasn't too hungry and so i simply ordered the bruchetta way which came with burnt crostinis and i ordered a side of fries which were either hard or chewy the friend ordered the macaroni cheese and added chicken and bacon her usual order and liked it can't remember the last time i thought to myself huh they failed at fries so like i said two stars but the decor was good it was a good place to have a conversation and i might be back to try more expens

In [14]:
# Flatten all reviews into one long list of whitespace-separated tokens
# (despite the name, this is a flat token list, not a list of sentences).
sentences = [tok for review in lab_data['text'] for tok in review.split()]

In [15]:
sentences

['the',
 'new',
 'rule',
 'is',
 'if',
 'you',
 'are',
 'waiting',
 'for',
 'a',
 'table',
 'which',
 'you',
 'almost',
 'always',
 'are',
 'you',
 'cant',
 'wait',
 'inside',
 'they',
 'just',
 'posted',
 'a',
 'sign',
 'upfront',
 'that',
 'it',
 'causes',
 'some',
 'concerns',
 'for',
 'the',
 'seated',
 'patrons',
 'how',
 'awful',
 'is',
 'that',
 'i',
 'like',
 'that',
 'they',
 'included',
 'the',
 'apology',
 'along',
 'with',
 'especially',
 'now',
 'in',
 'the',
 'cold',
 'p',
 's',
 'you',
 'can',
 'try',
 'calling',
 'in',
 'ahead',
 'to',
 'reserve',
 'a',
 'table',
 'but',
 'thats',
 'only',
 'if',
 'the',
 'waiting',
 'list',
 'is',
 'short',
 'otherwise',
 'you',
 'have',
 'to',
 'show',
 'up',
 'to',
 'reserve',
 'boourns',
 'this',
 'place',
 'could',
 'do',
 'no',
 'wrong',
 'in',
 'my',
 'eyes',
 'rattle',
 'away',
 'you',
 'equally',
 'clever',
 'witty',
 'name',
 'for',
 'a',
 'hot',
 'beverage',
 'must',
 'mention',
 'i',
 'am',
 'obsessed',
 'with',
 'mad',
 'ga

In [24]:
from nltk.util import ngrams
import collections

def get_bigram_without_stopwords(sentences, top_n=5000, max_bigrams=400, stop_words=None):
    """Return the most frequent bigrams that contain no stop word.

    Parameters
    ----------
    sentences : sequence of str
        A flat sequence of tokens; bigrams are formed from consecutive tokens.
    top_n : int, default 5000
        How many of the most frequent bigrams to consider before filtering.
    max_bigrams : int, default 400
        Maximum number of bigrams returned after filtering.
    stop_words : container of str, optional
        Words that disqualify a bigram when either member (lower-cased) is in
        it. Defaults to the module-level ``stopwords`` set.

    Returns
    -------
    list of tuple(str, str)
        Up to ``max_bigrams`` bigrams, ordered from most to least frequent.
    """
    if stop_words is None:
        # Fall back to the stop-word set loaded earlier in the notebook.
        stop_words = stopwords

    bigram_freq = collections.Counter(ngrams(sentences, 2))
    kept = [
        (first, second)
        for (first, second), _freq in bigram_freq.most_common(top_n)
        if first.lower() not in stop_words and second.lower() not in stop_words
    ]
    return kept[:max_bigrams]
    
    

In [29]:
bigram_wo_stop_1 = get_bigram_without_stopwords(sentences)
mwe_tokenizer = MWETokenizer(bigram_wo_stop_1, separator='_')

def generate_bigram(raw_text, mwe_tokenizer, lemmatize=False):
    """Merge known bigrams in ``raw_text`` into single tokens.

    Parameters
    ----------
    raw_text : str
        Whitespace-separated tokens.
    mwe_tokenizer : object
        Tokenizer exposing ``tokenize(list_of_tokens)`` that merges
        multi-word expressions (e.g. an ``MWETokenizer``).
    lemmatize : bool, default False
        When True, the merged tokens are passed through ``lemmatization``
        before being joined. The default keeps the previous output, in which
        the lemmatized tokens were computed but never used.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    tokens = mwe_tokenizer.tokenize(raw_text.split())
    if lemmatize:
        # Only pay for WordNet lemmatization when the result is actually used.
        tokens = lemmatization(tokens)
    return ' '.join(tokens)

lab_data['text'] = lab_data.apply(lambda row: generate_bigram(row['text'].strip(), mwe_tokenizer), axis=1)

In [31]:
lab_data['text'][0]

'the new rule is if you are waiting for a table which you almost always are you cant wait inside they just posted a sign upfront that it causes some concerns for the seated patrons how awful is that i like that they included the apology along with especially now in the cold p s you can try calling in ahead to reserve a table but thats only if the waiting list is short otherwise you have to show up to reserve boourns this place could do no wrong in my eyes rattle away you equally clever witty name for a hot beverage must mention i am obsessed with mad gab xoxom'

## TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 


# Build unigram TF-IDF features from the pre-processed review text.
# NOTE(review): the vectorizer is fitted on every row of lab_data here, and the
# train/test split only happens in a later cell, so the vocabulary and IDF
# weights also see the held-out reviews. Consider splitting first and calling
# fit_transform on the training part / transform on the test part.
vectorizer = TfidfVectorizer(lowercase = True,analyzer = 'word',ngram_range = (1,1), min_df=3, max_df=.99)
    
train_review = vectorizer.fit_transform(lab_data['text'])

In [None]:
?TfidfVectorizer

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_review, lab_data['label'],test_size=0.20, random_state=1)

In [None]:
X_train.shape

## Logistic Regression

In [None]:
def instantiate_cross_val(model):
    """Wrap ``model`` in a GridSearchCV that uses stratified 10-fold splits.

    The parameter grid is empty, so the returned (unfitted) search object only
    cross-validates the estimator with the settings it already has.
    """
    # Performing 10-fold cross-validation.
    folds = StratifiedKFold(n_splits=10)
    return GridSearchCV(model, cv=folds, param_grid={}, return_train_score=False)

In [None]:
model = LogisticRegression()
gs = instantiate_cross_val(model)

clf=gs.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
print('parameters:', clf.best_estimator_.get_params())

In [None]:
log_model = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
# Accuracy: 0.6123 2g

In [None]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# multi_class = ['multinomial','ovr']


# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [None]:
# Search over a dedicated estimator instead of the notebook-global `model`,
# which other cells rebind to unrelated objects. liblinear is chosen explicitly
# because the grid contains both 'l1' and 'l2' penalties and the solver must
# support both.
clf = GridSearchCV(LogisticRegression(solver='liblinear'), hyperparameters, cv=10, verbose=0)

In [None]:
best_model = clf.fit(X_train, y_train)

In [None]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

In [None]:
print('Best C:', best_model.best_estimator_.get_params())

In [None]:
y_best_pred = best_model.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_best_pred))

## SVM

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(X_train, y_train)
prediction_linear = classifier_linear.predict(X_test)
# results
classification_report(y_test, prediction_linear, output_dict=True)

In [None]:
print(metrics.accuracy_score( y_test,prediction_linear))

## Neural networks

In [None]:
seed = 7
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
def batch_generator(X_data, y_data, batch_size):
    """Yield ``(X_batch, y_batch)`` mini-batches forever, cycling in order.

    Parameters
    ----------
    X_data : sparse matrix
        Feature matrix supporting row indexing and ``.toarray()``; each batch
        is densified just before it is yielded.
    y_data : pandas.Series
        Labels aligned positionally with the rows of ``X_data``.
    batch_size : int
        Number of rows per batch; the last batch of a pass may be smaller.

    Yields
    ------
    tuple
        ``(X_batch, y_batch)`` where ``X_batch`` is a dense array and
        ``y_batch`` the matching slice of ``y_data``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive (raised on first iteration).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n_samples = X_data.shape[0]
    # Ceiling division gives the exact number of non-empty batches per pass.
    # The previous float comparison let one empty batch through whenever
    # n_samples was a multiple of batch_size. max(1, ...) keeps the modulo
    # below safe for empty input.
    n_batches = max(1, -(-n_samples // batch_size))
    index = np.arange(np.shape(y_data)[0])
    counter = 0
    while True:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X_data[index_batch, :].toarray()
        # Positional selection: correct even when the Series index is not unique.
        y_batch = y_data.iloc[index_batch]
        yield X_batch, y_batch
        # Wrap around to the first batch after the last one.
        counter = (counter + 1) % n_batches
            
# Feed-forward network with one hidden layer, trained on the TF-IDF features.
# NOTE(review): this rebinds the notebook-global name `model`, which an earlier
# cell set to a LogisticRegression instance.
# NOTE(review): the output layer is a single sigmoid unit trained with binary
# cross-entropy, i.e. a two-class setup. The confusion-matrix cell further down
# labels five classes ('1'..'5'); if `label` really takes five values this
# head/loss should be multi-class — confirm.
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# NOTE(review): steps_per_epoch is a float here (true division), and
# validation_data receives X_test without the .toarray() conversion that
# batch_generator applies to the training batches — verify both are accepted
# by the installed Keras version.
model.fit_generator(generator=batch_generator(X_train, y_train, 32),
                    epochs=5, validation_data=(X_test, y_test),
                    steps_per_epoch=X_train.shape[0]/32)

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    With ``normalize=True`` each row is divided by its row sum before it is
    printed and plotted, and cell values are shown with two decimals;
    otherwise the values are shown as integers. ``classes`` supplies the
    tick labels for both axes.
    """
    from itertools import product

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so the annotation stays
    # readable against the darker colour.
    midpoint = cm.max() / 2.
    for row, col in product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[row, col]
        text_colour = "white" if value > midpoint else "black"
        plt.text(col, row, format(value, cell_format),
                 horizontalalignment="center",
                 color=text_colour)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
plt.figure()
plot_confusion_matrix(cm, classes=['1', '2', '3', '4', '5'],
                      title='Confusion matrix')

In [None]:
# Class probabilities for the held-out split. `test_review` is not defined
# anywhere in this notebook; X_test is used because the cells below pair these
# probabilities row-by-row with y_test.
pred_probab = clf.predict_proba(X_test)

In [None]:
pred_probab

In [None]:
p_test = []

In [None]:
# Highest class probability per example. The list is rebuilt in one step so
# that re-running this cell does not append duplicate entries.
p_test = [max(class_probs) for class_probs in pred_probab]

In [None]:
# Collect, per held-out example, the true label, the predicted label and the
# highest predicted class probability.
# NOTE(review): the 'text' column is filled from X_test, which comes from the
# split of the TF-IDF matrix rather than the raw review strings — confirm this
# is what is intended.
# NOTE(review): in top-to-bottom order y_pred was last produced by log_model,
# while p_test is derived from clf's probabilities — confirm they should be
# compared side by side.
train_data = pd.DataFrame({'text':X_test, 'label':y_test, 'p_test':p_test, 'y_pred':y_pred})
train_data.head()

In [None]:
len(train_data[(train_data['p_test'] > 0.9) & (train_data['label']==train_data['y_pred'])])

## Word2vec + Logistic

In [None]:
lab_data['text'][1]

In [None]:
# sentences = []
# for review in lab_data['text']:
#     sentences.append(review.split(' '))
sentences = lab_data.apply(lambda row: row['text'].split(), axis=1).values

In [None]:
sentences

In [None]:
from gensim.models import phrases
bigrams = phrases.Phrases(sentences)


In [None]:
print(bigrams["this is the new york".split()])

In [None]:
sentences[0]

In [None]:
# Set values for various parameters
num_features = 200    # Word vector dimensionality
# NOTE(review): min_word_count is never passed to Word2Vec below — the call
# hard-codes min_count=3. Confirm which threshold is intended.
min_word_count = 10   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
# context = 10          # Context window size

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
# Train on the phrase-transformed corpus (bigrams[sentences]).
# NOTE(review): this rebinds the notebook-global name `model`, which earlier
# cells use for other estimators.
model = word2vec.Word2Vec(bigrams[sentences], workers=num_workers, \
            size=num_features, min_count=3)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
# model.init_sims(replace=True)

In [None]:
?word2vec.Word2Vec

In [None]:
model.doesnt_match("man woman child kitchen".split())

In [None]:
model.wv.syn0.shape

In [None]:
from itertools import islice
list(islice(model.wv.vocab, 11030, 13050))

In [None]:
train, test = train_test_split(lab_data, test_size=0.3, random_state = 42)

In [None]:
def w2v_tokenize_text(text):
    """Split ``text`` on single space characters and return the pieces as a list."""
    return text.split(' ')

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

In [None]:
# test_tokenized = test['text'].values
train_tokenized

In [None]:
def word_averaging(wv, words):
    """Average the normalized word vectors of ``words`` into one unit vector.

    Parameters
    ----------
    wv : gensim KeyedVectors
        Trained word vectors (gensim 3.x API: ``vocab``, ``syn0norm``,
        ``init_sims``, ``vector_size``).
    words : iterable
        Tokens to look up. Items that are already ``numpy.ndarray`` vectors
        are used as-is; tokens missing from the vocabulary are skipped.

    Returns
    -------
    numpy.ndarray
        The unit-length mean vector as float32, or an all-zero vector of
        length ``wv.vector_size`` when nothing could be looked up.
    """
    # The normalized vectors are only available after init_sims() has run.
    # Without this guard the function works only if some earlier call (for
    # example a similarity query) happened to trigger that computation.
    if getattr(wv, 'syn0norm', None) is None:
        wv.init_sims()

    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)
    return mean

def word_averaging_list(wv, text_list):
    """Stack the averaged word vector of every review into one 2-D array.

    Each item of ``text_list`` is passed to ``word_averaging`` together with
    ``wv``; the resulting vectors become the rows of the returned array.
    """
    averaged_rows = []
    for review in text_list:
        averaged_rows.append(word_averaging(wv, review))
    return np.vstack(averaged_rows)

In [None]:
X_train_word_average = word_averaging_list(model.wv,train_tokenized)
X_test_word_average = word_averaging_list(model.wv,test_tokenized)

In [None]:
# Logistic regression on the averaged Word2Vec document vectors.
logreg = LogisticRegression(random_state=1, solver='liblinear', multi_class = 'ovr')
logreg.fit(X_train_word_average, train['label'])
y_pred = logreg.predict(X_test_word_average)
# Pass (y_true, y_pred) in the same order as the other accuracy cells in this
# notebook; accuracy is unaffected by the order, but other metrics are not.
print('accuracy %s' % metrics.accuracy_score(test['label'], y_pred))
# print(classification_report(test.label, y_pred,target_names=my_tags))