## Reading Libraries 

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
import nltk
from nltk.tokenize import MWETokenizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn import metrics
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import RegexpTokenizer
import gensim
import logging
from gensim.models.doc2vec import LabeledSentence
from gensim.models import word2vec
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# %pip install gensim
nltk.download('wordnet')

In [None]:
lab_data = pd.read_csv('./Training Dataset-20191023/labeled_data.csv')
unlabeled_data = pd.read_csv('./Training Dataset-20191023/unlabeled_data.csv')

In [None]:
lab_data.head()

In [None]:
lab_data['text'][1]

In [None]:
def remove_extra_characters(raw_text):
    """Strip line-break characters and digit runs from a piece of text.

    Newline and carriage-return characters are deleted outright (nothing is
    inserted in their place) and every digit is removed. Apostrophes and
    all other characters are left untouched.
    """
    # Deleting characters is order-independent, so a single pass over one
    # combined character class covers line breaks and digits together.
    return re.sub(r'[\n\r\d]+', '', raw_text)

In [None]:
lab_data['text'] = lab_data.apply(lambda row: remove_extra_characters(row['text'].strip()), axis=1)

In [None]:
unlabeled_data['text'][0]

In [None]:
unlabeled_data['text'] = unlabeled_data.apply(lambda row: remove_extra_characters(row['text'].strip()), axis=1)

## Case Normalisation, Tokenization and Lemmatization (stop-word filtering is currently disabled)

In [None]:
# Read the stop-word file (one entry per line) straight into a set.
with open('./stopwords_en.txt') as f:
    stopwords = set(f.read().splitlines())

In [None]:
def lemmatization(token_list):
    """Return a list with the WordNet lemma of every token in ``token_list``.

    Each token is passed on its own to ``WordNetLemmatizer.lemmatize``; the
    order of the tokens is preserved.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word) for word in token_list]

In [None]:
# Raw string so that ``\w`` reaches the regex engine verbatim instead of being
# parsed as an (invalid) Python string escape.
# The pattern matches a run of word characters, optionally followed by an
# apostrophe and more word characters, so contractions stay a single token.
tokenizer = RegexpTokenizer(r"\w+(?:[']\w+)?")

In [None]:
def token(raw_data):
    """Lower-case, tokenize and lemmatize ``raw_data``; return one string.

    The text is split with the module-level ``tokenizer``, every token is
    lemmatized through ``lemmatization`` and the results are joined back
    together with single spaces. Stop words are not filtered out here.
    """
    words = tokenizer.tokenize(raw_data.lower())
    return ' '.join(lemmatization(words))

In [None]:
lab_data['text'] = lab_data.apply(lambda row: token(row['text'].strip()), axis=1)

In [None]:
lab_data['text'][1]

In [None]:
unlabeled_data['text'] = unlabeled_data.apply(lambda row: token(row['text'].strip()), axis=1)

In [None]:
unlabeled_data['text'][0]

## TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 


# Word-level TF-IDF over unigrams and bigrams; min_df / max_df bound how
# rare or how common a term may be before it is dropped from the vocabulary.
vectorizer = TfidfVectorizer(lowercase = True,analyzer = 'word',ngram_range = (1,2), min_df=3, max_df=.99)
    
# Fit the vectoriser on the labeled reviews and transform them in one step.
train_review = vectorizer.fit_transform(lab_data['text'])

In [None]:
?TfidfVectorizer

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_review, lab_data['label'],test_size=0.20, random_state=1)

## Logistic Regression

In [None]:
def instantiate_cross_val(model):
    """Wrap ``model`` in a grid search that uses 10-fold stratified CV.

    The parameter grid is empty, so the search only evaluates ``model`` with
    the settings it already has; training scores are not recorded.
    """
    folds = StratifiedKFold(n_splits=10)
    return GridSearchCV(model, param_grid={}, cv=folds, return_train_score=False)

In [None]:
model = LogisticRegression()
gs = instantiate_cross_val(model)

clf=gs.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
print('parameters:', clf.best_estimator_.get_params())

In [None]:
# Multinomial logistic regression using the SAG solver with a fixed random_state.
log_model = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
log_model.fit(X_train, y_train)
# Predictions for the held-out split; y_pred is reused by later cells.
y_pred = log_model.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# multi_class = ['multinomial','ovr']


# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [None]:
# The grid searches both 'l1' and 'l2' penalties, so the estimator needs a
# solver that accepts both; 'saga' does, whereas the 'lbfgs' default of
# recent scikit-learn releases rejects 'l1'. A fresh estimator is created
# here instead of relying on the module-level name ``model``, which other
# cells rebind to unrelated objects.
clf = GridSearchCV(LogisticRegression(solver='saga'), hyperparameters, cv=10, verbose=0)

In [None]:
best_model = clf.fit(X_train, y_train)

In [None]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

In [None]:
# Dump the complete parameter set of the best estimator found by the search
# (the label previously read "Best C:" although the whole dict is printed).
print('Best parameters:', best_model.best_estimator_.get_params())

In [None]:
y_best_pred = best_model.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_best_pred))

## SVM

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(X_train, y_train)
prediction_linear = classifier_linear.predict(X_test)
# Per-class results returned as a dict (output_dict=True); being the last
# expression of the cell, the dict is what the notebook displays.
classification_report(y_test, prediction_linear, output_dict=True)

In [None]:
print(metrics.accuracy_score( y_test,prediction_linear))

## Neural networks

In [None]:
seed = 7
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
def batch_generator(X_data, y_data, batch_size):
    """Yield ``(X_batch, y_batch)`` pairs forever, cycling through the data.

    Parameters
    ----------
    X_data : sparse matrix supporting row slicing and ``.toarray()``
        Feature matrix; each batch is densified just before it is yielded.
    y_data : pandas Series or sliceable sequence
        Targets aligned positionally with the rows of ``X_data``.
    batch_size : int
        Maximum number of rows per batch; the last batch of a pass may be
        smaller.

    Yields
    ------
    tuple
        Dense feature rows and the matching slice of targets.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``X_data`` has no rows.
    """
    n_samples = X_data.shape[0]
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if n_samples == 0:
        raise ValueError("cannot generate batches from an empty data set")

    # Select targets by position so they stay aligned with the feature rows
    # whatever the Series index labels look like.
    positional = getattr(y_data, 'iloc', y_data)

    while True:
        # Stepping by batch_size ends exactly at the last row, so no empty
        # batch is produced when n_samples is a multiple of batch_size.
        for start in range(0, n_samples, batch_size):
            stop = min(start + batch_size, n_samples)
            yield X_data[start:stop].toarray(), positional[start:stop]
            
# Feed-forward classifier with one hidden layer on the TF-IDF features.
# The input width is read from the training matrix itself, because no
# ``dim`` variable is defined at module level.
# NOTE(review): a single sigmoid unit with binary cross-entropy models a
# two-class target — confirm this matches the label encoding in use.
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Round the step count up to a whole number so that one epoch covers every
# training row, including the final shorter batch.
batch_size = 32
model.fit_generator(generator=batch_generator(X_train, y_train, batch_size),
                    epochs=5, validation_data=(X_test, y_test),
                    steps_per_epoch=int(np.ceil(X_train.shape[0] / batch_size)))

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    With ``normalize=True`` every row is divided by its row sum before the
    matrix is printed and plotted; otherwise the values are used as given.
    ``classes`` supplies the tick labels for both axes.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    # Heat map with title and colour scale.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Write every cell's value on top of it; cells above half of the maximum
    # get white text, the rest black.
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_colour = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
plt.figure()
plot_confusion_matrix(cm, classes=['1', '2', '3', '4', '5'],
                      title='Confusion matrix')

## Predict unlabeled data

In [None]:
remaining_unlabeled = unlabeled_data.copy()

In [None]:
remaining_unlabeled

In [None]:
# Self-training: in every round the current model pseudo-labels the pool of
# unlabeled reviews, the predictions it is confident about are moved into the
# training set, and the vectoriser and classifier are refitted on the result.
confidence_threshold = 0.8   # top-class probability needed to accept a pseudo-label
n_rounds = 5

train_data = lab_data
for j in range(n_rounds):
    if remaining_unlabeled.empty:
        # Nothing left to pseudo-label, so there is nothing to predict on.
        print("no unlabeled data remaining after", j, "rounds")
        break

    unlabeled_test = vectorizer.transform(remaining_unlabeled['text'])
    pred_class = log_model.predict(unlabeled_test)
    pred_probab = log_model.predict_proba(unlabeled_test)
    # Confidence of a prediction = probability of its most likely class.
    p_test = pred_probab.max(axis=1)
    confident = p_test > confidence_threshold

    # Build new frames rather than adding/dropping columns in place on a
    # filtered frame, which is what raises pandas' SettingWithCopyWarning.
    new_train_data = remaining_unlabeled[confident].assign(label=pred_class[confident])
    print("length of obtained train data:", len(new_train_data))

    remaining_unlabeled = remaining_unlabeled[~confident].copy()
    print("length of remaining data:", len(remaining_unlabeled))

    # ignore_index avoids duplicate index labels in the combined frame.
    train_data = pd.concat([train_data, new_train_data], ignore_index=True)
    print("length of train data:", len(train_data))

    # Refit on the enlarged training set.
    # NOTE: the hold-out split is drawn from that enlarged set, so it holds
    # pseudo-labeled rows as well as hand-labeled ones.
    vectorizer = TfidfVectorizer(lowercase = True,analyzer = 'word',ngram_range = (1,2), min_df=3, max_df=.99)
    train = vectorizer.fit_transform(train_data['text'])
    X_train, X_test, y_train, y_test = train_test_split(train, train_data['label'],test_size=0.20, random_state=1)
    log_model = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
    log_model.fit(X_train, y_train)
    y_pred = log_model.predict(X_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
len(train_data)

In [None]:
# Write without the DataFrame index so that reloading the file does not
# bring back a spurious "Unnamed: 0" column (same convention as the
# prediction export further down).
train_data.to_csv("train.csv", index=False)

In [None]:
train_data = pd.read_csv("./train.csv")

In [None]:
# train_data.drop(['Unnamed:0'], axis=1, inplace=True)
train_data['text']

In [None]:
# Rebuild the TF-IDF features and the classifier from the reloaded train_data.
vectorizer = TfidfVectorizer(lowercase = True,analyzer = 'word',ngram_range = (1,2), min_df=3, max_df=.99)
# The text column is cast to unicode strings before vectorising
# (presumably because values read back from CSV are not guaranteed to be str).
train = vectorizer.fit_transform(train_data['text'].values.astype('U'))
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(train, train_data['label'],test_size=0.20, random_state=1)
log_model = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
test_data = pd.read_csv("./test_data.csv")

In [None]:
test_data.head()

In [None]:
test_data['text'] = test_data.apply(lambda row: remove_extra_characters(row['text'].strip()), axis=1) 

In [None]:
test_data['text'] = test_data.apply(lambda row: token(row['text'].strip()), axis=1)

In [None]:
vectorizer = TfidfVectorizer(lowercase = True,analyzer = 'word',ngram_range = (1,2), min_df=3, max_df=.99)
train = vectorizer.fit_transform(train_data['text'].values.astype('U'))
test = vectorizer.transform(test_data['text'])

In [None]:
log_model = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
log_model.fit(train, train_data['label'])
y_pred = log_model.predict(test)

In [None]:
pred_data = pd.DataFrame({'test_id':test_data['test_id'], 'label':y_pred})

In [None]:
pred_data.to_csv("predict_label.csv", index=False)

In [None]:
unlabeled_data['text'][0]

In [None]:
unlabeled_test = vectorizer.transform(unlabeled_data['text'])

In [None]:
pred_class = log_model.predict(unlabeled_test)

In [None]:
pred_class[10000]

In [None]:
pred_probab = log_model.predict_proba(unlabeled_test)

In [None]:
pred_probab[10000]

In [None]:
p_test = []

In [None]:
for i in range(len(pred_probab)):
    p_test.append(max(pred_probab[i]))

In [None]:
unlabeled_data['label'] = pred_class
unlabeled_data['probability'] = p_test

In [None]:
unlabeled_data.head()

In [None]:
new_train_data = unlabeled_data[unlabeled_data['probability'] > 0.8]

In [None]:
remaining_unlabeled_data = unlabeled_data[unlabeled_data['probability'] <= 0.8]

In [None]:
len(remaining_unlabeled_data)

In [None]:
len(new_train_data)

In [None]:
new_train_data.drop(['probability'], axis=1, inplace=True)
new_train_data

In [None]:
train_data = pd.concat([lab_data, new_train_data])
len(train_data)

In [None]:
train_data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 


vectorizer_new = TfidfVectorizer(lowercase = True,analyzer = 'word',ngram_range = (1,2), min_df=3, max_df=.99)
    
train = vectorizer_new.fit_transform(train_data['text'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train, train_data['label'],test_size=0.20, random_state=1)

In [None]:
log_model_new = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
log_model_new.fit(X_train, y_train)
y_pred = log_model_new.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
remaining_unlabeled_data.drop(['probability','label'], axis=1, inplace=True)
remaining_unlabeled_data

In [None]:
unlabeled_test_2 = vectorizer_new.transform(remaining_unlabeled_data['text'])

In [None]:
pred_class_2 = log_model_new.predict(unlabeled_test_2)

In [None]:
pred_probab_2 = log_model_new.predict_proba(unlabeled_test_2)

In [None]:
pred_class_2[0]

In [None]:
pred_probab_2[0]

In [None]:
p_test = []
for i in range(len(pred_probab_2)):
    p_test.append(max(pred_probab_2[i]))

In [None]:
p_test

In [None]:
remaining_unlabeled_data['label'] = pred_class_2
remaining_unlabeled_data['probability'] = p_test

In [None]:
new_train_data_1 = remaining_unlabeled_data[remaining_unlabeled_data['probability'] > 0.8]

In [None]:
new_train_data_1

In [None]:
remaining_unlabeled_data_1 = remaining_unlabeled_data[remaining_unlabeled_data['probability'] <= 0.8]

In [None]:
new_train_data_1.drop(['probability'], axis=1, inplace=True)
remaining_unlabeled_data_1.drop(['probability','label'], axis=1, inplace=True)

In [None]:
train_data_1 = pd.concat([train_data, new_train_data_1])
len(train_data_1)

In [None]:
train = vectorizer_new.fit_transform(train_data_1['text'])
X_train, X_test, y_train, y_test = train_test_split(train, train_data_1['label'],test_size=0.20, random_state=1)
log_model_new = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
log_model_new.fit(X_train, y_train)
y_pred = log_model_new.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
len(remaining_unlabeled_data_1)

In [None]:
unlabeled_test_3 = vectorizer_new.transform(remaining_unlabeled_data_1['text'])

In [None]:
pred_class_3 = log_model_new.predict(unlabeled_test_3)

In [None]:
pred_probab_3 = log_model_new.predict_proba(unlabeled_test_3)

In [None]:
pred_class_3[0]

In [None]:
pred_probab_3[0]

In [None]:
p_test = []
for i in range(len(pred_probab_3)):
    p_test.append(max(pred_probab_3[i]))

In [None]:
p_test

In [None]:
remaining_unlabeled_data_1['label'] = pred_class_3
remaining_unlabeled_data_1['probability'] = p_test

In [None]:
new_train_data_2 = remaining_unlabeled_data_1[remaining_unlabeled_data_1['probability'] > 0.8]

In [None]:
new_train_data_2

In [None]:
# Keep the rows of remaining_unlabeled_data_1 — the frame that was just
# scored and from which new_train_data_2 was taken — whose confidence is at
# or below the 0.8 cut-off. (Filtering remaining_unlabeled_data here would
# select from the previous round's pool instead.)
remaining_unlabeled_data_2 = remaining_unlabeled_data_1[remaining_unlabeled_data_1['probability'] <= 0.8]

In [None]:
new_train_data_2.drop(['probability'], axis=1, inplace=True)
remaining_unlabeled_data_2.drop(['probability','label'], axis=1, inplace=True)

In [None]:
train_data_2 = pd.concat([train_data_1, new_train_data_2])
len(train_data_2)

In [None]:
train = vectorizer_new.fit_transform(train_data_2['text'])
X_train, X_test, y_train, y_test = train_test_split(train, train_data_2['label'],test_size=0.20, random_state=1)
log_model_new = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
log_model_new.fit(X_train, y_train)
y_pred = log_model_new.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

## Word2vec + Logistic

In [None]:

train_data_1 = pd.DataFrame({'review':unlabeled_data['text']})
train_data_2 = pd.DataFrame({'review':lab_data['text']})

In [None]:
train_data = pd.concat([train_data_1, train_data_2])
len(train_data)

In [None]:
train_data.head()

In [None]:
# sentences = []
# for review in lab_data['text']:
#     sentences.append(review.split())
# for review in unlabeled_data['text']:
#     sentences.append(review.split)
sentences = train_data.apply(lambda row: row['review'].split(), axis=1).values

In [None]:
sentences

In [None]:
from gensim.models import phrases
bigrams = phrases.Phrases(sentences)


In [None]:
print(bigrams["this is the new york".split()])

In [None]:
sentences[0]

In [None]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 5   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
# context = 10          # Context window size
# NOTE(review): min_word_count is assigned above but never used; the call
# below hard-codes min_count=3 — confirm which threshold is intended.

# Initialize and train the model (this will take some time).
# The training corpus is `sentences` passed through the `bigrams` phrase model.
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(bigrams[sentences], workers=num_workers, \
            size=num_features, min_count=3)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
# model.init_sims(replace=True)

In [None]:
?word2vec.Word2Vec

In [None]:
model.doesnt_match("man woman child kitchen".split())

In [None]:
model.wv.syn0.shape

In [None]:
from itertools import islice
list(islice(model.wv.vocab, 11030, 13050))

In [None]:
train, test = train_test_split(lab_data, test_size=0.3, random_state = 42)

In [None]:
def w2v_tokenize_text(text):
    """Split ``text`` on single space characters and return the pieces.

    Consecutive spaces therefore produce empty-string tokens.
    """
    return text.split(' ')

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

In [None]:
# test_tokenized = test['text'].values
train_tokenized

In [None]:
def word_averaging(wv, words):
    """Return the unit-length mean of the word vectors for ``words``.

    Parameters
    ----------
    wv : keyed-vectors object
        Must expose ``vocab``, ``syn0norm``, ``vector_size`` and
        ``init_sims``.
    words : iterable
        Tokens to look up. Entries that are already ``numpy`` arrays are used
        as vectors directly; tokens missing from ``wv.vocab`` are skipped.

    Returns
    -------
    numpy.ndarray
        A ``float32`` vector. It is all zeros of length ``wv.vector_size``
        (and a warning is logged) when none of ``words`` yields a vector.
    """
    # The normalised vectors only exist once init_sims() has run; make sure
    # they are available instead of failing later on a ``None`` lookup.
    if getattr(wv, 'syn0norm', None) is None:
        wv.init_sims()

    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        # Same dtype as the regular return value, so stacking results does
        # not silently upcast the whole matrix.
        return np.zeros(wv.vector_size, dtype=np.float32)

    mean = gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)
    return mean

def word_averaging_list(wv, text_list):
    """Stack the averaged word vector of every review in ``text_list``.

    Returns a 2-D array with one row per review, each row produced by
    ``word_averaging``.
    """
    averaged_rows = []
    for tokens in text_list:
        averaged_rows.append(word_averaging(wv, tokens))
    return np.vstack(averaged_rows)

In [None]:
X_train_word_average = word_averaging_list(model.wv,train_tokenized)
X_test_word_average = word_averaging_list(model.wv,test_tokenized)

In [None]:
# Logistic regression on the averaged word-vector features.
logreg = LogisticRegression(random_state=1, C=2, solver='sag', multi_class = 'multinomial')
logreg.fit(X_train_word_average, train['label'])
y_pred = logreg.predict(X_test_word_average)
# Note the argument order: predictions are passed first and the true labels second.
print('accuracy %s' % metrics.accuracy_score(y_pred, test.label))
# print(classification_report(test.label, y_pred,target_names=my_tags))

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(X_train_word_average, train['label'])
prediction_linear = classifier_linear.predict(X_test_word_average)
# results
classification_report(test.label, prediction_linear, output_dict=True)