# Group 27
# Dataset-2
##### IMDB reviews

##### Text Classification

# Import Packages

In [None]:
import glob
import io
import logging
import multiprocessing
import os
import pickle
import re
import string
from time import time

import gensim
import nltk
import numpy as np
import pandas as pd
import spacy
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.test.utils import get_tmpfile
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# Configure the root logger: INFO level, "LEVEL - HH:MM:SS: message" records.
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

# Load the DataSet

## Load Training Set

In [None]:
# Collect the per-review .txt files of the training split.
# The lists are sorted because glob returns paths in arbitrary (filesystem-dependent)
# order, which would make the row order of every derived file non-reproducible.
path = r'/mnt/d/ml_p2/aclImdb/train/pos'
all_trainpos = sorted(glob.glob(os.path.join(path, "*.txt")))

path1 = r'/mnt/d/ml_p2/aclImdb/train/neg'
all_trainneg = sorted(glob.glob(os.path.join(path1, "*.txt")))

In [None]:
# Concatenate all positive training reviews into one tab-separated file,
# one "<review text>\t1" record per line (label 1 = positive).
labelled_reviews = []
for file in all_trainpos:
    # Explicit encoding so the result does not depend on the machine's locale
    # (the review files are presumably UTF-8 -- confirm against the corpus).
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read().strip('\n')
    labelled_reviews.append(content + '\t1\n')

# Join once at the end instead of growing a string with += on every iteration.
output = ''.join(labelled_reviews)
with open('train_pos.txt', 'w', encoding='utf-8') as result:
    result.write(output)

In [None]:
# Concatenate all negative training reviews into one tab-separated file,
# one "<review text>\t0" record per line (label 0 = negative).
labelled_reviews = []
for file in all_trainneg:
    # Explicit encoding so the result does not depend on the machine's locale
    # (the review files are presumably UTF-8 -- confirm against the corpus).
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read().strip('\n')
    labelled_reviews.append(content + '\t0\n')

# Join once at the end instead of growing a string with += on every iteration.
output = ''.join(labelled_reviews)
with open('train_neg.txt', 'w', encoding='utf-8') as result:
    result.write(output)

In [None]:
# Write train.txt as the positive file followed by the negative file,
# then one extra newline at the very end.
filenames = ['train_pos.txt', 'train_neg.txt']
with open('train.txt', 'w') as outfile:
    for part_name in filenames:
        with open(part_name) as infile:
            chunk = infile.read()
        outfile.write(chunk)
    outfile.write('\n')
    

## Load Test Set

In [None]:
# Collect the per-review .txt files of the test split.
# The lists are sorted because glob returns paths in arbitrary (filesystem-dependent)
# order, which would make the row order of every derived file non-reproducible.
path = r'/mnt/d/ml_p2/aclImdb/test/pos'
all_testpos = sorted(glob.glob(os.path.join(path, "*.txt")))

path1 = r'/mnt/d/ml_p2/aclImdb/test/neg'
all_testneg = sorted(glob.glob(os.path.join(path1, "*.txt")))


In [None]:
# Concatenate all positive test reviews into one tab-separated file,
# one "<review text>\t1" record per line (label 1 = positive).
labelled_reviews = []
# Must iterate the POSITIVE file list: looping over all_testneg here would write
# negative reviews with label 1 and corrupt the test labels.
for file in all_testpos:
    # Explicit encoding so the result does not depend on the machine's locale
    # (the review files are presumably UTF-8 -- confirm against the corpus).
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read().strip('\n')
    labelled_reviews.append(content + '\t1\n')

# Join once at the end instead of growing a string with += on every iteration.
output = ''.join(labelled_reviews)
with open('test_pos.txt', 'w', encoding='utf-8') as result:
    result.write(output)

In [None]:
# Concatenate all negative test reviews into one tab-separated file,
# one "<review text>\t0" record per line (label 0 = negative).
labelled_reviews = []
for file in all_testneg:
    # Explicit encoding so the result does not depend on the machine's locale
    # (the review files are presumably UTF-8 -- confirm against the corpus).
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read().strip('\n')
    labelled_reviews.append(content + '\t0\n')

# Join once at the end instead of growing a string with += on every iteration.
output = ''.join(labelled_reviews)
with open('test_neg.txt', 'w', encoding='utf-8') as result:
    result.write(output)

In [None]:
# Write test.txt as the positive file followed by the negative file,
# then one extra newline at the very end.
filenames = ['test_pos.txt', 'test_neg.txt']
with open('test.txt', 'w') as outfile:
    for part_name in filenames:
        with open(part_name) as infile:
            chunk = infile.read()
        outfile.write(chunk)
    outfile.write('\n')

# Preprocessing

In [None]:
# Read the merged training file into memory as one string.
# The context manager closes the handle even if read() raises.
with open('train.txt', encoding='utf-8') as train_file:
    train = train_file.read()

In [None]:
# Read the merged test file into memory as one string.
# The context manager closes the handle even if read() raises.
with open('test.txt', encoding='utf-8') as test_file:
    test = test_file.read()

In [None]:
def preprocessing(file):
    """Clean raw review text: drop HTML line breaks and punctuation, lowercase.

    Args:
        file: An iterable of text items (e.g. a list of lines or an open text
            file). A single ``str`` is split into its lines first, line endings
            kept, instead of being iterated character by character.

    Returns:
        A list with one cleaned string per input item. Hyphens, tabs and
        newlines are preserved; all other ASCII punctuation is removed.
    """
    if isinstance(file, str):
        # Iterating a str yields single characters, which would defeat the
        # <br> pattern below; work line by line instead.
        file = file.splitlines(keepends=True)

    # Every ASCII punctuation character except "-" is deleted. A translation
    # table avoids building a regex character class from unescaped characters
    # (the "\" and "]" in string.punctuation would otherwise be misread).
    exclude = string.punctuation.replace("-", "")
    strip_punctuation = str.maketrans("", "", exclude)

    # One or more consecutive <br>, <br/> or <br /> tags become a single space.
    # This must run before punctuation removal, which would destroy the tags.
    line_break = re.compile(r"(?:<br\s*/?>)+", re.IGNORECASE)

    text_content = []
    for data in file:
        text = line_break.sub(" ", str(data))
        text = text.translate(strip_punctuation)
        text_content.append(text.lower())
    return text_content

In [None]:
# Rebind `train` and `test` from the raw file contents to the lists returned by
# preprocessing(); later cells see the cleaned data under the same names.
train = preprocessing(train)
test = preprocessing(test)

# Word2Vec Feature Extraction

In [None]:
# `train` holds the cleaned text as a list of fragments (the output of preprocessing()),
# not a file path, so it cannot be passed to read_csv directly. Re-assemble the text
# and parse it from an in-memory buffer: one "<review>\t<label>" record per line.
df = pd.read_csv(io.StringIO("".join(train)), sep='\t', names=['review', 'label'])

In [None]:
# Cast every column to str, then split each review on whitespace:
# `sent` is a list holding one token list per review.
x = df.astype('str')
sent = list(map(str.split, x['review']))
len(sent)

In [None]:
# Bigrams:
# Use gensim's Phrases model to automatically detect common phrases (bigrams)
# in the tokenised reviews held in `sent`.
phrases = Phrases(sent, min_count=1, progress_per=100)
# Wrap the fitted Phrases model in a Phraser and apply it to the corpus;
# `sentences` is the result that the Word2Vec cells below consume.
bigram = Phraser(phrases)
sentences = bigram[sent]

In [None]:
# Set up the hyperparameters for w2v_model.
# `cores` was never defined; take it from the machine.
cores = multiprocessing.cpu_count()

# alpha is the initial learning rate and min_alpha the value it decays to, so
# alpha must be the larger of the two (they were passed the other way round,
# which would make the learning rate grow during training).
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 calls it
# `vector_size` -- confirm against the installed version.
w2v_model = Word2Vec(min_count=2,
                     window=2,
                     size=300,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.01,
                     negative=20,
                     workers=max(1, cores - 1))  # keep one core free, never 0 workers

In [None]:
# Build the vocabulary from `sentences`, train for 30 epochs, and report
# the wall-clock time of both steps together.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

In [None]:
# Persist the trained model object to the file 'w2v_model'.
# pickle.dump takes (object, open binary file); passing the strings
# 'w2v_model' and 'wb' would fail because 'wb' has no write() method.
with open('w2v_model', 'wb') as model_file:
    pickle.dump(w2v_model, model_file)

In [None]:
# Sanity-check the trained model: query the words most similar to "movie".
# As the last expression of the cell, the result is displayed as the cell output.
w2v_model.wv.most_similar(positive=["movie"])

# VectorSpace Building

In [None]:
# Persist only the trained word vectors and reload them memory-mapped (read-only).
# KeyedVectors.load expects a file written by KeyedVectors.save (i.e. model.wv.save),
# not a full Word2Vec model, so `w2v_model.wv` is saved rather than `w2v_model`.
fname = get_tmpfile("vectors.kv")
w2v_model.wv.save(fname)
word_vectors = KeyedVectors.load(fname, mmap='r')

t = time()
# Represent every review by the mean of the vectors of its in-vocabulary words.
# Rows are collected in a list and converted to a DataFrame once: DataFrame.append
# inside a loop copies the whole frame on each call and was removed in pandas 2.0.
doc_rows = []
for doc in df['review']:
    tokens = doc.split(' ') if isinstance(doc, str) else []
    known_vectors = [word_vectors[word] for word in tokens if word in word_vectors]
    if known_vectors:
        doc_rows.append(np.mean(known_vectors, axis=0))
    else:
        # No known word at all: keep the row, so docs_vectors stays aligned with df,
        # as an all-zero vector instead of a row of NaN.
        doc_rows.append(np.zeros(word_vectors.vector_size, dtype=np.float32))

docs_vectors = pd.DataFrame(doc_rows)  # one row per review, one column per dimension

print('Time to build: {} mins'.format(round((time() - t) / 60, 2)))
docs_vectors.shape

In [None]:
# Save the document vectors to disk without writing the row index.
docs_vectors.to_csv('docs_vectors.csv', index=False)

# Data Splitting (Training and Validation)

In [None]:
# docs_vectors is built from the word vectors only and has no 'label' column, so the
# targets are taken from df['label'] (docs_vectors has one row per row of df, same order).
# errors='ignore' keeps the drop harmless should a 'label' column ever be present.
features = docs_vectors.drop(columns='label', errors='ignore')
train_x, test_x, train_y, test_y = train_test_split(features, df['label'],
                                                    shuffle=True, test_size=0.2, random_state=1)


In [None]:
# `test` holds the cleaned text as a list of fragments (the output of preprocessing()),
# not a file path; re-assemble it and parse from an in-memory buffer.
df1 = pd.read_csv(io.StringIO("".join(test)), sep='\t', names=['review', 'label'])

# The classifiers are fitted on averaged word vectors, so the test reviews have to be
# mapped into the same vector space with the embeddings learned on the training set;
# raw review text cannot be passed to clf.predict.
test_word_vectors = w2v_model.wv
test_rows = []
for review in df1['review']:
    tokens = review.split(' ') if isinstance(review, str) else []
    known_vectors = [test_word_vectors[word] for word in tokens if word in test_word_vectors]
    if known_vectors:
        test_rows.append(np.mean(known_vectors, axis=0))
    else:
        # No known word: all-zero vector keeps x_test aligned with y_test.
        test_rows.append(np.zeros(test_word_vectors.vector_size, dtype=np.float32))

x_test = pd.DataFrame(test_rows)
y_test = df1['label']

# AdaBoost Model Building

In [None]:
# Fit an AdaBoost classifier on the training split of the document vectors.
adaboost_params = {
    'n_estimators': 20000,
    'learning_rate': 0.3,
    'random_state': 42,
}
clf = AdaBoostClassifier(**adaboost_params)
clf.fit(train_x, train_y)

In [None]:
# Persist the fitted AdaBoost classifier; the context manager flushes and closes
# the file deterministically instead of leaving the handle open.
with open('AdaBoost_model', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Evaluate on the held-out test set.
# NOTE: x_test has to live in the same 300-dimensional vector space as train_x.
target_names = ['class 0 (neg)', 'class 1 (pos)']
test_pred = clf.predict(x_test)
report = classification_report(y_test, test_pred, target_names=target_names)
print(report)

# RandomForest Model Building

In [None]:
# Fit a random forest on the training split and report the wall-clock time.
t = time()

# `criterion` is not passed explicitly (original note: criterion = 'gini').
forest_params = {
    'n_estimators': 20000,
    'max_features': None,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'oob_score': True,
    'n_jobs': -2,
    'bootstrap': True,
    'random_state': 42,
}
clf = RandomForestClassifier(**forest_params)
clf.fit(train_x, train_y)

elapsed_minutes = round((time() - t) / 60, 2)
print('Time to build: {} mins'.format(elapsed_minutes))

In [None]:
# Persist the fitted random forest; the context manager flushes and closes
# the file deterministically instead of leaving the handle open.
with open('RandomForest_model', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Evaluate the current classifier `clf` on the held-out test set.
target_names = ['class 0 (neg)', 'class 1 (pos)']
test_pred = clf.predict(x_test)
report = classification_report(y_test, test_pred, target_names=target_names)
print(report)