In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import glob
from sklearn.utils import shuffle
import spacy
from sklearn.model_selection import train_test_split

In [9]:
# Load spaCy's large English pipeline once for the whole notebook. The tagger,
# parser and NER components are disabled; preprocess() below only reads
# token-level attributes (lemma_, text, is_alpha, is_stop).
# NOTE(review): lemma_ may depend on the disabled tagger in some spaCy
# versions -- confirm the lemmas produced are the ones expected.
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser", "ner"])

def preprocess(text, prepr):
    """Tokenise ``text`` with the module-level ``nlp`` pipeline and normalise it.

    Parameters
    ----------
    text : str
        Raw text to process.
    prepr : str
        Preprocessing mode:
        ``"lemma"`` keeps the lower-cased lemma of every alphabetic token that
        is not a stop word; ``"alpha"`` keeps the lower-cased surface form of
        every alphabetic token.

    Returns
    -------
    str
        The kept tokens joined by single spaces.

    Raises
    ------
    ValueError
        If ``prepr`` is not one of the supported modes.
    """
    # Validate before running the pipeline: an unknown mode used to fall
    # through both branches and die with UnboundLocalError on `processed`,
    # after the tokenisation work had already been done.
    if prepr not in ("lemma", "alpha"):
        raise ValueError(
            "Unknown preprocessing mode {!r}; expected 'lemma' or 'alpha'".format(prepr)
        )

    doc = nlp(text)
    if prepr == "lemma":
        processed = [term.lemma_.lower() for term in doc if term.is_alpha and not term.is_stop]
    else:  # prepr == "alpha"
        processed = [term.text.lower() for term in doc if term.is_alpha]
    return ' '.join(processed)

In [3]:
def read_files(data_dict, files, label, prepr = "no"):
    """Read review files and append one row per file to ``data_dict``.

    Parameters
    ----------
    data_dict : dict
        Column store with the list-valued keys ``"file"``, ``"rating"``,
        ``"text"`` and ``"label"``; it is extended in place.
    files : iterable of str
        Paths of the files to read (decoded as UTF-8).
    label : object
        Value stored in the ``"label"`` column for every file in ``files``.
    prepr : str, optional
        ``"no"`` (default) stores the raw file content; any other value is
        forwarded to ``preprocess`` together with the content.

    Notes
    -----
    The rating is taken from the file name: the text between the last ``_``
    and the first following ``.`` (``".../10867_1.txt"`` -> ``"1"``). It is
    stored as a string.
    """
    for file_name in files:
        rating = file_name.split('_')[-1].split('.')[0]

        with open(file_name, "r", encoding="utf8") as file_open:
            text = file_open.read()
        if prepr != "no":
            text = preprocess(text, prepr)

        # Append all four columns together, only after every step that can
        # fail has succeeded; otherwise a decode/preprocess error would leave
        # the lists with different lengths and break pd.DataFrame(data_dict).
        data_dict["file"].append(file_name)
        data_dict["rating"].append(rating)
        data_dict["text"].append(text)
        data_dict["label"].append(label)

In [17]:
# No preprocessing: pool every review (train + test folders) with its raw
# text, then write reproducible train/validation/test CSVs.
labels = [1, 0]  # labels[0] is used for the pos folders, labels[1] for the neg folders

train_dict = {"file":[], "rating":[], "text":[], "label":[]}

pos_path = "aclImdb/train/pos"
neg_path = "aclImdb/train/neg"
pos_path_te = "aclImdb/test/pos"
neg_path_te = "aclImdb/test/neg"

# glob returns paths in arbitrary (OS-dependent) order; sorting makes the row
# order, and therefore the seeded shuffle below, identical on every run.
train_pos = sorted(glob.glob(pos_path + '/*.txt'))
train_neg = sorted(glob.glob(neg_path + '/*.txt'))
test_pos = sorted(glob.glob(pos_path_te + '/*.txt'))
test_neg = sorted(glob.glob(neg_path_te + '/*.txt'))

for files, label in (
    (train_pos, labels[0]),
    (train_neg, labels[1]),
    (test_pos, labels[0]),
    (test_neg, labels[1]),
):
    read_files(train_dict, files, label, "no")

# Despite its name, train_data holds the pooled train + test reviews.
train_data = pd.DataFrame(train_dict)
# Seed the shuffle: without random_state the splits written below differed on
# every run even though train_test_split itself is seeded.
train_data = train_data.sample(frac=1, random_state=1)

# 70% train, then the remaining 30% is halved into validation and test.
train_data_no, val_data_no = train_test_split(train_data, test_size=0.3, random_state=1)
val_data_no, test_data_no = train_test_split(val_data_no, test_size=0.5, random_state=1)

train_data_no.to_csv('train_data_no.csv')
test_data_no.to_csv('test_data_no.csv')
val_data_no.to_csv('val_data_no.csv')

train_data.head()

Unnamed: 0,file,rating,text,label
13462,aclImdb/train/neg\10867_1.txt,1,film make thing well act write direct bad leav...,0
14207,aclImdb/train/neg\11537_4.txt,4,follow film progress time expect little consid...,0
17419,aclImdb/train/neg\3178_4.txt,4,movie like know go usual joke concern ghost ev...,0
7224,aclImdb/train/pos\5252_9.txt,9,hear movie suppose funny thing see yes funny m...,1
23856,aclImdb/train/neg\8972_3.txt,3,br main question pose concern film film cole p...,0


In [12]:
# Only alphabetical words preprocessing: pool the reviews (train + test
# folders), keep alphabetic tokens only, then write reproducible
# train/validation/test CSVs.

# NOTE(review): only the first 5 files of each folder are processed here,
# unlike the other preprocessing variants -- this looks like a leftover
# debugging limit. Set to None to process the full dataset.
ALPHA_FILES_PER_DIR = 5

train_dict = {"file":[], "rating":[], "text":[], "label":[]}

pos_path = "aclImdb/train/pos"
neg_path = "aclImdb/train/neg"
pos_path_te = "aclImdb/test/pos"
neg_path_te = "aclImdb/test/neg"

# glob returns paths in arbitrary (OS-dependent) order; sorting makes both the
# "first N files" selection and the seeded shuffle below stable across runs.
train_pos = sorted(glob.glob(pos_path + '/*.txt'))
train_neg = sorted(glob.glob(neg_path + '/*.txt'))
test_pos = sorted(glob.glob(pos_path_te + '/*.txt'))
test_neg = sorted(glob.glob(neg_path_te + '/*.txt'))

# `labels` is defined in the "No preprocessing" cell: labels[0] is used for
# the pos folders, labels[1] for the neg folders.
for files, label in (
    (train_pos, labels[0]),
    (train_neg, labels[1]),
    (test_pos, labels[0]),
    (test_neg, labels[1]),
):
    read_files(train_dict, files[:ALPHA_FILES_PER_DIR], label, "alpha")

# Despite its name, train_data holds the pooled train + test reviews.
train_data = pd.DataFrame(train_dict)
# Seed the shuffle: without random_state the splits written below differed on
# every run even though train_test_split itself is seeded.
train_data = train_data.sample(frac=1, random_state=1)

# 70% train, then the remaining 30% is halved into validation and test.
train_data_alpha, val_data_alpha = train_test_split(train_data, test_size=0.3, random_state=1)
val_data_alpha, test_data_alpha = train_test_split(val_data_alpha, test_size=0.5, random_state=1)

train_data_alpha.to_csv('train_data_alpha.csv')
test_data_alpha.to_csv('test_data_alpha.csv')
val_data_alpha.to_csv('val_data_alpha.csv')

train_data.head()

Unnamed: 0,file,rating,text,label
15,aclImdb/test/neg\0_2.txt,2,once again costner has dragged out a movie for...,0
6,aclImdb/train/neg\10000_4.txt,4,airport starts as a brand new luxury plane is ...,0
2,aclImdb/train/pos\10001_10.txt,10,brilliant over acting by lesley ann warren bes...,1
13,aclImdb/test/pos\10002_8.txt,8,i saw this film in a sneak preview and it is d...,1
5,aclImdb/train/neg\0_3.txt,3,story of a man who has unnatural feelings for ...,0


In [None]:
# Remove stopwords and lemmatize preprocessing: pool every review (train +
# test folders), keep lemmas of non-stop alphabetic tokens, then write
# reproducible train/validation/test CSVs.

train_dict = {"file":[], "rating":[], "text":[], "label":[]}

pos_path = "aclImdb/train/pos"
neg_path = "aclImdb/train/neg"
pos_path_te = "aclImdb/test/pos"
neg_path_te = "aclImdb/test/neg"

# glob returns paths in arbitrary (OS-dependent) order; sorting makes the row
# order, and therefore the seeded shuffle below, identical on every run.
train_pos = sorted(glob.glob(pos_path + '/*.txt'))
train_neg = sorted(glob.glob(neg_path + '/*.txt'))
test_pos = sorted(glob.glob(pos_path_te + '/*.txt'))
test_neg = sorted(glob.glob(neg_path_te + '/*.txt'))

# `labels` is defined in the "No preprocessing" cell: labels[0] is used for
# the pos folders, labels[1] for the neg folders.
for files, label in (
    (train_pos, labels[0]),
    (train_neg, labels[1]),
    (test_pos, labels[0]),
    (test_neg, labels[1]),
):
    read_files(train_dict, files, label, "lemma")

# Despite its name, train_data holds the pooled train + test reviews.
train_data = pd.DataFrame(train_dict)
# Seed the shuffle: without random_state the splits written below differed on
# every run even though train_test_split itself is seeded.
train_data = train_data.sample(frac=1, random_state=1)

# 70% train, then the remaining 30% is halved into validation and test.
train_data_lemma, val_data_lemma = train_test_split(train_data, test_size=0.3, random_state=1)
val_data_lemma, test_data_lemma = train_test_split(val_data_lemma, test_size=0.5, random_state=1)

train_data_lemma.to_csv('train_data_lemma.csv')
test_data_lemma.to_csv('test_data_lemma.csv')
val_data_lemma.to_csv('val_data_lemma.csv')

train_data.head()