# Graded assignment 1 - text classification using Genetic Algorithms
## By Abdullah Karagøz

In this assignment we'll build a binary text classifier using genetic algorithms. We will classify IMDB movie reviews as either negative or positive. The task consists of several steps:

1. Preprocessing of the text
2. Genetic Algorithm
3. Validation



## 1. Preprocessing

In [1]:
## Imports for loading the text
import numpy as np
import pandas as pd
import nltk
import string
import re
from nltk.corpus import PlaintextCorpusReader


In [2]:
# File directories
corpus_train_pos_root = 'aclImdb/train/pos/'
corpus_train_neg_root = 'aclImdb/train/neg/'
corpus_test_pos_root = 'aclImdb/test/pos/'
corpus_test_neg_root = 'aclImdb/test/neg/'

# Corpus file objects
files_train_pos = PlaintextCorpusReader(corpus_train_pos_root, '.*')
files_train_neg = PlaintextCorpusReader(corpus_train_neg_root, '.*')
files_test_pos = PlaintextCorpusReader(corpus_test_pos_root, '.*')
files_test_neg = PlaintextCorpusReader(corpus_test_neg_root, '.*')


def _load_labelled_reviews(corpus, label):
    """Read every file of `corpus` and return (reviews, rates, labels).

    Parameters:
        corpus: an NLTK corpus reader exposing `fileids()` and `raw(fileid)`.
        label: the class label assigned to every review of this corpus.

    Returns:
        A tuple of three equally long lists:
        - reviews: the raw text of each file,
        - rates: the integer parsed from the second-to-last piece of the
          file id when split on '_' and '.' (e.g. '12_9.txt' -> 9),
        - labels: `label` repeated once per review.
    """
    fileids = corpus.fileids()
    # raw() opens, reads and closes each file, so no stream is left open
    # (the previous `corpus.open(n).read()` never closed its handle).
    reviews = [corpus.raw(fileid) for fileid in fileids]
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    rates = [int(re.split(r"_|\.", fileid)[-2]) for fileid in fileids]
    labels = [label] * len(reviews)
    return reviews, rates, labels


# Getting review texts, labels and rates all in arrays
reviews_train_pos, rates_train_pos, labels_train_pos = _load_labelled_reviews(files_train_pos, 1)
reviews_train_neg, rates_train_neg, labels_train_neg = _load_labelled_reviews(files_train_neg, -1)
reviews_test_pos, rates_test_pos, labels_test_pos = _load_labelled_reviews(files_test_pos, 1)
reviews_test_neg, rates_test_neg, labels_test_neg = _load_labelled_reviews(files_test_neg, -1)

In [3]:
# Assemble the training and testing sets as two Pandas dataframes.
# In each frame the positive reviews come first, followed by the negative ones.
train_set = pd.DataFrame({
    'review': reviews_train_pos + reviews_train_neg,
    'rate': rates_train_pos + rates_train_neg,
    'label': labels_train_pos + labels_train_neg,
})

test_set = pd.DataFrame({
    'review': reviews_test_pos + reviews_test_neg,
    'rate': rates_test_pos + rates_test_neg,
    'label': labels_test_pos + labels_test_neg,
})


In [4]:
# TODO: consider moving this class into its own file
class text_preprocessor():
    """Collection of small text-cleaning helpers used on the review texts.

    Every method takes a string and returns a new string; none of them
    modifies the instance.

    Instance attributes (set in __init__):
        stop_words: set of English stop words taken from NLTK.
        punctuation: the characters removed by remove_punctuation.
        emoji_pattern: compiled regex used by remove_emoji.
        emoticons: dict mapping an emoticon regex to its description.
    """

    # src: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    # NOTE(review): the last range (U+24C2 - U+1F251) is very wide and also
    # covers non-emoji characters such as CJK ideographs; confirm this is intended.
    EMOJI_PATTERN = re.compile("["
                               "\U0001F600-\U0001F64F"  # emoticons
                               "\U0001F300-\U0001F5FF"  # symbols & pictographs
                               "\U0001F680-\U0001F6FF"  # transport & map symbols
                               "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "\U00002702-\U000027B0"
                               "\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

    # Compiled once instead of on every remove_urls call.
    URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

    # src : https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py
    # Keys are regular expressions, hence raw strings: "\)" in a normal string
    # literal is an invalid escape sequence.
    EMOTICONS = {
        r":‑\)": "Happy face or smiley",
        r":\)": "Happy face or smiley",
        r":-\]": "Happy face or smiley",
        r":\]": "Happy face or smiley",
        r":-3": "Happy face smiley",
        r":3": "Happy face smiley",
        r":->": "Happy face smiley",
        r":>": "Happy face smiley",
        r"8-\)": "Happy face smiley",
        r":o\)": "Happy face smiley",
        r":-\}": "Happy face smiley",
        r":\}": "Happy face smiley",
        r":-\)": "Happy face smiley",
        r":c\)": "Happy face smiley",
        r":\^\)": "Happy face smiley",
        r"=\]": "Happy face smiley",
        r"=\)": "Happy face smiley"
    }

    def __init__(self):
        """Load the English stop words and set up the cleaning resources."""
        # Imported here so the class can be defined without touching NLTK.
        import nltk
        from nltk.corpus import stopwords

        # Only download the stop word corpus when it is actually missing,
        # instead of contacting the download server on every instantiation.
        try:
            english_stop_words = stopwords.words('english')
        except LookupError:
            nltk.download('stopwords')
            english_stop_words = stopwords.words('english')
        self.stop_words = set(english_stop_words)

        self.punctuation = string.punctuation
        self.emoji_pattern = self.EMOJI_PATTERN
        # Per-instance copy so edits to one instance do not leak into others.
        self.emoticons = dict(self.EMOTICONS)

    def lower_case(self, text):
        """Return `text` converted to lower case."""
        return text.lower()

    def remove_punctuation(self, text):
        """Return `text` with every character of self.punctuation deleted."""
        return text.translate(str.maketrans('', '', self.punctuation))

    def remove_stopwords(self, text):
        """Return the whitespace-separated words of `text` that are not stop words, joined by single spaces."""
        return " ".join(word for word in str(text).split() if word not in self.stop_words)

    def remove_freqwords(self, text, freq_words):
        """Return `text` without the words contained in `freq_words`, joined by single spaces."""
        return " ".join(word for word in str(text).split() if word not in freq_words)

    def remove_rarewords(self, text, rare_words):
        """Return `text` without the words contained in `rare_words`, joined by single spaces."""
        return " ".join(word for word in str(text).split() if word not in rare_words)

    def remove_emoji(self, text):
        """Return `text` with every match of self.emoji_pattern deleted."""
        return self.emoji_pattern.sub(r'', text)

    def remove_emoticons(self, text):
        """Return `text` with every emoticon listed in self.emoticons deleted."""
        # src : https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py
        # Built from self.emoticons at call time so later edits to the dict are honoured.
        emoticon_pattern = re.compile('(' + '|'.join(self.emoticons) + ')')
        return emoticon_pattern.sub(r'', text)

    def convert_emoticons(self, text):
        """Return `text` with each emoticon replaced by its description.

        The description has its commas removed and its words joined with
        underscores, e.g. ':)' becomes 'Happy_face_or_smiley'.
        """
        # src : https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py
        for emot, description in self.emoticons.items():
            replacement = "_".join(description.replace(",", "").split())
            text = re.sub('(' + emot + ')', replacement, text)
        return text

    def remove_urls(self, text):
        """Return `text` with http(s):// and www. URLs deleted."""
        return self.URL_PATTERN.sub(r'', text)


In [5]:
# preprocessing function
def preprocess_imdb_reviews(preprocessor, df, n_freq_words=10, n_rare_words=10):
    """Clean the 'review' column of `df` and return `df`.

    Steps, in order: lower-casing, URL removal, emoji removal, punctuation
    removal, stop word removal, then removal of the `n_freq_words` most
    frequent and the `n_rare_words` least frequent words (both counted on
    the cleaned reviews of `df` itself).

    NOTE: `df` is modified in place; the returned frame is the same object.
    Calling the function again on an already processed frame therefore
    removes a further set of frequent/rare words. Pass `df.copy()` to keep
    the original frame untouched.

    Parameters:
        preprocessor: object providing lower_case, remove_urls, remove_emoji,
            remove_punctuation, remove_stopwords, remove_freqwords and
            remove_rarewords.
        df: DataFrame with a 'review' column of strings.
        n_freq_words: how many of the most frequent words to drop (default 10).
        n_rare_words: how many of the least frequent words to drop (default 10).

    Returns:
        `df`, with its 'review' column replaced by the cleaned text.
    """
    from collections import Counter

    def clean(text):
        text = preprocessor.lower_case(text)
        # URLs must be stripped BEFORE punctuation: once '://' and '.' are
        # gone the URL pattern can no longer match anything.
        text = preprocessor.remove_urls(text)
        text = preprocessor.remove_emoji(text)
        text = preprocessor.remove_punctuation(text)
        # NOTE(review): stop words are matched after punctuation removal, so
        # contractions such as "don't" have already become "dont" here.
        return preprocessor.remove_stopwords(text)

    df['review'] = df['review'].apply(clean)

    # Word frequencies over the whole cleaned column.
    cnt = Counter()
    for text in df['review'].values:
        cnt.update(text.split())

    freq_words = {w for (w, wc) in cnt.most_common(n_freq_words)}
    df['review'] = df['review'].apply(lambda text: preprocessor.remove_freqwords(text, freq_words))

    # The reversed slice picks the last `n_rare_words` entries of the ranking.
    rare_words = {w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]}
    df['review'] = df['review'].apply(lambda text: preprocessor.remove_rarewords(text, rare_words))

    return df

In [6]:
preprocessor = text_preprocessor()
# preprocess_imdb_reviews rewrites the 'review' column of the frame it is given.
# Passing a copy keeps the raw train_set intact and makes this cell safe to
# re-run: otherwise each run would strip another set of frequent/rare words
# from the already processed text.
train_set_processed = preprocess_imdb_reviews(preprocessor, train_set.copy())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Inspect the processed training set (columns: review, rate, label).
train_set_processed

Unnamed: 0,review,rate,label
0,bromwell high cartoon comedy ran programs scho...,9,1
1,homelessness houselessness george carlin state...,8,1
2,brilliant overacting lesley ann warren best dr...,10,1
3,easily underrated inn brooks cannon sure flawe...,7,1
4,typical mel brooks much less slapstick movies ...,8,1
...,...,...,...
24995,towards end felt technical felt classroom watc...,4,-1
24996,kind enemies content watch bloody true watch m...,3,-1
24997,saw descent last night stockholm festival huge...,3,-1
24998,films pick pound turn rather 23rd century film...,1,-1
