# Preparing Data for Text Processing

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
import pyprind

basepath = './aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)

# Collect every [review text, sentiment] pair in a plain list and build the
# DataFrame once at the end. Growing a DataFrame with DataFrame.append copied
# the whole frame on each call, and the method was removed in pandas 2.0.
rows = []
for split in ('test', 'train'):
    for label in ('pos', 'neg'):
        path = os.path.join(basepath, split, label)
        # sorted() makes the read order independent of the file system
        for fname in sorted(os.listdir(path)):
            with open(os.path.join(path, fname), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[label]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

In [None]:
# Seed NumPy's global RNG so the shuffle below is reproducible
np.random.seed(0)

# Draw a random ordering of the index labels, reorder the rows accordingly,
# then persist the shuffled frame as CSV (without the index column)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [None]:
# Reload the shuffled dataset from the CSV written earlier
df = pd.read_csv('movie_data.csv', encoding='utf-8')
# Preview the first rows
df.head()

In [None]:
# Dimensions of the reloaded DataFrame as (rows, columns)
df.shape

# Bag-of-Words Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of three short documents
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, '
    'and one and one is two'])

# Learn the vocabulary and build the document-term count matrix in one step
count = CountVectorizer()
bag = count.fit_transform(docs)

In [None]:
# Show the learned mapping from each term to its column index
print(count.vocabulary_)

In [None]:
# Print the document-term counts as a dense array
print(bag.toarray())

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Show two decimals when printing arrays
np.set_printoptions(precision=2)

# Re-weight the raw term counts of the toy corpus into L2-normalized tf-idfs
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
term_counts = count.fit_transform(docs)
print(tfidf.fit_transform(term_counts).toarray())

In [None]:
# Peek at the last 50 characters of the review stored under index label 0
df.loc[0, 'review'][-50:]

In [None]:
import re

def preprocessor(text):
    """Remove HTML markup and move emoticons to the end of the string.

    The text is lower-cased and every run of non-word characters is collapsed
    to a single space. Emoticons such as ``:)``, ``;(``, ``=D`` or ``:-P`` are
    collected from the original-case text, stripped of their ``-`` nose, and
    appended space-separated after the cleaned words.

    Parameters
    ----------
    text : str
        Raw document, possibly containing HTML tags.

    Returns
    -------
    str
        Cleaned text followed by the emoticons that were found.
    """
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    faces = ' '.join(emoticons).replace('-', '')
    # The trailing letter of a final ':D' / '=P' survives the clean-up as a
    # word character, so add a separator when the cleaned text does not
    # already end in a space; otherwise the first emoticon would be glued
    # onto the last word.
    if faces and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + faces

In [None]:
# Try the preprocessor on the tail of the review under index label 0 ...
print(preprocessor(df.loc[0, 'review'][-50:]))
# ... and on a small hand-written sample with markup and emoticons
print(preprocessor("</a>This :) is :( a test :-)!"))

In [None]:
# Clean every review by running it through preprocessor()
df['review'] = df['review'].apply(preprocessor)

In [None]:
def tokenizer(text):
    """Tokenize a document on whitespace and return the list of words."""
    words = text.split()
    return words

# Quick demonstration on a sample sentence
tokenizer('runners like running and thus they run')

In [None]:
from nltk.stem.porter import PorterStemmer

# One shared stemmer instance reused by every call
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split text on whitespace and return the Porter stem of each word."""
    return list(map(porter.stem, text.split()))

# Quick demonstration on a sample sentence
tokenizer_porter('runners like running and thus they run')

In [None]:
import nltk

# Download NLTK's 'stopwords' corpus so it can be loaded below
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# Load NLTK's English stop-word list
stop = stopwords.words('english')

# Stem a sample sentence, then keep only the tokens that are not stop words
stemmed = tokenizer_porter('a runner likes running and runs a lot')[-10:]
[w for w in stemmed if w not in stop]

# Logistic Regression for Document Classification

In [None]:
# Split the dataset into training (first 25,000 rows) and test (remaining
# rows) features and targets.
# Positional slicing keeps the two sets disjoint: label-based
# `df.loc[:25000]` includes its end label, which placed row 25000 in both
# the training and the test set.
n_train = 25000
X_train = df['review'].values[:n_train]
y_train = df['sentiment'].values[:n_train]
X_test = df['review'].values[n_train:]
y_test = df['sentiment'].values[n_train:]

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Vectorizer with its own accent stripping, lower-casing and preprocessing
# switched off
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

# Learn vocabulary and idf weights from the training reviews only, then
# project the test reviews into the same feature space
tfidf_X_train = tfidf.fit_transform(X_train)
tfidf_X_test = tfidf.transform(X_test)

# Fit logistic regression on the tf-idf training features
clf = LogisticRegression(solver='lbfgs', random_state=0)
clf.fit(tfidf_X_train, y_train)
print(tfidf_X_test.shape)

In [None]:
# Report the accuracy of the classifier fitted on the training data, measured
# on the held-out test set. cross_val_score clones and re-fits the estimator
# on folds of whatever data it is given, so calling it with the test set
# never evaluated the model trained above.
acc = clf.score(tfidf_X_test, y_test)
print(f"Accuracy: {(acc *100):.2f} percent")

# Working with Big Data

In [None]:
def tokenizer(text):
    """Clean a raw review and split it into tokens with stop words removed.

    HTML tags are stripped, emoticons such as ``:)`` or ``:-D`` are collected,
    the remaining text is lower-cased with runs of non-word characters
    collapsed to single spaces, and the emoticons (minus any ``-`` nose) are
    appended as separate tokens. Tokens contained in the module-level ``stop``
    collection are dropped.

    Parameters
    ----------
    text : str
        Raw document, possibly containing HTML tags.

    Returns
    -------
    list of str
        The remaining tokens in order of appearance, emoticons last.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern matches upper-case 'D' and
    # 'P', so lower-casing first would hide ':D' and ':P'.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Explicit separator so the first emoticon is never glued onto the last
    # word; surplus whitespace is discarded by split() below.
    text = (re.sub(r'[\W]+', ' ', text.lower())
            + ' '
            + ' '.join(emoticons).replace('-', ''))
    return [w for w in text.split() if w not in stop]

In [None]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per line.

    The first line is treated as a header and skipped. On every other line
    the text after the last comma is parsed as the integer label and
    everything before it is returned unchanged as the document text (CSV
    quoting is not decoded).

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        Document text and its label.
    """
    with open(path, 'r', encoding='utf-8') as infile:
        # Default of None keeps an empty file from raising inside the generator
        next(infile, None)  # skip header
        for line in infile:
            # Strip the newline instead of slicing at fixed offsets so a final
            # line without a trailing newline is parsed correctly too
            line = line.rstrip('\n')
            if not line:
                continue
            text, _, label = line.rpartition(',')
            yield text, int(label)

In [None]:
# Pull the first (text, label) pair to check the generator's output
next(stream_docs(path='movie_data.csv'))

In [None]:
def get_minibatch(doc_stream, size):
    """Take up to ``size`` documents from an iterator of (text, label) pairs.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` tuples.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, labels)`` as two parallel lists. When the stream ends before
        ``size`` documents were read, the documents collected so far are
        returned as a shorter batch instead of being discarded.
        ``(None, None)`` is returned only when the stream was already
        exhausted and nothing could be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                return None, None
            break
        docs.append(text)
        y.append(label)
    return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Initialize HashingVectorizer
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# Initialize SGDClassifier with the logistic loss. 'log_loss' is the current
# name of that loss; the former alias 'log' was deprecated in scikit-learn 1.1
# and removed in 1.3.
clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)
# Open the generator that streams (text, label) pairs from the CSV file
doc_stream = stream_docs(path='movie_data.csv')

In [None]:
# Class labels have to be passed to partial_fit explicitly
classes = np.array([0, 1])
n_batches = 45

# Progress indicator with one tick per mini-batch
pbar = pyprind.ProgBar(n_batches)

for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        # No batch was returned, so stop training early
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
# Take the next 5000 documents from the stream as the evaluation batch
X_test, y_test = get_minibatch(doc_stream, size=5000)
# Hash the raw texts into the same feature space used during training
X_test = vect.transform(X_test)
print(f'Accuracy: {clf.score(X_test, y_test):.3f}')

In [None]:
# Update the classifier with the evaluation batch as well
clf = clf.partial_fit(X_test, y_test)
clf

# Serializing Fitted Estimators

In [None]:
import pickle
import os

dest = os.path.join('movieclassifier', 'pkl_objects')
os.makedirs(dest, exist_ok=True)

# Write through context managers so each file is flushed and closed as soon
# as its object has been serialized, instead of leaving the handle open.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as outfile:
    pickle.dump(stop, outfile, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as outfile:
    pickle.dump(clf, outfile, protocol=4)

# Topic Modeling with Latent Dirichlet Allocation

In [None]:
# Bag-of-words counts for topic modeling: drop English stop words, ignore
# terms that occur in more than 10% of the documents, and cap the vocabulary
# at 2000 features
reviews = df['review'].values
count = CountVectorizer(
    stop_words='english',
    max_df=.1,
    max_features=2000,
)
X = count.fit_transform(reviews)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Configure a 10-component LDA model using the batch learning method
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=123,
    learning_method='batch',
)
# Fit on the count matrix and keep the transformed per-document output
X_topics = lda.fit_transform(X)

In [None]:
# Shape of the fitted components matrix
lda.components_.shape

In [None]:
n_top_words = 5
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
feature_names = count.get_feature_names_out()

# For every topic, print the n_top_words terms with the largest weights,
# highest first
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {(topic_idx + 1)}')
    top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join(feature_names[i] for i in top_word_ids))

In [None]:
# Rank the reviews by their value in column 4 of the topic matrix,
# largest first
topic_weights = X_topics[:, 4]
horror = topic_weights.argsort()[::-1]

# Print the first 300 characters of the three top-ranked reviews
for iter_idx, movie_idx in enumerate(horror[:3]):
    print(f'\nHorror movie {(iter_idx + 1)}')
    excerpt = df['review'][movie_idx][:300]
    print(excerpt, '...')