#### Making a Sentiment Analysis Web App 

In [1]:
#Preprocessing data into a more convenient format

import pyprind
import pandas as pd
import os

# change the `basepath` to the directory of the
# unzipped movie dataset

basepath = r"C:\Users\Arjvi\OneDrive\Desktop\IMDBDATA\aclImdb" # Replace with your actual folder name

# Map the class sub-directory names to the numeric sentiment labels.
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)

# Collect every review as a plain [text, label] record and build the
# DataFrame once at the end. Enlarging the frame one row at a time with
# df.loc[len(df)] = ... gets slower as the frame grows, which dominates the
# run time for tens of thousands of files.
records = []
for split_name in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split_name, label_name)
        # sorted() keeps the read order independent of the file system.
        for file_name in sorted(os.listdir(path)):
            with open(os.path.join(path, file_name),
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            records.append([txt, labels[label_name]])
            pbar.update()

# One row per review file: the raw text and its 0/1 sentiment label.
df = pd.DataFrame(records, columns=['review', 'sentiment'])

# Shuffling Data Frame

import numpy as np
import pandas as pd 

# Fix the seed so the shuffled row order is the same on every run.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# Persist the shuffled reviews as CSV, leaving the index column out.
df.to_csv('movie_data.csv', encoding='utf-8', index=False)

import pandas as pd

# Load the saved file back and preview its first three rows.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:09:35


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [11]:
# Shuffling Data Frame

import numpy as np
import pandas as pd 

# NOTE(review): this cell repeats the shuffle / save / reload steps that
# also appear at the end of the first cell; running both shuffles the data
# twice and overwrites movie_data.csv.

# Fix the seed so the permutation drawn here is reproducible.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# Write the reordered frame to CSV without the index column.
df.to_csv('movie_data.csv', encoding='utf-8', index=False)

import pandas as pd

# Read the file back in and show the first three rows.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,I finally managed to sit through a whole episo...,0
1,Just what the world needed-another superficial...,0
2,I have to say despite it's reviews Angels in t...,1


In [9]:
# Cleaning the data & Processing into tokens & Adding stopwords

import nltk as nltk


import re #CLEAN
def preprocessor(text):
    """Strip HTML markup and punctuation from ``text`` while keeping emoticons.

    Tags of the form ``<...>`` are removed first. Emoticons (eyes ``:``,
    ``;`` or ``=``, an optional ``-`` nose, and a mouth ``)``, ``(``, ``D``
    or ``P``) are then collected from the tag-free text. Finally the text is
    lower-cased, every run of non-word characters is collapsed to a single
    space, and the collected emoticons are appended with the nose removed.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The cleaned, lower-cased text followed by the emoticons joined by
        single spaces.
    """
    # Raw string literals keep the regex escapes (\), \(, \W) intact without
    # relying on Python passing unknown string escapes through unchanged,
    # which is deprecated and produces a warning on recent interpreters.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # NOTE(review): the emoticons are concatenated without a separator, so
    # the first one is only a separate token when the cleaned text ends in a
    # space (i.e. the input ended in a non-word character).
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

from nltk.stem.porter import PorterStemmer #Token 


# Single Porter stemmer instance, reused by tokenizer_porter() for every word.
porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    # NOTE: a later cell defines another ``tokenizer`` that replaces this one.
    tokens = text.split()
    return tokens


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

from nltk.corpus import stopwords # Stopwords 

# English stop-word list taken from the NLTK stopwords corpus.
# NOTE(review): presumably the corpus must already be downloaded locally;
# no download step is visible in this notebook — confirm.
stop = stopwords.words('english')



In [27]:
# Building Tokenizer, Stream and Minibatch functions 

import numpy as np 
import re
from nltk.corpus import stopwords

# English stop-word list; tokenizer() below filters against it and a later
# cell pickles it to stopwords.pkl.
stop = stopwords.words('english')

def tokenizer(text):
    """Clean ``text`` and return its tokens with stop words removed.

    Tags of the form ``<...>`` are stripped, emoticons are collected from
    the tag-free text, the text is lower-cased with every run of non-word
    characters collapsed to one space, and the emoticons (minus any ``-``
    nose) are appended. The result is split on whitespace and tokens found
    in the module-level ``stop`` list are dropped.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    # Raw string literals keep the regex escapes (\), \(, \W) intact without
    # relying on Python passing unknown string escapes through unchanged,
    # which is deprecated and produces a warning on recent interpreters.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # NOTE(review): the emoticons are appended without a separator, so the
    # first one fuses with the last word unless the cleaned text ends in a
    # space. Left unchanged so tokenization stays identical to what the
    # serialized classifier was trained on.
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path): # Reads in and returns one document at a time
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The first line is treated as a header and skipped. Every following
    non-empty line is split at its last comma: everything before it is
    yielded unchanged as the text (CSV quoting is not undone), and the part
    after it is converted to ``int`` as the label.

    Parameters
    ----------
    path : str
        Path of a UTF-8 encoded CSV file whose last column is the integer
        label and that holds one record per line.

    Yields
    ------
    tuple of (str, int)
        The raw text portion of the line and its label.

    Raises
    ------
    ValueError
        If the part after the last comma is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # A default stops an empty file from raising StopIteration inside
        # the generator, which Python would turn into a RuntimeError.
        next(csv_file, None)  # skip header
        for line in csv_file:
            # Strip the line terminator explicitly rather than assuming a
            # fixed-width ",<digit><newline>" tail, so a final line without
            # a trailing newline is parsed correctly as well.
            line = line.rstrip('\r\n')
            if not line:
                continue
            text, _, label = line.rpartition(',')
            yield text, int(label)

# Smoke check: pull the first (text, label) pair from the CSV. The value is
# not stored, and it is not the last expression of the cell.
next(stream_docs(path='movie_data.csv'))

def get_minibatch(doc_stream, size): # Take a particular number of docs from the stream
    """Collect up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)``: two parallel lists of texts and labels. The lists are
        shorter than ``size`` when the stream runs out part-way through.
        ``(None, None)`` is returned only when the stream was already
        exhausted and no document could be read.
    """
    docs, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)
    # Signal "no more data" only when nothing was collected, so a short
    # final batch is handed back instead of being silently thrown away.
    if exhausted and not docs:
        return None, None
    return docs, y




In [37]:
# Hashing Vectorizer & SGD Classifier

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


# Hash the tokens produced by tokenizer() into a 2**21-dimensional feature
# space; undecodable bytes in the input are ignored.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# distutils was removed from the standard library in Python 3.12. The import
# is guarded so this cell still runs there; ``Version`` is not referenced
# anywhere else in the visible notebook.
try:
    from distutils.version import LooseVersion as Version
except ImportError:
    Version = None
from sklearn import __version__ as sklearn_version

# Linear classifier trained by SGD with the 'log_loss' loss; the fixed
# random_state keeps runs reproducible.
clf = SGDClassifier(loss='log_loss', random_state=1)


# Fresh document generator consumed by the training and evaluation cells.
doc_stream = stream_docs(path='movie_data.csv')

In [38]:
# Out of Core Learning

import pyprind
# Train incrementally on 45 mini-batches of 1000 documents each.
n_batches = 45
pbar = pyprind.ProgBar(n_batches)

# partial_fit is told the full set of class labels on every call.
classes = np.array([0, 1])
for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        # The stream is exhausted; stop before the planned batch count.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:20


In [40]:
#Testing and updating with the test data

# Take the next 5000 documents from the stream as the evaluation set.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
accuracy = clf.score(X_test, y_test)
print('Accuracy: %.3f' % accuracy)

# Fold the evaluated documents into the model as one more update step.
clf = clf.partial_fit(X_test, y_test) #model update

Accuracy: 0.880


In [41]:
#Serializing the Estimator

import pickle 
import os

# Target directory for the pickled objects.
dest = os.path.join('movieclassifier', 'pkl_objects')

# exist_ok=True creates the directory tree when missing and is a no-op when
# it already exists, so the cell can be re-run safely.
os.makedirs(dest, exist_ok=True)

# Context managers close each file as soon as the dump finishes, so the
# data is flushed to disk and no file handle is left open.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)