# Model Creation

Import the necessary libraries

In [5]:
import urllib.request
import os
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

Reading the .csv

In [6]:
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


Cleaning the text data

In [7]:
# init Objects
tokenizer=RegexpTokenizer(r'\w+')
en_stopwords=set(stopwords.words('english'))
ps=PorterStemmer()
def getStemmedReview(review):
    review=review.lower()
    review=review.replace("<br /><br />"," ")
    #Tokenize
    tokens=tokenizer.tokenize(review)
    new_tokens=[token for token in tokens if token not in  en_stopwords]
    stemmed_tokens=[ps.stem(token) for token in new_tokens]
    clean_review=' '.join(stemmed_tokens)
    return clean_review

Cleaning all the reviews and splitting our data for training and testing

In [8]:
df['review'].apply(getStemmedReview)
X_train = df.loc[:35000, 'review'].values
y_train = df.loc[:35000, 'sentiment'].values
X_test = df.loc[35000:, 'review'].values
y_test = df.loc[35000:, 'sentiment'].values

Transforming words into feature vectors

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8', decode_error='ignore')
vectorizer.fit(X_train)
X_train=vectorizer.transform(X_train)
X_test=vectorizer.transform(X_test)

Creating the model and checking the score on training and test data

In [10]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(solver='liblinear')
model.fit(X_train,y_train)
print("Score on training data is: "+str(model.score(X_train,y_train)))
print("Score on testing data is: "+str(model.score(X_test,y_test)))

Score on training data is: 0.935973257906917
Score on testing data is: 0.8976666666666666


In [11]:
df.iloc[35000,0]

"If you haven't seen the gong show TV series then you won't like this movie much at all, not that knowing the series makes this a great movie. <br /><br />I give it a 5 out of 10 because a few things make it kind of amusing that help make up for its obvious problems.<br /><br />1) It's a funny snapshot of the era it was made in, the late 1970's and early 1980's. 2) You get a lot of funny cameos of people you've seen on the show. 3) It's interesting to see Chuck (the host) when he isn't doing his on air TV personality. 4) You get to see a lot of bizarre people doing all sorts of weirdness just like you see on the TV show.<br /><br />I won't list all the bad things because there's a lot of them, but here's a few of the most prominent.<br /><br />1) The Gong Show Movie has a lot of the actual TV show clips which gets tired at movie length. 2) The movie's story line outside of the clip segments is very weak and basically is made up of just one plot point. 3) Chuck is actually halfway decen

In [12]:
X_test[0]

<1x87888 sparse matrix of type '<class 'numpy.float64'>'
	with 128 stored elements in Compressed Sparse Row format>

# Model deployment and creating a Web-App

Serializing fitted scikit-learn estimators

In [13]:
import joblib as jb
jb.dump(en_stopwords,'./movieclassifier/pkl_objects/stopwords.pkl') 
jb.dump(model,'./movieclassifier/pkl_objects/model.pkl')
jb.dump(vectorizer,'./movieclassifier/pkl_objects/vectorizer.pkl')

['./movieclassifier/pkl_objects/vectorizer.pkl']

In [14]:
from flask import Flask
app = Flask(__name__)
@app.route("/")
def hello():
    return "Hello World!"
if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [20/Mar/2023 19:41:51] "GET / HTTP/1.1" 200 -


In [17]:
from flask import Flask, render_template, request
from wtforms import Form, TextAreaField, validators
import pickle
import sqlite3
import os
import numpy as np
from sklearn.externals import joblib
loaded_model=joblib.load("./pkl_objects/model.pkl")
loaded_stop=joblib.load("./pkl_objects/stopwords.pkl")
loaded_vec=joblib.load("./pkl_objects/vectorizer.pkl")
app = Flask(__name__)
def classify(document):
    label = {0: 'negative', 1: 'positive'}
    X = loaded_vec.transform([document])
    y = loaded_model.predict(X)[0]
    proba = np.max(loaded_model.predict_proba(X))
    return label[y], proba
class ReviewForm(Form):
    moviereview = TextAreaField('',[validators.DataRequired(),validators.length(min=15)])
@app.route('/')
def index():
    form = ReviewForm(request.form)
    return render_template('reviewform.html', form=form)
@app.route('/results', methods=['POST'])
def results():
    form = ReviewForm(request.form)
    if request.method == 'POST' and form.validate():
        review = request.form['moviereview']
    y, proba = classify(review)
    return render_template('results.html',content=review,prediction=y,probability=round(proba*100, 2))
    return render_template('reviewform.html', form=form)
if __name__ == '__main__':
    app.run()

ImportError: cannot import name 'joblib' from 'sklearn.externals' (/Users/akilagamage/opt/anaconda3/lib/python3.9/site-packages/sklearn/externals/__init__.py)