In [None]:
!pip install wordcloud
!pip install - U spacy


In [1]:
import csv

#data manipulation
import pandas as pd
import numpy as np

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#text packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
import spacy
from bs4 import BeautifulSoup

#model preparation & selection
from sklearn.model_selection import validation_curve, train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings("ignore")


# Import data

In [5]:
text_data = pd.read_csv('movies.csv', header=0)
text_data.columns = ["name", "comment", "label"]
print(text_data.shape)
text_data = text_data.astype(str)
text_data.head()


(21021, 3)


Unnamed: 0,name,comment,label
0,chrissy_judy,Theres something quite refreshing about Chriss...,fresh
1,chrissy_judy,Chrissy Judy is a story about young adult cris...,fresh
2,chrissy_judy,Flaherty’s writing and performance make no pre...,fresh
3,chrissy_judy,A torch song of a film.,fresh
4,chrissy_judy,Refreshingly witty honest tender and funny,fresh


# Process data

In [6]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the spaCy language model
nlp = spacy.load('en_core_web_sm')


def preprocess_text(df):
    # Remove Emails
    df['comment'] = df['comment'].apply(lambda x: re.sub(
        r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x))

    # Remove urls
    df['comment'] = df['comment'].apply(lambda x: re.sub(
        r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x))

    # Removal of special chars and punctuation
    df['comment'] = df['comment'].apply(
        lambda x: re.sub('[^a-z A-Z 0-9-]+', '', x))

    # Removing multiple spaces
    df['comment'] = df['comment'].apply(lambda x: ' '.join(x.split()))

    # Remove HTML
    df['comment'] = df['comment'].apply(
        lambda x: BeautifulSoup(x, 'lxml').get_text())

    # Create a spaCy Doc object
    df['comment'] = df['comment'].apply(lambda x: ' '.join(
        [t for t in x.split() if t not in STOP_WORDS]))

    return df

df = preprocess_text(text_data)
print(df['comment'])  # Output: "example text email punctuation"


0        Theres refreshing Chrissy Judy arriving allowi...
1        Chrissy Judy story young adult crisis finding ...
2        Flahertys writing performance pretense positiv...
3                                        A torch song film
4                   Refreshingly witty honest tender funny
                               ...                        
21016    This intimate gay drama complex relationship w...
21017                                                  nan
21018                                                  nan
21019    A compelling drama suffering servanthood minis...
21020                                                  nan
Name: comment, Length: 21021, dtype: object


# Train SVM

In [7]:
vec = TfidfVectorizer(stop_words='english', min_df=7,
                      ngram_range=(1, 2), max_df=0.8)
X_data = vec.fit_transform(df['comment'])
df['sentiment_score'] = np.where(
    df['label'] == 'fresh', 1, 0)
Y_data = df['sentiment_score']

Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X_data, Y_data, test_size=0.15, random_state=0)

#Support Vector Machine (not cross-validating here)
clf = SVC(kernel='rbf', probability=True).fit(Xtrain, Ytrain)
clf_score = clf.score(Xtest, Ytest)


# # Train the SVM classifier
# classifier = SVC(kernel='linear')
# classifier.fit(vectorized_sentences, labels)


In [9]:
print('The accuracy for the kernel SVC model is '+str(clf_score))
clf_f1score = f1_score(Ytest, clf.predict(Xtest))
print('The F1 for the kernel SVC model is '+str(clf_f1score))


The accuracy for the kernel SVC model is 0.9540266328471781
The F1 for the kernel SVC model is 0.9689972204404532


In [11]:
import joblib
joblib.dump(clf,"%s.pkl"%"movies")
joblib.dump(vec, "%s.pkl" % "movies_vec")


['movies_vec.pkl']

In [13]:
estimator = joblib.load("%s.pkl"%"movies")
pkl_vec = joblib.load("%s.pkl" % "movies_vec")
# Test on a new sentence


def test(clf, vec, text):
    test_vectorized_sentence = vec.transform([text])
    prediction = clf.predict(test_vectorized_sentence)
    # Output: [1] (positive sentiment)
    print(prediction, clf.predict_proba(
        test_vectorized_sentence)[0][prediction[0]])


test(estimator, pkl_vec, "This movie is shit")


[0] 0.9939244428334526
