In [None]:
!pip install wordcloud
!pip install - U spacy


In [1]:
import csv

#data manipulation
import pandas as pd
import numpy as np

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#text packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
import spacy
from bs4 import BeautifulSoup

#model preparation & selection
from sklearn.model_selection import validation_curve, train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings("ignore")


# Import data

In [2]:
text_data = pd.read_csv('movies.csv', header=0)
text_data.columns = ["name", "comment", "label"]
print(text_data.shape)
text_data = text_data.astype(str)
text_data.head()


(14050, 3)


Unnamed: 0,name,comment,label
0,juniper,Filmmakers should showcase Rampling’s indomita...,rotten
1,juniper,Savilles directorial debut is solid just like ...,fresh
2,juniper,It feels personal in a lot of the details and ...,fresh
3,juniper,We've seen this a million times -- but not wit...,fresh
4,juniper,Juniper takes some time to get where it is goi...,fresh


# Process data

In [3]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the spaCy language model
nlp = spacy.load('en_core_web_sm')


def preprocess_text(df):
    # Remove Emails
    df['comment'] = df['comment'].apply(lambda x: re.sub(
        r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x))

    # Remove urls
    df['comment'] = df['comment'].apply(lambda x: re.sub(
        r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x))

    # Removal of special chars and punctuation
    df['comment'] = df['comment'].apply(
        lambda x: re.sub('[^a-z A-Z 0-9-]+', '', x))

    # Removing multiple spaces
    df['comment'] = df['comment'].apply(lambda x: ' '.join(x.split()))

    # Remove HTML
    df['comment'] = df['comment'].apply(
        lambda x: BeautifulSoup(x, 'lxml').get_text())

    # Create a spaCy Doc object
    df['comment'] = df['comment'].apply(lambda x: ' '.join(
        [t for t in x.split() if t not in STOP_WORDS]))

    return df

df = preprocess_text(text_data)
print(df['comment'])  # Output: "example text email punctuation"


0        Filmmakers showcase Ramplings indomitable pres...
1        Savilles directorial debut solid like script g...
2        It feels personal lot details production desig...
3        Weve seen million times -- Charlotte Rampling ...
4        Juniper takes time going invested odd couple f...
                               ...                        
14045    Although Sachs sustain cinematic promise The D...
14046    This intimate gay drama complex relationship w...
14047                                                  nan
14048                                                  nan
14049    A compelling drama suffering servanthood minis...
Name: comment, Length: 14050, dtype: object


# Train SVM

In [5]:
vec = TfidfVectorizer(stop_words='english', min_df=7,
                      ngram_range=(1, 2), max_df=0.8)
X_data = vec.fit_transform(df['comment'])
df['sentiment_score'] = np.where(
    df['label'] == 'fresh', 1, 0)
Y_data = df['sentiment_score']

Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X_data, Y_data, test_size=0.15, random_state=0)

#Support Vector Machine (not cross-validating here)
clf = SVC(kernel='linear', probability=True).fit(Xtrain, Ytrain)
clf_score = clf.score(Xtest, Ytest)


# # Train the SVM classifier
# classifier = SVC(kernel='linear')
# classifier.fit(vectorized_sentences, labels)


In [6]:
print('The accuracy for the Linear SVC model is '+str(clf_score))
clf_f1score = f1_score(Ytest, clf.predict(Xtest))
print('The F1 for the Linear SVC model is '+str(clf_f1score))

The accuracy for the Linear SVC model is 0.8946869070208728
The F1 for the Linear SVC model is 0.9316081330868762


In [7]:
# Test on a new sentence
test_sentence = ["This movie is amazing"]
test_vectorized_sentence = vec.transform(test_sentence)
prediction = clf.predict(test_vectorized_sentence)
print(prediction)  # Output: [1] (positive sentiment)


[1]
