In [1]:
import pandas as pd
import string
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report
from scipy.sparse import csr_matrix
from nltk.stem.porter import PorterStemmer
import nltk

In [2]:
# Load the IMDB reviews and lower-case the text up front so the later
# cleaning and vectorizing steps see a single casing.
data = pd.read_csv('IMDB Dataset.csv').assign(
    review=lambda frame: frame['review'].str.lower()
)
# Work on a reproducible 10% sample (fixed random_state).
df = data.sample(frac=0.10, random_state=42)
df.head()

Unnamed: 0,review,sentiment
33553,i really liked this summerslam due to the look...,positive
9427,not many television shows appeal to quite as m...,positive
199,the film quickly gets to a major chase scene w...,negative
12447,jane austen would definitely approve of this o...,positive
39489,expectations were somewhat high for me when i ...,negative


In [3]:
def remove_tags(string):
    """Return ``string`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept. ``.`` does not match a newline here, so a tag
    that spans several lines is left in place.

    Note: the parameter name shadows the ``string`` module, but only inside
    this function.
    """
    return re.sub('<.*?>', '', string)
# Step 1: drop <...> tags from every review.
df['clean_review'] = df['review'].apply(remove_tags)

# Step 2: replace each punctuation character with a space.
# - re.escape is required because string.punctuation contains regex
#   metacharacters; unescaped, the "\]" pair is read as an escaped bracket,
#   so backslashes were never replaced.
# - regex=True is passed explicitly: the pattern is a character class, and
#   newer pandas releases treat the pattern as a literal string by default,
#   which would make this call a silent no-op.
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
df['clean_review'] = df['clean_review'].str.replace(punctuation_pattern, ' ', regex=True)

  df['clean_review'] = df['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [4]:
# Preview the first rows to check the new clean_review column next to the raw review.
df.head()

Unnamed: 0,review,sentiment,clean_review
33553,i really liked this summerslam due to the look...,positive,i really liked this summerslam due to the look...
9427,not many television shows appeal to quite as m...,positive,not many television shows appeal to quite as m...
199,the film quickly gets to a major chase scene w...,negative,the film quickly gets to a major chase scene w...
12447,jane austen would definitely approve of this o...,positive,jane austen would definitely approve of this o...
39489,expectations were somewhat high for me when i ...,negative,expectations were somewhat high for me when i ...


In [5]:
# Hold out 20% of the sampled reviews for evaluation; the fixed random_state
# keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Translation table for str.translate: every punctuation character and digit
# maps to None, i.e. it is deleted (not replaced by a space).
trans_table = str.maketrans('', '', string.punctuation + string.digits)
# One shared Porter stemmer instance, read as a global by tokenize().
stemmer = PorterStemmer()
def tokenize(text):
    """Split ``text`` into stemmed tokens.

    Steps, in order:
    1. delete every character listed in the module-level ``trans_table``
       (punctuation and digits) via ``str.translate``;
    2. tokenize the result with ``nltk.word_tokenize``;
    3. keep only tokens longer than one character;
    4. stem each kept token with the module-level ``stemmer``.

    Returns the list of stems, in token order.
    """
    stripped = text.translate(trans_table)
    return [
        stemmer.stem(token)
        for token in nltk.word_tokenize(stripped)
        if len(token) > 1  # the length filter is applied before stemming
    ]

In [7]:
# TF-IDF features built on the custom stemming tokenizer defined above.
# min_df / max_df bound the document frequency of the terms that are kept.
# NOTE(review): stop_words='english' is combined with a tokenizer that stems;
# the built-in stop list is presumably unstemmed, so some stemmed stop words
# may slip through -- confirm against the scikit-learn documentation.
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
    stop_words='english',
    lowercase=True,
    strip_accents='unicode',
    tokenizer=tokenize,
)

In [8]:
# Learn the vocabulary and IDF weights from the training reviews only, then
# reuse that fitted vectorizer to transform the held-out reviews.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)



In [9]:
# Report the (documents, features) shape straight from the vectorizer output.
# Calling .toarray() only to read the shape materialised the whole matrix as
# a dense array, and the bare len(a[0]) expression was never displayed.
train_vectors.shape

(4000, 7021)

In [10]:
# Fit a linear-kernel SVM on the training vectors; fit() returns the
# estimator itself, so construction and fitting can be chained.
classifier_linear = svm.SVC(kernel='linear').fit(train_vectors, Y_train)
# Echo the fitted estimator as the cell output.
classifier_linear

SVC(kernel='linear')

In [11]:
# Predict sentiment labels for the held-out test vectors.
prediction_linear = classifier_linear.predict(test_vectors)

In [12]:
# output_dict=True yields a dict, so the per-class metrics can be looked up by
# label in the next cell.
report = classification_report(Y_test, prediction_linear, output_dict=True)

In [13]:
# Show the metrics recorded for each of the two sentiment labels.
print('positive: ', report['positive'])
print('negative: ', report['negative'])

positive:  {'precision': 0.8142589118198874, 'recall': 0.8785425101214575, 'f1-score': 0.8451801363193768, 'support': 494}
negative:  {'precision': 0.8715203426124197, 'recall': 0.8043478260869565, 'f1-score': 0.8365878725590956, 'support': 506}


# Another way of vectorizing: TF-IDF with scikit-learn's default tokenizer (no custom stemming tokenizer)

In [14]:
import pandas as pd
import string
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report
from scipy.sparse import csr_matrix
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords

In [15]:
# Reload the IMDB reviews from scratch for this experiment and lower-case
# the review text.
data = pd.read_csv('IMDB Dataset.csv').assign(
    review=lambda frame: frame['review'].str.lower()
)
# Same reproducible 10% sample as before (identical frac and random_state).
df = data.sample(frac=0.10, random_state=42)
df.head()

Unnamed: 0,review,sentiment
33553,i really liked this summerslam due to the look...,positive
9427,not many television shows appeal to quite as m...,positive
199,the film quickly gets to a major chase scene w...,negative
12447,jane austen would definitely approve of this o...,positive
39489,expectations were somewhat high for me when i ...,negative


In [16]:
# Bind the English stop words under the name `stopwords`, which
# remove_stopwords() reads as a global.
# The corpus reader is imported under an alias so this cell can be re-run:
# the previous `stopwords = stopwords.words(...)` replaced the imported
# reader with its own result, so a second execution raised AttributeError.
# A set is used because the only visible use is a membership test per word.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words("english"))

In [17]:
def remove_stopwords(df):
    """Add a 'review without stopwords' column to ``df`` and return ``df``.

    Each review is split on whitespace, words found in the module-level
    ``stopwords`` collection are dropped, and the remaining words are joined
    back with single spaces. The frame is modified in place; the returned
    object is the same ``df`` that was passed in.
    """
    def strip_stopwords(review):
        kept = [token for token in review.split() if token not in stopwords]
        return ' '.join(kept)

    df['review without stopwords'] = df['review'].apply(strip_stopwords)
    return df
def remove_tags(string):
    """Return ``string`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept. ``.`` does not match a newline here, so a tag
    that spans several lines is left in place.

    Note: the parameter name shadows the ``string`` module, but only inside
    this function.
    """
    return re.sub('<.*?>', '', string)
    
# remove_stopwords() adds its column to df in place and returns the same
# object, so data_without_stopwords and df refer to one frame.
data_without_stopwords = remove_stopwords(df)

# NOTE(review): clean_review is derived from the original 'review' column,
# not from 'review without stopwords', so the stop-word-filtered text is not
# used by the cells visible here -- confirm whether that is intended.
data_without_stopwords['clean_review'] = data_without_stopwords['review'].apply(remove_tags)

# Replace each punctuation character with a space.
# - re.escape is required because string.punctuation contains regex
#   metacharacters; unescaped, the "\]" pair is read as an escaped bracket,
#   so backslashes were never replaced.
# - regex=True is passed explicitly: the pattern is a character class, and
#   newer pandas releases treat the pattern as a literal string by default,
#   which would make this call a silent no-op.
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace(
    punctuation_pattern, ' ', regex=True
)
df = data_without_stopwords

  data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [18]:
# TF-IDF features using the vectorizer's own tokenization (no custom
# tokenizer and no stop_words argument are passed here).
# min_df / max_df bound the document frequency of the terms that are kept.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)

In [19]:
# Hold out 20% of the sampled reviews for evaluation; the fixed random_state
# keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [20]:
# Learn the vocabulary and IDF weights from the training reviews only, then
# reuse that fitted vectorizer to transform the held-out reviews.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [21]:
# Train a linear-kernel SVM and predict labels for the held-out vectors;
# fit() returns the estimator, so construction and fitting are chained.
classifier_linear = svm.SVC(kernel='linear').fit(train_vectors, Y_train)
prediction_linear = classifier_linear.predict(test_vectors)

# output_dict=True yields a dict, so per-class metrics can be looked up by label.
report = classification_report(
    Y_test,
    prediction_linear,
    output_dict=True,
)
print('positive: ', report['positive'])
print('negative: ', report['negative'])

positive:  {'precision': 0.8326996197718631, 'recall': 0.8866396761133604, 'f1-score': 0.8588235294117647, 'support': 494}
negative:  {'precision': 0.8818565400843882, 'recall': 0.8260869565217391, 'f1-score': 0.8530612244897959, 'support': 506}


In [22]:
import pandas as pd
import string
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report
from scipy.sparse import csr_matrix
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Reload the IMDB reviews from scratch for this experiment and lower-case
# the review text.
data = pd.read_csv('IMDB Dataset.csv').assign(
    review=lambda frame: frame['review'].str.lower()
)
# Same reproducible 10% sample as before (identical frac and random_state).
df = data.sample(frac=0.10, random_state=42)
df.head()

Unnamed: 0,review,sentiment
33553,i really liked this summerslam due to the look...,positive
9427,not many television shows appeal to quite as m...,positive
199,the film quickly gets to a major chase scene w...,negative
12447,jane austen would definitely approve of this o...,positive
39489,expectations were somewhat high for me when i ...,negative


In [24]:
# Bind the English stop words under the name `stopwords`, which
# remove_stopwords() reads as a global.
# The corpus reader is imported under an alias so this cell can be re-run:
# the previous `stopwords = stopwords.words(...)` replaced the imported
# reader with its own result, so a second execution raised AttributeError.
# A set is used because the only visible use is a membership test per word.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words("english"))

In [25]:
def remove_stopwords(df):
    """Add a 'review without stopwords' column to ``df`` and return ``df``.

    Each review is split on whitespace, words found in the module-level
    ``stopwords`` collection are dropped, and the remaining words are joined
    back with single spaces. The frame is modified in place; the returned
    object is the same ``df`` that was passed in.
    """
    def strip_stopwords(review):
        kept = [token for token in review.split() if token not in stopwords]
        return ' '.join(kept)

    df['review without stopwords'] = df['review'].apply(strip_stopwords)
    return df
def remove_tags(string):
    """Return ``string`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept. ``.`` does not match a newline here, so a tag
    that spans several lines is left in place.

    Note: the parameter name shadows the ``string`` module, but only inside
    this function.
    """
    return re.sub('<.*?>', '', string)
    
# remove_stopwords() adds its column to df in place and returns the same
# object, so data_without_stopwords and df refer to one frame.
data_without_stopwords = remove_stopwords(df)

# NOTE(review): clean_review is derived from the original 'review' column,
# not from 'review without stopwords', so the stop-word-filtered text is not
# used by the cells visible here -- confirm whether that is intended.
data_without_stopwords['clean_review'] = data_without_stopwords['review'].apply(remove_tags)

# Replace each punctuation character with a space.
# - re.escape is required because string.punctuation contains regex
#   metacharacters; unescaped, the "\]" pair is read as an escaped bracket,
#   so backslashes were never replaced.
# - regex=True is passed explicitly: the pattern is a character class, and
#   newer pandas releases treat the pattern as a literal string by default,
#   which would make this call a silent no-op.
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace(
    punctuation_pattern, ' ', regex=True
)
df = data_without_stopwords

  data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [26]:
# Hold out 20% of the sampled reviews for evaluation; the fixed random_state
# keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [27]:
# Bag-of-words counts with the vectorizer's default settings. The vocabulary
# is learned from the training reviews only and reused for the test reviews.
# The vectorizer output is passed on as-is: the TF-IDF cells earlier in this
# notebook already hand the un-densified matrices to the same SVC, so the
# former .toarray() calls only added a full dense copy of a vocabulary-wide
# matrix. NOTE(review): scores should match the dense run up to solver
# tolerance -- re-run to confirm.
vec = CountVectorizer()
train_vectors = vec.fit_transform(X_train)
test_vectors = vec.transform(X_test)

In [28]:
# Train a linear-kernel SVM and predict labels for the held-out vectors;
# fit() returns the estimator, so construction and fitting are chained.
classifier_linear = svm.SVC(kernel='linear').fit(train_vectors, Y_train)
prediction_linear = classifier_linear.predict(test_vectors)

# output_dict=True yields a dict, so per-class metrics can be looked up by label.
report = classification_report(
    Y_test,
    prediction_linear,
    output_dict=True,
)
print('positive: ', report['positive'])
print('negative: ', report['negative'])

positive:  {'precision': 0.8279158699808795, 'recall': 0.8765182186234818, 'f1-score': 0.8515240904621436, 'support': 494}
negative:  {'precision': 0.8721174004192872, 'recall': 0.8221343873517787, 'f1-score': 0.8463886063072228, 'support': 506}


In [29]:
# TF-IDF scores copied from the output of In [21], kept here for comparison with the CountVectorizer scores above:
# positive:  {'precision': 0.8326996197718631, 'recall': 0.8866396761133604, 'f1-score': 0.8588235294117647, 'support': 494}
# negative:  {'precision': 0.8818565400843882, 'recall': 0.8260869565217391, 'f1-score': 0.8530612244897959, 'support': 506}