In [1]:
import pandas as pd
import string
import re
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from nltk.stem.porter import PorterStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the IMDB reviews and lowercase the review text in one step.
data = pd.read_csv('IMDB Dataset.csv').assign(
    review=lambda frame: frame['review'].str.lower()
)
# Draw a reproducible 10% sample (fixed seed) and preview its last rows.
df = data.sample(frac=0.10, random_state=42)
df.tail()

Unnamed: 0,review,sentiment
39885,one of eastwood's best movies after he had sep...,positive
17566,my blurred childhood memories have kept the ec...,negative
16062,i love zombie-movies and i love amateur-produc...,negative
48445,chan is in new york and he gets involved with ...,positive
20382,my wife and i both thought this film a watered...,negative


In [3]:
def remove_tags(string):
    """Return ``string`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. Nothing is inserted in place of a removed tag.
    """
    return re.sub(pattern='<.*?>', repl='', string=string)
    
# Strip HTML markup, then turn every punctuation character into a space.
# ``regex=True`` is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this replace into a silent no-op.
# ``re.escape`` keeps ``]``, ``\`` and ``^`` literal inside the character
# class; without it the backslash acts as an escape and is never replaced.
df['clean_review'] = df['review'].apply(remove_tags)
df['clean_review'] = df['clean_review'].str.replace(
    '[{}]'.format(re.escape(string.punctuation)), ' ', regex=True
)

  df['clean_review'] = df['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [4]:
# Preview the last sampled rows, now including the ``clean_review`` column.
df.tail()

Unnamed: 0,review,sentiment,clean_review
39885,one of eastwood's best movies after he had sep...,positive,one of eastwood s best movies after he had sep...
17566,my blurred childhood memories have kept the ec...,negative,my blurred childhood memories have kept the ec...
16062,i love zombie-movies and i love amateur-produc...,negative,i love zombie movies and i love amateur produc...
48445,chan is in new york and he gets involved with ...,positive,chan is in new york and he gets involved with ...
20382,my wife and i both thought this film a watered...,negative,my wife and i both thought this film a watered...


In [5]:
# Hold out 30% of the sampled reviews for evaluation (seeded split).
X_train, X_test, Y_train, Y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.3,
    random_state=42,
)

In [6]:
# Table for ``str.translate`` that deletes every ASCII punctuation character
# and every digit (each code point maps to ``None``).
trans_table = str.maketrans('', '', string.punctuation + string.digits)
stemmer = PorterStemmer()


def tokenize(text):
    """Split ``text`` into Porter-stemmed word tokens.

    Characters listed in ``trans_table`` (punctuation and digits) are deleted
    first, the remainder is tokenized with ``nltk.word_tokenize``, and tokens
    of length one are discarded before stemming.

    Returns a list of stems in their original order.
    """
    stripped = text.translate(trans_table)
    stems = []
    for word in nltk.word_tokenize(stripped):
        # Single-character tokens are dropped before stemming.
        if len(word) > 1:
            stems.append(stemmer.stem(word))
    return stems

In [7]:
# TF-IDF features built with the custom stemming ``tokenize`` callable.
# NOTE(review): scikit-learn filters ``stop_words`` against the tokens returned
# by ``tokenizer`` (stems here), so stop words whose stem differs from the
# listed form are probably kept. Confirm this is intended.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words='english',
    lowercase=True,
    strip_accents='unicode',
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)

In [8]:
# Fit the vectorizer on the training reviews only, then reuse the fitted
# vectorizer to encode the held-out reviews.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)



In [9]:
# Fit a logistic-regression classifier with default settings on the TF-IDF
# training matrix.
logisticRegr = LogisticRegression()
logisticRegr.fit(train_vectors, Y_train)

LogisticRegression()

In [10]:
# Predicted labels for the training and held-out reviews.
# NOTE(review): no cell visible in this notebook reads these variables;
# the scoring cell calls ``score`` on the feature matrices directly.
predict_train = logisticRegr.predict(train_vectors)
predict_test = logisticRegr.predict(test_vectors)

In [11]:
# Classifier score on the training split and on the held-out split,
# printed one per line (training first).
score_train = logisticRegr.score(train_vectors, Y_train)
score_test = logisticRegr.score(test_vectors, Y_test)
print(score_train, score_test, sep='\n')

0.9425714285714286
0.846


# Second approach: TF-IDF with the vectorizer's default tokenizer (no stemming)

In [12]:
import pandas as pd
import string
import re
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from nltk.stem.porter import PorterStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords

In [13]:
# Read the IMDB reviews and lowercase the review text in one step.
data = pd.read_csv('IMDB Dataset.csv').assign(
    review=lambda frame: frame['review'].str.lower()
)
# Draw a reproducible 10% sample (fixed seed) and preview its first rows.
df = data.sample(frac=0.10, random_state=42)
df.head()

Unnamed: 0,review,sentiment
33553,i really liked this summerslam due to the look...,positive
9427,not many television shows appeal to quite as m...,positive
199,the film quickly gets to a major chase scene w...,negative
12447,jane austen would definitely approve of this o...,positive
39489,expectations were somewhat high for me when i ...,negative


In [14]:
# Import the corpus reader under an alias: the name ``stopwords`` is rebound
# to a plain list of words here, so calling ``stopwords.words`` directly would
# fail if this cell were executed a second time.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words("english")

In [15]:
def remove_stopwords(df, stop_words=None):
    """Add a ``'review without stopwords'`` column to ``df`` and return ``df``.

    Each value of ``df['review']`` is split on whitespace, words found in the
    stop-word collection are dropped, and the rest are re-joined with single
    spaces. Matching is exact: case and attached punctuation are not
    normalised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``'review'`` column. It is modified in place.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the module-level ``stopwords`` collection
        is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new column added.
    """
    # Build the set once: membership tests against a list would rescan the
    # whole stop-word list for every word of every review.
    blocked = frozenset(stopwords if stop_words is None else stop_words)
    df['review without stopwords'] = df['review'].apply(
        lambda text: ' '.join(word for word in text.split() if word not in blocked)
    )
    return df
def remove_tags(string):
    """Delete every ``<...>`` span from ``string`` and return the result.

    Tags are matched non-greedily, so the text between separate tags is kept.
    Nothing is inserted in place of a removed tag.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', string)
    
# Add the stop-word-stripped column, then build ``clean_review`` by removing
# HTML markup and replacing every punctuation character with a space.
# NOTE(review): ``clean_review`` is derived from ``review``, not from the
# 'review without stopwords' column, so the stop-word removal does not reach
# the features built from ``clean_review``. Confirm this is intended.
data_without_stopwords = remove_stopwords(df)
data_without_stopwords['clean_review'] = data_without_stopwords['review'].apply(remove_tags)
# ``regex=True`` is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this replace into a silent no-op.
# ``re.escape`` keeps ``]``, ``\`` and ``^`` literal inside the character
# class; without it the backslash acts as an escape and is never replaced.
data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace(
    '[{}]'.format(re.escape(string.punctuation)), ' ', regex=True
)
df = data_without_stopwords

  data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [16]:
# TF-IDF features using the vectorizer's default tokenization.
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)

In [17]:
# Hold out 20% of the sampled reviews for evaluation (seeded split).
X_train, X_test, Y_train, Y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [18]:
# Fit the vectorizer on the training reviews only, then reuse the fitted
# vectorizer to encode the held-out reviews.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [19]:
# Fit a logistic-regression classifier with default settings on the TF-IDF
# training matrix.
logisticRegr = LogisticRegression()
logisticRegr.fit(train_vectors, Y_train)

LogisticRegression()

In [20]:
# Predicted labels for the training and held-out reviews.
# NOTE(review): no cell visible in this notebook reads these variables;
# the scoring cell calls ``score`` on the feature matrices directly.
predict_train = logisticRegr.predict(train_vectors)
predict_test = logisticRegr.predict(test_vectors)

In [21]:
# Classifier score on the training split and on the held-out split,
# printed one per line (training first).
score_train = logisticRegr.score(train_vectors, Y_train)
score_test = logisticRegr.score(test_vectors, Y_test)
print(score_train, score_test, sep='\n')

0.94975
0.856


# Third approach: bag-of-words counts with CountVectorizer

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
from sklearn.feature_extraction.text import CountVectorizer
import string
import re
import numpy as np
from sklearn.linear_model import LogisticRegression

In [23]:
# Read the IMDB reviews and lowercase the review text in one step.
data = pd.read_csv('IMDB Dataset.csv').assign(
    review=lambda frame: frame['review'].str.lower()
)
# Draw a reproducible 10% sample (fixed seed) and preview its first rows.
df = data.sample(frac=0.10, random_state=42)
df.head()

Unnamed: 0,review,sentiment
33553,i really liked this summerslam due to the look...,positive
9427,not many television shows appeal to quite as m...,positive
199,the film quickly gets to a major chase scene w...,negative
12447,jane austen would definitely approve of this o...,positive
39489,expectations were somewhat high for me when i ...,negative


In [24]:
def remove_tags(string):
    """Return ``string`` with all ``<...>`` spans cut out.

    The text is split at every non-greedy ``<...>`` match and the remaining
    pieces are concatenated with nothing in between.
    """
    pieces = re.split('<.*?>', string)
    return ''.join(pieces)
# Strip HTML markup, then turn every punctuation character into a space.
# ``regex=True`` is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this replace into a silent no-op.
# ``re.escape`` keeps ``]``, ``\`` and ``^`` literal inside the character
# class; without it the backslash acts as an escape and is never replaced.
df['clean_review'] = df['review'].apply(remove_tags)
df['clean_review'] = df['clean_review'].str.replace(
    '[{}]'.format(re.escape(string.punctuation)), ' ', regex=True
)

  df['clean_review'] = df['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')


In [25]:
# Hold out 20% of the sampled reviews for evaluation (seeded split).
X_train, X_test, Y_train, Y_test = train_test_split(
    df['clean_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [26]:
# Bag-of-words counts with scikit-learn's built-in English stop-word list.
# The matrices are kept sparse: LogisticRegression accepts sparse input, and
# calling ``.toarray()`` on a documents x vocabulary count matrix allocates a
# dense array that is almost entirely zeros.
# NOTE: X_train / X_test are rebound from raw text to feature matrices here,
# so the split cell must be re-run before executing this cell again.
vec = CountVectorizer(stop_words='english')
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

In [27]:
# Fit a logistic-regression classifier with default settings on the
# bag-of-words training matrix.
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, Y_train)

LogisticRegression()

In [28]:
# Predicted labels for the training and held-out reviews.
# NOTE(review): no cell visible in this notebook reads these variables;
# the scoring cell calls ``score`` on the feature matrices directly.
predict_train = logisticRegr.predict(X_train)
predict_test = logisticRegr.predict(X_test)

In [29]:
# Classifier score on the training split and on the held-out split,
# printed one per line (training first).
score_train = logisticRegr.score(X_train, Y_train)
score_test = logisticRegr.score(X_test, Y_test)
print(score_train, score_test, sep='\n')

0.9995
0.841
