In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from string import punctuation
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


In [2]:
df=pd.read_csv('./IMDB_dataset/IMDB dataset.csv')

stop = set(stopwords.words('english'))
punctuation = list(punctuation)
stop.update(punctuation)

def remove_stopwords(text, stopwords_set):
    output = []
    for i in text.split():
        word = i.strip().lower()
        if word not in stopwords_set and word.isalpha():
            output.append(word)
    return " ".join(output)
    
def process_data(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = remove_stopwords(text, stop)
    return text

df['review']=df['review'].apply(process_data)

df.sentiment.replace("positive" , 1 , inplace = True)
df.sentiment.replace("negative" , 0 , inplace = True)

def tokenize_and_stem(data):
    stem_tokens = []
    stemmer = PorterStemmer()
    data_tokens = word_tokenize(data)
    for word in data_tokens:
        stem_word = stemmer.stem(word)
        stem_tokens.append(stem_word)
    return stem_tokens    

  soup = BeautifulSoup(text, "html.parser")


In [3]:
X = df['review']
Y = df['sentiment']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


In [4]:
def build_freqs(reviews, ys):
    yslist = np.squeeze(ys).tolist()

    freqs = {}
    for y, review in zip(yslist, reviews):
        for word in tokenize_and_stem(review):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

In [5]:
freqs = build_freqs(x_train, y_train)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys() ) ))

type(freqs) = <class 'dict'>
len(freqs) = 74775


In [6]:
def sigmoid(z): 
    h = 1 / (1 + np.exp(-z))
    return h

def gradientDescent(x, y, theta, alpha, num_iters):

    m = x.shape[0]
    
    for i in range(0, num_iters):
        z = np.dot(x,theta)
        
        h = sigmoid(z)
        J = -1./m * (np.dot(y.T, np.log(h)) + np.dot((1-y).T,np.log(1-h)))                                                    

        theta = theta - (alpha/m) * np.dot(x.T,(h-y))
    J = float(J)
        
    return J, theta


In [7]:
def extract_features(review, freqs):

    word_l = process_data(review)
    word_l = tokenize_and_stem(word_l)
    
    x = np.zeros((1, 3)) 
    x[0,0] = 1 
    
    for word in word_l:
        x[0,1] += freqs.get((word, 1),0)
        x[0,2] += freqs.get((word, 0),0)
    
    assert(x.shape == (1, 3))   
    return x

In [8]:
# x_train = x_train.reset_index(drop=True)
# x_train = x_train.drop("index",axis=1)

In [9]:
tmp1 = extract_features(x_train[1], freqs)
print(tmp1)

[[1.00000e+00 2.28306e+05 2.76312e+05]]


In [10]:
X = np.zeros((len(x_train), 3))
for i in range(len(x_train)):
    X[i, :]= extract_features(x_train[i], freqs)


In [11]:
Y = y_train.to_numpy(dtype='float')

Y = np.reshape(Y, (-1, 1))

print(X)
print(Y)

[[1.00000e+00 1.83212e+05 1.91880e+05]
 [1.00000e+00 2.28306e+05 2.76312e+05]
 [1.00000e+00 6.52980e+04 7.17730e+04]
 ...
 [1.00000e+00 1.21839e+05 1.48854e+05]
 [1.00000e+00 2.37357e+05 2.87970e+05]
 [1.00000e+00 1.95996e+05 2.20790e+05]]
[[1.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [26]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-20, 20000)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.6931471734905965.
The resulting vector of weights is [0.0, 0.0, -0.0]


In [13]:
def predict_review(review, freqs, theta):
    x = extract_features(review,freqs)
    y_pred = sigmoid(np.dot(x,theta))
    
    return y_pred

In [20]:
def test_logistic_regression(x_test, y_test, freqs, theta):

    y_hat = []
    for rev in x_test:
        y_pred = predict_review(rev, freqs, theta)
        
        if y_pred > 0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)

    accuracy = (y_hat==np.squeeze(y_test)).sum()/len(x_test)
    
    return accuracy

In [27]:
tmp_accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.5000


In [28]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-10, 20000)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")
tmp_accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

The cost after training is 0.8863505337207406.
The resulting vector of weights is [1e-08, 0.0001007, -8.987e-05]
Logistic regression model's accuracy = 0.6481


In [29]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-8, 20000)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")
tmp_accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

  J = -1./m * (np.dot(y.T, np.log(h)) + np.dot((1-y).T,np.log(1-h)))
  h = 1 / (1 + np.exp(-z))


The cost after training is nan.
The resulting vector of weights is [4.4e-07, 0.01122994, -0.00988898]
Logistic regression model's accuracy = 0.6318


In [30]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-5, 20000)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")
tmp_accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

  h = 1 / (1 + np.exp(-z))
  J = -1./m * (np.dot(y.T, np.log(h)) + np.dot((1-y).T,np.log(1-h)))


The cost after training is nan.
The resulting vector of weights is [0.00041329, 11.22550229, -9.88485158]
Logistic regression model's accuracy = 0.6316


In [31]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-12, 20000)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")
tmp_accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

The cost after training is 0.5878769065125304.
The resulting vector of weights is [0.0, 3.069e-05, -2.954e-05]
Logistic regression model's accuracy = 0.7010


In [32]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-15, 20000)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")
tmp_accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

The cost after training is 0.6926584664943483.
The resulting vector of weights is [0.0, 2e-08, -9e-08]
Logistic regression model's accuracy = 0.5000


In [33]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-13, 20000)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")
tmp_accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

The cost after training is 0.6639210210847599.
The resulting vector of weights is [0.0, 5.45e-06, -5.32e-06]
Logistic regression model's accuracy = 0.6985


In [34]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-12, 25000)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")
tmp_accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

The cost after training is 0.583345217207419.
The resulting vector of weights is [0.0, 3.412e-05, -3.283e-05]
Logistic regression model's accuracy = 0.7005


In [35]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-12, 18000)
print(f"The cost after training is {J}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")
tmp_accuracy = test_logistic_regression(x_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

The cost after training is 0.5904363504335174.
The resulting vector of weights is [0.0, 2.906e-05, -2.797e-05]
Logistic regression model's accuracy = 0.7007
