In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
import string
import nltk
from nltk.corpus import stopwords
import re
import random

In [3]:
from sklearn import model_selection, preprocessing, feature_extraction, linear_model, metrics, pipeline

In [4]:
data = pd.read_csv('dataset/IMDB_dataset.csv')

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
data.sentiment.replace("positive", 1, inplace = True)
data.sentiment.replace("negative", 0, inplace = True)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [7]:
X = data['review']
y = data['sentiment']

In [8]:
# radimo izbor hiperparametara pomocu validacionog skupa
X_train_val, X_test, y_train_val, y_test = model_selection.train_test_split(X, y, test_size=0.33, stratify=y, random_state=1234)

In [9]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train_val, y_train_val, test_size=0.2, stratify=y_train_val, random_state=1234)

In [10]:
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((26800,), (6700,), (16500,), (26800,), (6700,), (16500,))

In [11]:
y_train.value_counts()

1    13400
0    13400
Name: sentiment, dtype: int64

In [12]:
y_val.value_counts()

0    3350
1    3350
Name: sentiment, dtype: int64

In [13]:
y_test.value_counts()

0    8250
1    8250
Name: sentiment, dtype: int64

## Bag of Words

In [14]:
# pomocne funkcije iz 01:
def simple_tokenization(review):
    tokens = nltk.tokenize.word_tokenize(review)
    tokens_without_punctuation = [token for token in tokens if token not in string.punctuation]
    return tokens_without_punctuation

In [15]:
def short_form_transform(text):
    text=re.sub("isn't", 'is not', text)
    text=re.sub("aren't", 'are not', text)
    text=re.sub("he's", 'he is', text)
    text=re.sub("wasn't", 'was not',text)
    text=re.sub("there's", 'there is',text)
    text=re.sub("couldn't",'could not',text)
    text=re.sub("can't", 'can not', text)
    text=re.sub("won't", 'will not',text)
    text=re.sub("they're", 'they are',text)
    text=re.sub("she's", 'she is',text)
    text=re.sub("wouldn't", 'would not',text)
    text=re.sub("haven't", 'have not',text)
    text=re.sub("that's", 'that is',text)
    text=re.sub("you've", 'you have',text)
    text=re.sub("he's", 'he is',text)
    text=re.sub("what's", 'what is',text)
    text=re.sub("weren't", 'were not',text)
    text=re.sub("we're", 'we are',text)
    text=re.sub("hasn't", 'has not',text)
    text=re.sub('i’d','i would',text)
    text=re.sub("you'd",'you would',text)
    text=re.sub("shouldn't",'should not',text)
    text=re.sub("let's",'let us',text)
    text=re.sub("i've", 'i have', text)
    text=re.sub("you've", 'you have', text)
    text=re.sub("we've", 'we have', text)
    text=re.sub("they've",'they have',text)
    text=re.sub("you'll",'you will',text)
    text=re.sub("i'm",'i am',text)
    text=re.sub("we've",'we have',text)
    text=re.sub("it's",'it is',text)
    text=re.sub("don't",'do not',text)
    text=re.sub("doesn't", 'does not',text)
    text=re.sub("didn't", 'did not', text)
    text=re.sub("hadn't", 'had not', text)
    text=re.sub("mightn't", 'might not', text)
    text=re.sub("mustn't", 'must not', text)
    text=re.sub("it's",'it is',text)
    return text

In [16]:
# eliminacija html tagova
def strip_html(review):
    return re.sub('<[^<]+?>', '', review)

# eliminacija url-ova
def strip_url(review):
    return re.sub(r'http\S+', '', review)

# c.g.i -> cgi, u.s.a -> usa
def full_stop_abbrev_elim(review):
    pattern = re.compile(r'\b(?:[a-z]\.){2,}', re.I)
    review = pattern.sub(lambda m: m.group().replace('.',''), review)
    return review

def remove_stop_words(tokens):
    stopwords_list = stopwords.words('english')
    tokens_without_stop = [token.strip() for token in tokens if token.strip() not in stopwords_list]
    return tokens_without_stop

In [17]:
def review_preprocessor(text):
    text = text.lower()
    text = short_form_transform(text)
    text = strip_html(text)
    text = strip_url(text)
    text = full_stop_abbrev_elim(text)
    return text

In [18]:
def review_tokenizer(text):
    tokens = simple_tokenization(text)
    tokens = remove_stop_words(tokens)
            
    stems = []
    
    # default mode (NLTK_EXTENSIONS) se pokazao kao najbolji
    stemmer = nltk.stem.PorterStemmer(mode='NLTK_EXTENSIONS')

    for token in tokens:
        
        token_pattern = re.compile(r'\b[^\W\d_]+\b')
        if not token_pattern.match(token) or len(token) <= 2:
            continue
        
        stem = stemmer.stem(token)
        stems.append(stem)
    return stems

In [19]:
vectorizer1 = feature_extraction.text.CountVectorizer(preprocessor=review_preprocessor, tokenizer=review_tokenizer, min_df=0.05,
                                                         max_df=0.7, token_pattern=None)
# min_df - hocemo da izbacimo jako retke reci
# max_df - hocemo da izbacimo preceste reci (corpus-specific stopwords)

In [20]:
vectorizer1.fit(X_train)

In [21]:
# sve reci u vokabularu
vectorizer1.get_feature_names_out()

array(['absolut', 'act', 'action', 'actor', 'actual', 'almost', 'along',
       'also', 'although', 'alway', 'amaz', 'american', 'anoth', 'anyon',
       'anyth', 'appear', 'around', 'ask', 'attempt', 'audienc', 'aw',
       'away', 'back', 'bad', 'base', 'beauti', 'becom', 'begin',
       'believ', 'best', 'better', 'big', 'bit', 'black', 'book', 'bore',
       'boy', 'bring', 'budget', 'call', 'came', 'camera', 'care', 'case',
       'cast', 'certainli', 'chang', 'charact', 'classic', 'close',
       'come', 'comedi', 'comment', 'complet', 'consid', 'could', 'coupl',
       'cours', 'creat', 'day', 'dead', 'death', 'decid', 'definit',
       'dialogu', 'die', 'differ', 'direct', 'director', 'disappoint',
       'done', 'dvd', 'earli', 'effect', 'either', 'els', 'emot', 'end',
       'enjoy', 'enough', 'entertain', 'entir', 'episod', 'especi',
       'even', 'ever', 'everi', 'everyon', 'everyth', 'exampl', 'excel',
       'except', 'expect', 'experi', 'extrem', 'eye', 'face', 'fact',


In [22]:
X_train_vectorized = vectorizer1.transform(X_train)
X_val_vectorized = vectorizer1.transform(X_val)

In [23]:
X_train_vectorized.shape, X_val_vectorized.shape

((26800, 345), (6700, 345))

### Logisticka regresija

In [24]:
Cs = np.array([10**i for i in range(-5,5)])
penalties = np.array(['l1', 'l2', 'elasticnet'])
l1_ratios = np.array([0.1 * i for i in range(1, 10)])

best_score = float('-inf')
best_C = None
best_penalty = None
best_l1_ratio = None

for C in Cs:
    for penalty in penalties:
        
        if penalty == 'l2': 
            model = linear_model.LogisticRegression(C=C, max_iter=500, penalty=penalty, solver='lbfgs')
            model.fit(X_train_vectorized, y_train)
        
            score = metrics.accuracy_score(y_val, model.predict(X_val_vectorized))
            print(f'C: {C} penalty: {penalty} - score: {score}')
            if score > best_score:
                best_score = score
                best_C = C
                best_penalty = penalty
        
        if penalty == 'l1': 
                                                                                     # 'lbfgs' ne podrzava
            model = linear_model.LogisticRegression(C=C, max_iter=500, penalty=penalty, solver='liblinear')
            model.fit(X_train_vectorized, y_train)
        
            score = metrics.accuracy_score(y_val, model.predict(X_val_vectorized))
            print(f'C: {C} penalty: {penalty} - score: {score}')
            if score > best_score:
                best_score = score
                best_C = C
                best_penalty = penalty   
        
        
        if penalty == 'elasticnet':
            
            for l1_ratio in l1_ratios:
                                                                                     # 'lbfgs' ne podrzava
                model = linear_model.LogisticRegression(C=C, max_iter=500, penalty=penalty, solver='saga', l1_ratio=l1_ratio)
                model.fit(X_train_vectorized, y_train)
        
                score = metrics.accuracy_score(y_val, model.predict(X_val_vectorized))
                print(f'C: {C} penalty: {penalty} l1_ratio: {l1_ratio} - score: {score}')
                if score > best_score:
                    best_score = score
                    best_C = C
                    best_penalty = penalty 
                    best_l1_ratio = l1_ratio
                    
print("----------------------------------------------------------------------------------------------")
if best_penalty == 'elasticnet':
    print(f'Najbolja vrednost regularizacionog hiperparametra: {best_C}\nNajbolja norma regularizacije: {best_penalty}')
    print(f'Najbolji l1_ratio: {best_l1_ratio}\nNajbolji skor: {best_score}')
else:
    print(f'Najbolja vrednost regularizacionog hiperparametra: {best_C}\nNajbolja norma regularizacije: {best_penalty}')
    print(f'Najbolji skor: {best_score}')

C: 1e-05 penalty: l1 - score: 0.5
C: 1e-05 penalty: l2 - score: 0.6965671641791045
C: 1e-05 penalty: elasticnet l1_ratio: 0.1 - score: 0.5
C: 1e-05 penalty: elasticnet l1_ratio: 0.2 - score: 0.5
C: 1e-05 penalty: elasticnet l1_ratio: 0.30000000000000004 - score: 0.5
C: 1e-05 penalty: elasticnet l1_ratio: 0.4 - score: 0.5
C: 1e-05 penalty: elasticnet l1_ratio: 0.5 - score: 0.5
C: 1e-05 penalty: elasticnet l1_ratio: 0.6000000000000001 - score: 0.5
C: 1e-05 penalty: elasticnet l1_ratio: 0.7000000000000001 - score: 0.5
C: 1e-05 penalty: elasticnet l1_ratio: 0.8 - score: 0.5
C: 1e-05 penalty: elasticnet l1_ratio: 0.9 - score: 0.5
C: 0.0001 penalty: l1 - score: 0.5
C: 0.0001 penalty: l2 - score: 0.7740298507462686
C: 0.0001 penalty: elasticnet l1_ratio: 0.1 - score: 0.6491044776119403
C: 0.0001 penalty: elasticnet l1_ratio: 0.2 - score: 0.6180597014925373
C: 0.0001 penalty: elasticnet l1_ratio: 0.30000000000000004 - score: 0.5582089552238806
C: 0.0001 penalty: elasticnet l1_ratio: 0.4 - scor

In [25]:
vectorizer2 = feature_extraction.text.CountVectorizer(preprocessor=review_preprocessor, tokenizer=review_tokenizer, min_df=0.05,
                                                         max_df=0.7, token_pattern=None)

In [26]:
vectorizer2.fit(X_train_val)

In [27]:
X_train_val_vec = vectorizer2.transform(X_train_val)
X_test_vec = vectorizer2.transform(X_test)

In [30]:
X_train_val_vec.shape, X_test_vec.shape

((33500, 347), (16500, 347))

In [39]:
final_lr_model = None
if best_penalty == 'l2':
    final_lr_model = linear_model.LogisticRegression(C=best_C, max_iter=500, penalty=best_penalty, solver='lbfgs')
if best_penalty == 'l1':
    final_lr_model = linear_model.LogisticRegression(C=best_C, max_iter=500, penalty=best_penalty, solver='liblinear')
if best_penalty == 'elasticnet':
    final_lr_model = linear_model.LogisticRegression(C=best_C, max_iter=500, penalty=best_penalty,
                                                     solver='saga', l1_ratio=best_l1_ratio)

In [40]:
final_lr_model.fit(X_train_val_vec, y_train_val)

In [41]:
lr_train_score = final_lr_model.score(X_train_val_vec, y_train_val)
print(lr_train_score)

0.8370746268656717


In [43]:
lr_test_score = final_lr_model.score(X_test_vec, y_test)
print(lr_test_score)

0.8307272727272728


In [45]:
y_test_predicted = final_lr_model.predict(X_test_vec)

In [46]:
print(metrics.classification_report(y_test, y_test_predicted))

              precision    recall  f1-score   support

           0       0.84      0.81      0.83      8250
           1       0.82      0.85      0.83      8250

    accuracy                           0.83     16500
   macro avg       0.83      0.83      0.83     16500
weighted avg       0.83      0.83      0.83     16500



In [47]:
metrics.confusion_matrix(y_test, y_test_predicted)

array([[6712, 1538],
       [1255, 6995]], dtype=int64)