In [1]:
import numpy as np
import pandas as pd 
import re 
from nltk.corpus import stopwords
from string import punctuation
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB


In [2]:
df=pd.read_csv('./IMDB_dataset/IMDB dataset.csv')

stop = set(stopwords.words('english'))
punctuation = list(punctuation)
stop.update(punctuation)

def remove_stopwords(text, stopwords_set):
    output = []
    for i in text.split():
        word = i.strip().lower()
        if word not in stopwords_set and word.isalpha():
            output.append(word)
    return " ".join(output)
    
def process_data(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = remove_stopwords(text, stop)
    return text

df['review']=df['review'].apply(process_data)

df.sentiment.replace("positive" , 1 , inplace = True)
df.sentiment.replace("negative" , 0 , inplace = True)

def tokenize_and_stem(data):
    stem_tokens = []
    stemmer = PorterStemmer()
    data_tokens = word_tokenize(data)
    for word in data_tokens:
        stem_word = stemmer.stem(word)
        stem_tokens.append(stem_word)
    return stem_tokens

  soup = BeautifulSoup(text, "html.parser")


In [3]:
X = df['review']
Y = df['sentiment']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [4]:
vectorized = CountVectorizer()
x_train_cv= vectorized.fit_transform(x_train)
print(x_train_cv.shape)

(40000, 76074)


In [5]:
model=MultinomialNB()

In [6]:
model.fit(x_train_cv,y_train)

In [7]:
x_test_cv=vectorized.transform(x_test)
x_test_cv.shape

(10000, 76074)

In [8]:
y_pred= model.predict(x_test_cv)

In [9]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85      5000
           1       0.86      0.83      0.84      5000

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [10]:
def count_acc(y_test, y_pred):
    count = 0
    accurate = 0
    for y_t, y_p in zip(y_test, y_pred):
        if(y_t == y_p):
            accurate += 1
        count += 1
    return accurate/count

In [11]:
count_acc(y_test, y_pred)

0.843

## Tdidf Vectorized - ważenie częstością termów

In [12]:
tf =TfidfVectorizer()
x_train_tfidf = tf.fit_transform(x_train)

In [13]:
model=MultinomialNB()
model.fit(x_train_tfidf,y_train)

In [14]:
x_test_tf=tf.transform(x_test)
x_test_tf.shape

(10000, 76074)

In [15]:
y_pred= model.predict(x_test_tf)

In [16]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      5000
           1       0.86      0.84      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [17]:
count_acc(y_test, y_pred)

0.8497

In [18]:
import pickle


In [19]:
filename = 'multinomialNB_model.sav'
pickle.dump(model, open(filename, 'wb'))