In [None]:
import pandas as pd

In [None]:
import pandas as pd
import io

# Load the raw IMDB reviews CSV into the `imdb` DataFrame.
# The file is read as UTF-8 text, buffered in memory, and then parsed by pandas.
# NOTE(review): the path is absolute; adjust it when running outside the
# environment this notebook was written for.
with open("/content/IMDB Dataset.csv", "r", encoding="utf-8") as file:
    data_lines = file.readlines()

csv_content = ''.join(data_lines)

try:
    imdb = pd.read_csv(io.StringIO(csv_content))
except pd.errors.ParserError as e:
    # Report the parse failure, then re-raise: swallowing it would leave `imdb`
    # undefined and every later cell would fail with an unrelated NameError.
    print(f"ParserError: {e}")
    raise



In [None]:
# Preview the first rows of the loaded DataFrame.
imdb.head()

In [None]:
# Report the dataset dimensions: shape[0] is the row count, shape[1] the column count.
print("Number of rows: ", imdb.shape[0])
print("Number of columns: ", imdb.shape[1])

In [None]:
# Print pandas' summary of the DataFrame (columns, dtypes, non-null counts).
imdb.info()

In [None]:
# Count how many reviews carry each sentiment label.
imdb["sentiment"].value_counts()

In [None]:
# Show the raw text of the review stored under index label 1.
imdb.loc[1, 'review']


In [None]:
from bs4 import BeautifulSoup
# Parse the sample review with the lxml parser and keep only its text content.
sample_soup = BeautifulSoup(imdb["review"][1], 'lxml')
cleantext = sample_soup.get_text()
cleantext

In [None]:
import re
# Remove every character that is neither a word character nor whitespace.
cleantext = re.compile(r'[^\w\s]').sub('', cleantext)
cleantext

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK stopword corpus, then display the English stopword list.
nltk.download('stopwords')
stopwords.words('english')

In [None]:
# Build the stopword lookup once as a set; data_cleaner reuses this `stopword` name later.
stopword = set(stopwords.words('english'))
# Lower-case the cleaned sample and split it on whitespace.
token = cleantext.lower().split()
# Keep only the tokens that are not English stopwords.
token_list = list(filter(lambda word: word.lower() not in stopword, token))

In [None]:
" ".join(token_list)

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Shared lemmatizer instance; data_cleaner reads this `lemmatizer` name.
lemmatizer = WordNetLemmatizer()

In [None]:
# Fetch the WordNet corpus used by WordNetLemmatizer.
nltk.download('wordnet')

In [None]:
# Fetch the 'omw-1.4' NLTK resource.
# NOTE(review): presumably required alongside WordNet by the installed NLTK version; confirm.
nltk.download('omw-1.4')

In [None]:
# WordNetLemmatizer.lemmatize() looks up ONE word at a time. Passing the whole
# joined sentence treats it as a single unknown word and effectively returns it
# unchanged, so lemmatize each token and join the results afterwards.
" ".join(lemmatizer.lemmatize(word) for word in token_list)

In [None]:
# List the column labels of the DataFrame.
imdb.columns

In [None]:
from tqdm import tqdm
def data_cleaner(imdb):
    """Clean an iterable of raw review strings for bag-of-words modelling.

    Each review is stripped of HTML markup, has every character that is
    neither a word character nor whitespace removed, is lower-cased and split
    on whitespace, filtered against the notebook-level ``stopword`` set, and
    finally lemmatized token by token with the notebook-level ``lemmatizer``.

    Parameters
    ----------
    imdb : iterable of str
        Raw review texts (for example ``df.review.values``). Inside the
        function this name shadows the notebook's ``imdb`` DataFrame.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input review, in input order.
    """
    # Compile once: the pattern is identical for every review.
    punctuation = re.compile(r'[^\w\s]')
    clean_data = []
    for review in tqdm(imdb):
        text = BeautifulSoup(review, "lxml").text
        text = punctuation.sub('', text)
        tokens = [token for token in text.lower().split() if token not in stopword]
        # lemmatize() handles a single word per call; feeding it the joined
        # review (the previous behaviour) left the text effectively untouched.
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]
        clean_data.append(" ".join(lemmas))
    return clean_data

In [None]:
# Clean every review in the full dataset.
# (The train and test splits are cleaned again separately further down.)
clean_data = data_cleaner(imdb["review"].values)

In [None]:
# Inspect the first cleaned review.
clean_data[0]

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split with a fixed seed.
# X_* keep every column of `imdb` (including `sentiment`); y_* hold the labels.
X_train, X_test, y_train, y_test = train_test_split(
    imdb,
    imdb["sentiment"],
    test_size=0.2,
    random_state=42,
    stratify=imdb["sentiment"],
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping on the training labels only, then apply
# that SAME mapping to the test labels. Fitting a second encoder on the test
# labels could assign different integers if the two label sets ever differed.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
le_test = le  # alias kept so any existing reference to `le_test` still works
y_test = le_test.transform(y_test)

In [None]:
# Sanity-check that features and labels line up for both splits.
print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

In [None]:
# Run the cleaning pipeline over the training reviews.
clean_data_train_data = data_cleaner(X_train["review"].values)

In [None]:
# Attach the cleaned text as a new column. `.assign` returns a new DataFrame, so
# we never write into a frame derived from `imdb` by train_test_split (which can
# trigger pandas' SettingWithCopyWarning) and the cell is safe to re-run.
X_train = X_train.assign(cleaned_text=clean_data_train_data)
X_train.head()

In [None]:
# Clean the held-out reviews with the same pipeline used for training.
clean_data_test_data = data_cleaner(X_test.review.values)
# `.assign` returns a new DataFrame instead of writing into a frame derived from
# `imdb` by train_test_split, avoiding pandas' SettingWithCopyWarning.
X_test = X_test.assign(cleaned_text=clean_data_test_data)
X_test.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn the vocabulary from the training text only and encode it in one step,
# then encode the test text with that same fitted vocabulary.
vec = CountVectorizer()
train_x_bow = vec.fit_transform(X_train["cleaned_text"])
test_x_bow = vec.transform(X_test["cleaned_text"])

In [None]:
# Shapes of the bag-of-words matrices for train and test, one per line.
print(train_x_bow.shape, test_x_bow.shape, sep="\n")

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

In [None]:
# Base estimator handed to the grid search below (default hyper-parameters).
classifier = MultinomialNB()

In [None]:
# Candidate values for the `alpha` hyper-parameter explored by the grid search.
alpha_ranges = dict(alpha=[0.001, 0.01, 0.1, 1, 10.0, 100])

In [None]:
# 3-fold cross-validated search over `alpha_ranges`, scored by accuracy.
# Training scores are kept so they can be compared with the validation scores.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=alpha_ranges,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
)
grid_search.fit(train_x_bow, y_train)

In [None]:
# Read the evaluated alpha values from the fitted search itself so they always
# line up with the score arrays, instead of re-typing the grid by hand.
alpha = [params["alpha"] for params in grid_search.cv_results_["params"]]

# Mean and standard deviation of the accuracy across folds, per candidate alpha.
train_acc = grid_search.cv_results_['mean_train_score']
train_std = grid_search.cv_results_['std_train_score']

test_acc = grid_search.cv_results_['mean_test_score']
test_std = grid_search.cv_results_['std_test_score']

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Validation curve: training vs. cross-validation accuracy for each alpha.
plt.plot(alpha, train_acc, label="Training Score", color='b', marker='o')
plt.plot(alpha, test_acc, label="Cross Validation Score", color='r', marker='o')

# The alpha candidates span several orders of magnitude; on a linear axis most
# of them collapse against the left edge, so use a logarithmic x-axis.
plt.xscale("log")

plt.title("Validation Curve with Naive Bayes Classifier")
plt.xlabel("Alpha")
plt.ylabel("Accuracy")

plt.tight_layout()
plt.legend(loc = 'best')
plt.show()

In [None]:
# Show the estimator configuration the grid search selected.
grid_search.best_estimator_

In [None]:
# Refit Naive Bayes with the hyper-parameters chosen by the grid search rather
# than a hard-coded alpha, so this cell stays correct if the grid or data change.
classifier = MultinomialNB(**grid_search.best_params_)
classifier.fit(train_x_bow, y_train)

In [None]:
# Predict encoded sentiment labels for the held-out bag-of-words matrix.
predict = classifier.predict(test_x_bow)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out reviews whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, predict)
print("Accuracy is ", test_accuracy)

In [None]:
from sklearn.metrics import classification_report
# Per-class metrics on the held-out set. The report is a multi-line table, so
# print it under its own heading (it was previously mislabelled as the accuracy).
print("Classification report:")
print(classification_report(y_test, predict))

In [None]:
# Score a hand-written review. The text goes through data_cleaner first so it
# receives the same preprocessing as the text the vectorizer was fitted on.
text = ["I liked the movie. It was great"]
text_vec = vec.transform(data_cleaner(text))
classifier.predict(text_vec)

In [None]:
# Decode the model's actual prediction for the current `text_vec` back to its
# sentiment label, instead of decoding a hard-coded class id.
le.inverse_transform(classifier.predict(text_vec))

In [None]:
# Score a second hand-written review, again applying data_cleaner so the input
# matches the preprocessing of the text the vectorizer was fitted on.
text = ["Movie was worse"]
text_vec = vec.transform(data_cleaner(text))
classifier.predict(text_vec)

In [None]:
# Decode the model's actual prediction for the current `text_vec` back to its
# sentiment label, instead of decoding a hard-coded class id.
le.inverse_transform(classifier.predict(text_vec))