In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv("IMDB Dataset.csv")
dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Removing the HTML tags from the dataframe

In [3]:
from bs4 import BeautifulSoup

In [4]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    if soup.find():
        return soup.get_text()
    else:
        return text

In [5]:
dataset['review'] = dataset['review'].apply(remove_html_tags)
dataset

  soup = BeautifulSoup(text, 'html.parser')


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Building the model

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Feature Extraction

In [7]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(dataset['review'])
y = dataset['sentiment']

### Train test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Choose a Model

In [9]:
classifier = LogisticRegression()

### Training the model

In [12]:
classifier.fit(X_train, y_train) ;

In [13]:
y_pred = classifier.predict(X_test)

<br>

# Testing the model

In [29]:
custom_string = "it was a great movie."
custom_string_processed = tfidf_vectorizer.transform([custom_string])

In [30]:
sentiment = classifier.predict(custom_string_processed)
sentiment[0]

'positive'

<br><br>

# Testing Accuracy 

In [31]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8697