In [None]:
import numpy as np
import pandas as pd


In [None]:
# Load the IMDB reviews CSV and keep only the first 10,000 rows.
temp_df = pd.read_csv('IMDB Dataset.csv')
# .copy() makes df an independent frame. Later cells modify df
# (drop_duplicates, column re-assignment); without the copy they would be
# operating on a slice of temp_df, which triggers pandas'
# SettingWithCopyWarning and makes it unclear which object is changed.
df = temp_df.iloc[:10000].copy()
df.head()


In [None]:
# Show the full text of the review stored under index label 1.
df.loc[1, 'review']


In [None]:
# Count how many rows carry each sentiment label.
df['sentiment'].value_counts()


In [None]:
# Number of missing values in each column.
df.isna().sum()


In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()


In [None]:
# Drop rows that exactly repeat an earlier row. Re-binding the result
# (instead of inplace=True) avoids mutating a frame that was created as a
# slice of temp_df and keeps the cell safe to re-run.
df = df.drop_duplicates()
# Sanity check: should now be 0.
df.duplicated().sum()


In [None]:
# Basic preprocessing
# Remove HTML tags
# Lowercase
# Remove stopwords


In [None]:
import re
def remove_tags(raw_text):
    """Return ``raw_text`` with every ``<...>`` span removed.

    A span starts at ``<`` and ends at the nearest following ``>`` on the
    same line (``.`` does not match newlines), so it is matched
    non-greedily and replaced by the empty string.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_text)


In [None]:
# Strip tags from every review, then eyeball five random rows.
df['review'] = df['review'].map(remove_tags)
df.sample(5)


In [None]:
# Lower-case every review with the vectorised string accessor; unlike a
# Python-level lambda it also passes missing values through instead of
# raising on them.
df['review'] = df['review'].str.lower()
df.sample(5)


In [None]:
from nltk.corpus import stopwords

# NOTE: requires the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')).
sw_list = stopwords.words('english')
# A set gives O(1) membership tests; the list would be scanned for every
# token of every review.
sw_set = set(sw_list)

# Drop stopwords in a single pass and join the surviving tokens back into
# a string. (The previous version applied the filtering lambda twice; the
# second pass received a list and failed on `.split()`.)
df['review'] = df['review'].apply(
    lambda x: " ".join(item for item in x.split() if item not in sw_set)
)
df['review'].sample(5)


In [None]:
import gensim

from nltk import sent_tokenize
from gensim.utils import simple_preprocess


In [None]:
# Build the Word2Vec training corpus: split every review into sentences
# and turn each sentence into a list of tokens via simple_preprocess.
story = [
    simple_preprocess(sentence)
    for review in df['review']
    for sentence in sent_tokenize(review)
]


In [None]:
# Create an untrained Word2Vec model and build its vocabulary from the
# tokenised sentences. Training happens in the next cell.
w2v_kwargs = {
    'window': 10,
    'min_count': 2,
}
model = gensim.models.Word2Vec(**w2v_kwargs)
model.build_vocab(story)


In [None]:
# Train on the same corpus the vocabulary was built from, reusing the
# sentence count and epoch setting already stored on the model.
model.train(
    story,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
# Number of words in the learned vocabulary.
len(model.wv.index_to_key)


In [None]:
def document_vector(doc, w2v_model=None):
    """Return the mean word vector of the in-vocabulary words of ``doc``.

    Parameters
    ----------
    doc : str
        Text whose whitespace-separated tokens are looked up in the model.
    w2v_model : optional
        Object exposing ``.wv`` keyed vectors (``key_to_index``,
        ``vector_size`` and indexing by a list of words). Defaults to the
        notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size``. When none of the tokens is in
        the vocabulary (including an empty ``doc``) a zero vector is
        returned, so every document maps to a vector of the same shape.
    """
    keyed_vectors = (model if w2v_model is None else w2v_model).wv
    # key_to_index is a dict, so each membership test is O(1); testing
    # against the index_to_key list scanned the whole vocabulary per word.
    known_words = [
        word for word in doc.split() if word in keyed_vectors.key_to_index
    ]
    if not known_words:
        # Nothing to average: looking up an empty word list would fail.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(keyed_vectors[known_words], axis=0)


In [None]:
# Try the helper on the first review (by position, not by index label).
document_vector(df['review'].iloc[0])


In [None]:
from tqdm import tqdm


In [None]:
# Embed every review as its mean word vector; tqdm shows progress.
X = np.array([document_vector(review) for review in tqdm(df['review'].values)])
X.shape


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the sentiment strings to integer class ids.
encoder = LabelEncoder()
encoder.fit(df['sentiment'])
y = encoder.transform(df['sentiment'])
y


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Gaussian Naive Bayes on the averaged word vectors.
# fit() returns the estimator itself, so construction and fitting chain.
mnb = GaussianNB().fit(X_train, y_train)

y_pred = mnb.predict(X_test)
accuracy_score(y_test, y_pred)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest on the averaged word vectors. A fixed random_state makes
# the reported accuracy reproducible across re-runs (the forest is
# otherwise seeded differently every time).
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)


In [None]:
# Start over for the bag-of-words experiments: X is a one-column frame
# holding the review text, y the raw sentiment labels.
X = df[['review']]
y = df['sentiment']


In [None]:
# Peek at five randomly chosen rows of X.
X.sample(5)


In [None]:
# Peek at five randomly chosen labels.
y.sample(5)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the sentiment strings in y with integer class ids.
encoder = LabelEncoder()
encoder.fit(y)
y = encoder.transform(y)
y


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [None]:
# (rows, columns) of the training split.
X_train.shape


In [None]:
# Bag-of-words (BoW) features: a CountVectorizer with its default settings.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()


In [None]:
# Learn the vocabulary from the training reviews only, reuse it for the
# test reviews, then convert both count matrices to dense arrays.
train_counts = cv.fit_transform(X_train['review'])
test_counts = cv.transform(X_test['review'])
X_train_bow, X_test_bow = train_counts.toarray(), test_counts.toarray()


In [None]:
# (number of training reviews, vocabulary size) of the dense count matrix.
X_train_bow.shape


In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the dense bag-of-words counts.
gnb = GaussianNB()
gnb.fit(X=X_train_bow, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the fitted GaussianNB on the held-out reviews.
y_pred = gnb.predict(X_test_bow)
accuracy_score(y_true=y_test, y_pred=y_pred)


In [None]:
# Confusion matrix of the current predictions (y_pred) against the test labels.
confusion_matrix(y_test, y_pred)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the full bag-of-words counts. A fixed random_state
# makes the reported accuracy reproducible across re-runs.
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)



In [None]:
# Same experiment with the vocabulary capped at the 3000 most frequent terms.
cv = CountVectorizer(max_features=3000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Fixed random_state so this score can be compared with the other
# random-forest runs without seed noise.
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)


In [None]:
# Uni-, bi- and tri-gram counts, capped at the 5000 most frequent terms.
cv = CountVectorizer(ngram_range=(1,3), max_features=5000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

# Fixed random_state so this score can be compared with the other
# random-forest runs without seed noise.
rf = RandomForestClassifier(random_state=1)

# Was `rf.fir(...)`, which raised AttributeError before any training ran.
rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)


#### Using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# Fit the vocabulary and IDF weights on the training reviews only, then
# reuse them for the test reviews. Both matrices are kept in the same
# (sparse) representation: previously only the training matrix was
# densified, and a dense float matrix over the full vocabulary is very
# memory-hungry. RandomForestClassifier accepts sparse input directly.
X_train_tfidf = tfidf.fit_transform(X_train['review'])
X_test_tfidf = tfidf.transform(X_test['review'])


In [None]:
# Random forest on the TF-IDF features. A fixed random_state makes the
# reported accuracy reproducible across re-runs.
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test, y_pred)
