In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('./IMDB Dataset.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [7]:
df.duplicated().sum()

418

In [8]:
df.drop_duplicates(inplace= True)

In [9]:
df.duplicated().sum()

0

In [10]:
##preprocessing

In [11]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'),'',raw_text)
    return cleaned_text

In [12]:
df['review'] = df['review'].apply(remove_tags)

In [13]:
df #no html tags

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [14]:
df['review'] = df['review'].apply(lambda x : x.lower())

In [15]:
!pip install nltk



In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords

sw_list = stopwords.words('english')

In [18]:
df['review'] = df['review'].apply(lambda x : [item for item in x.split() if item not in sw_list]).apply(lambda y: " ".join(y))

In [19]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
49995,thought movie right good job. creative origina...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,catholic taught parochial elementary schools n...,negative
49998,i'm going disagree previous comment side malti...,negative


In [20]:
X = df.iloc[:,0:1]
y = df['sentiment']

In [21]:
y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 49582, dtype: object

In [22]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = y.ravel() # flattens the 2d to 1d array
y = encoder.fit_transform(y)

In [23]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [24]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size = 0.2 , random_state = 40 )

In [25]:
X_train.shape


(39665, 1)

In [26]:
## Applying BOW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [None]:
X_train_bow.shape  #indicates our vocabulary contains 48326 words

In [None]:
##algorithm

In [30]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(X_train_bow, y_train)

In [31]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score , confusion_matrix
accuracy_score(y_test,y_pred)

0.629444166249374

In [32]:
confusion_matrix(y_test, y_pred)

array([[691, 305],
       [435, 566]], dtype=int64)

In [33]:
#second algorithm

In [34]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()



In [35]:
### takes so much time

rf.fit(X_train_bow , y_train)
y_pred = rf.predict(X_test_bow)

accuracy_score(y_test , y_pred)

0.8492739108662994

In [105]:
import pickle
with open('RandomForestCV.pkl', 'wb') as file: # Dump the model into the file
    pickle.dump(rf, file)

In [36]:
### decreasing features by taking frequent 3000 words in the vocabulary

cv = CountVectorizer(max_features = 3000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow , y_train)
y_pred = rf.predict(X_test_bow)

accuracy_score(y_test , y_pred)

0.8387581372058087

In [37]:
### using n - grams

In [38]:
cv = CountVectorizer(ngram_range = (1,2) , max_features = 5000)

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow , y_train)
y_pred = rf.predict(X_test_bow)

accuracy_score(y_test , y_pred)

0.8387581372058087

#using tf- idf

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
tfidf = TfidfVectorizer()

In [41]:
X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review']).toarray()

In [42]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf , y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test , y_pred)

0.8412618928392589

In [43]:
#word2vec

In [46]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   -- ------------------------------------- 1.6/24.0 MB 4.9 MB/s eta 0:00:05
   ------ --------------------------------- 3.9/24.0 MB 6.3 MB/s eta 0:00:04
   ---------- ----------------------------- 6.3/24.0 MB 7.0 MB/s eta 0:00:03
   -------------- ------------------------- 8.7/24.0 MB 7.8 MB/s eta 0:00:02
   ----------------- ---------------------- 10.7/24.0 MB 8.2 MB/s eta 0:00:02
   --------------------- ------------------ 12.8/24.0 MB 8.3 MB/s eta 0:00:02
   ----------------------- ---------------- 13.9/24.0 MB 8.0 MB/s eta 0:00:02
   ----------------------- ---------------- 14.2/24.0 MB 7.6 MB/s eta 0:00:02
   ------------------------ --------------- 14.7/24.

In [55]:
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [56]:
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [57]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [58]:
story = []
for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [60]:
model = gensim.models.Word2Vec(
    window = 10,
    min_count = 2
)

In [61]:
model.build_vocab(story)

In [62]:
model.train(story , total_examples = model.corpus_count , epochs = model.epochs)

(5875463, 6212140)

In [63]:
len(model.wv.index_to_key)

31845

In [64]:
model.wv.index_to_key

['movie',
 'film',
 'one',
 'like',
 'good',
 'it',
 'the',
 'would',
 'time',
 'even',
 'story',
 'see',
 'really',
 'well',
 'much',
 'get',
 'bad',
 'great',
 'people',
 'first',
 'also',
 'made',
 'make',
 'way',
 'movies',
 'could',
 'think',
 'characters',
 'watch',
 'character',
 'films',
 'that',
 'never',
 'little',
 'show',
 'seen',
 'many',
 'two',
 'love',
 'acting',
 'plot',
 'best',
 'know',
 'life',
 'this',
 'ever',
 'better',
 'man',
 'there',
 'still',
 'say',
 'scene',
 'end',
 'and',
 'scenes',
 'something',
 'go',
 'real',
 'back',
 'watching',
 'director',
 'actors',
 'years',
 'thing',
 'though',
 've',
 'work',
 'look',
 'funny',
 'actually',
 'old',
 'nothing',
 'going',
 'makes',
 'new',
 'lot',
 'another',
 'all',
 'every',
 'find',
 'pretty',
 'things',
 'part',
 'can',
 'he',
 'us',
 'world',
 'horror',
 'around',
 'want',
 'big',
 'quite',
 'cast',
 'long',
 'young',
 'enough',
 'in',
 'take',
 'seems',
 'got',
 'must',
 'however',
 'may',
 'thought',
 'fa

In [67]:
def document_vector(doc):
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc] , axis = 0 )

In [68]:
document_vector(df['review'].values[0])

array([-0.17888154,  0.45295545,  0.14197834,  0.21970473, -0.15529935,
       -0.58838654,  0.21036358,  0.8891549 , -0.3450474 , -0.26558062,
       -0.30587873, -0.4454744 ,  0.13829736,  0.081778  ,  0.15076236,
       -0.11614622, -0.00919233, -0.33331013, -0.10694014, -0.66050494,
        0.05119769,  0.2514973 ,  0.0907824 , -0.27242732, -0.33295944,
       -0.01769579, -0.3032438 , -0.01269659, -0.33428666,  0.047382  ,
        0.31816635,  0.02065944,  0.20099778, -0.2914552 , -0.10178244,
        0.37489802,  0.07098916, -0.39121795, -0.2288766 , -0.7650213 ,
        0.12069517, -0.26871276,  0.0811187 , -0.0816585 ,  0.4818114 ,
       -0.1761682 , -0.28188306, -0.04288814,  0.13367724,  0.42347625,
        0.12720548, -0.38065684, -0.42660788, -0.06734861, -0.1488817 ,
        0.24486002,  0.2466731 ,  0.02508778, -0.31311876,  0.11281795,
        0.10673671,  0.09952986,  0.00224548, -0.10983808, -0.45977283,
        0.2689529 ,  0.07767156,  0.05341562, -0.34854746,  0.30

In [72]:
from tqdm import tqdm

In [81]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
9995,"fun, entertaining movie wwii german spy (julie...",positive
9996,"give break. anyone say ""good hockey movie""? kn...",negative
9997,movie bad movie. watching endless series bad h...,negative
9998,"movie probably made entertain middle school, e...",negative


In [79]:
df['review'] = pd.DataFrame(df['review'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = pd.DataFrame(df['review'])


In [84]:
x = []
for doc in tqdm(df['review'].values):
    x.append(document_vector(doc))

100%|██████████████████████████████████████████████████████████████████████████████| 9983/9983 [39:30<00:00,  4.21it/s]


In [94]:
X = np.array(x)

In [86]:
arr = np.array(x)

In [87]:
np.savez('arrays.npz', array1=arr)

In [88]:
file = open("file1.txt", "w+")
 
# Saving the array in a text file
content = str(arr)
file.write(content)
file.close()

In [89]:
X.shape

(9983, 1)

In [90]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(df['sentiment'])

In [91]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [95]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size = 0.2 , random_state = 40 )

In [98]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train , y_train)
y_pred = rf.predict(X_test)




In [99]:
accuracy_score(y_test,y_pred)

0.7696544817225839

In [100]:




import pickle 
  
# Save the trained model as a pickle string. 
saved_model = pickle.dumps(rf) 
  
# # Load the pickled model 
# knn_from_pickle = pickle.loads(saved_model) 
  
# # Use the loaded pickled model to make predictions 
# knn_from_pickle.predict(X_test) 

In [103]:
with open('RandomForestW2Vec.pkl', 'wb') as file: # Dump the model into the file
    pickle.dump(rf, file)