In [1]:
# set up
import numpy as np
import pandas as pd
import glob
import re
import string 
from collections import defaultdict
from sklearn import metrics
from tqdm import tqdm

# 1. Preparing data (IMDb movies' reviews)

In [2]:
train_pos_path = 'data_sets/movies/train/pos/*'
train_neg_path = 'data_sets/movies/train/neg/*'

train_pos = glob.glob(train_pos_path)
train_neg = glob.glob(train_neg_path)


test_pos_path = 'data_sets/movies/test/pos/*'
test_neg_path = 'data_sets/movies/test/neg/*'

test_pos = glob.glob(test_pos_path)
test_neg = glob.glob(test_neg_path)

In [3]:
train_pos[:3]

['data_sets/movies/train/pos\\0_9.txt',
 'data_sets/movies/train/pos\\10000_8.txt',
 'data_sets/movies/train/pos\\10001_10.txt']

In [4]:
train_df = []
test_df = []

In [5]:
for path in tqdm(train_pos, desc='Getting positive train data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        beg, end = path.find('\\'), path.find('.')
        idx, rating = path[beg+1:end].split('_')
        train_df.append([text, rating])
        
        

for path in tqdm(train_neg, desc='Getting negative train data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        beg, end = path.find('\\'), path.find('.')
        idx, rating = path[beg+1:end].split('_')
        train_df.append([text, rating])
        
        
        
for path in tqdm(test_pos, desc='Getting positive test data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        beg, end = path.find('\\'), path.find('.')
        idx, rating = path[beg+1:end].split('_')
        test_df.append([text, rating])
        
        
        
for path in tqdm(test_neg, desc='Getting negative test data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        beg, end = path.find('\\'), path.find('.')
        idx, rating = path[beg+1:end].split('_')
        test_df.append([text, rating])

Getting positive train data: 100%|████████████████████████████████████████████| 12500/12500 [00:01<00:00, 11491.88it/s]
Getting negative train data: 100%|████████████████████████████████████████████| 12500/12500 [00:01<00:00, 11584.74it/s]
Getting positive test data: 100%|█████████████████████████████████████████████| 12500/12500 [00:01<00:00, 11717.11it/s]
Getting negative test data: 100%|█████████████████████████████████████████████| 12500/12500 [00:01<00:00, 11907.71it/s]


In [6]:
train_df = pd.DataFrame(train_df, columns=['text', 'rating'])
test_df = pd.DataFrame(test_df, columns=['text', 'rating'])

In [7]:
print('Records: ', train_df.size)
train_df.head()

Records:  50000


Unnamed: 0,text,rating
0,Bromwell High is a cartoon comedy. It ran at t...,9
1,Homelessness (or Houselessness as George Carli...,8
2,Brilliant over-acting by Lesley Ann Warren. Be...,10
3,This is easily the most underrated film inn th...,7
4,This is not the typical Mel Brooks film. It wa...,8


In [8]:
for i in range(1, 11):
    print(f'Number of reviews with rating {i}: {train_df[train_df.rating == str(i)].shape[0]}')

Number of reviews with rating 1: 5100
Number of reviews with rating 2: 2284
Number of reviews with rating 3: 2420
Number of reviews with rating 4: 2696
Number of reviews with rating 5: 0
Number of reviews with rating 6: 0
Number of reviews with rating 7: 2496
Number of reviews with rating 8: 3009
Number of reviews with rating 9: 2263
Number of reviews with rating 10: 4732


### *We might consider (?or not?) only movies with reviews 1(terrible) and 10(perfect)

# 1.1 Preparing data (Stanford Sentiment)
### (or not because this DS doesn't make sense) 
**for eg. sentences:** <br/>
' ( the cockettes ) provides a window into a subculture hell-bent on expressing itself in every way imaginable|5 <br/>
' ( the cockettes ) provides a window into a subculture hell-bent on expressing itself in every way imaginable .|6 <br/>
' ( the cockettes ) provides a window into a subculture hell-bent on expressing itself in every way imaginable . '|7 <br/>
**Have sentiment:** <br/>
5|0.375 <br/>
6|0.41667 <br/> 
7|0.54167 <br/>

In [9]:
with open('data_sets/stanfordSentiment/sentiment.txt') as f:
    for line in f:
        text, idx = line.split('|')

# 2. Clean and Preprocess

In [10]:
def regex(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text

In [11]:
%%time
# Remove punctuaction and lower all texts
train_df.text = train_df.text.apply(lambda row: regex(row))
test_df.text = test_df.text.apply(lambda row: regex(row))

Wall time: 1.55 s


In [12]:
train_df.head()

Unnamed: 0,text,rating
0,bromwell high is a cartoon comedy it ran at th...,9
1,homelessness or houselessness as george carlin...,8
2,brilliant overacting by lesley ann warren best...,10
3,this is easily the most underrated film inn th...,7
4,this is not the typical mel brooks film it was...,8


# 3. pure Naive Bayes

In [13]:
# consider only rating 1 and 10
bayes_df_train = train_df[(train_df.rating == '1') | (train_df.rating == '10')]
bayes_df_test = test_df[(test_df.rating == '1') | (test_df.rating == '10')]
GOOD_WORDS = defaultdict(int)
BAD_WORDS = defaultdict(int)

In [14]:
for index, row in tqdm(bayes_df_train.iterrows(), desc='Creating Bayes dictionaries', position=0):
    text, rating = row['text'], row['rating']
    
    for word in text.split():
        if rating == '10':
            GOOD_WORDS[word] += 1
        else:
            BAD_WORDS[word] += 1

Creating Bayes dictionaries: 9832it [00:01, 7132.18it/s]


In [15]:
# most frequent GOOD words
list(sorted(GOOD_WORDS.items(), key=lambda x: x[1], reverse=True))[:10]

[('the', 56972),
 ('and', 30301),
 ('a', 26016),
 ('of', 25791),
 ('to', 22249),
 ('is', 19380),
 ('in', 16257),
 ('i', 14729),
 ('it', 14595),
 ('this', 13341)]

In [16]:
# most frequent BAD words
list(sorted(BAD_WORDS.items(), key=lambda x: x[1], reverse=True))[:10]

[('the', 58427),
 ('a', 28386),
 ('and', 26617),
 ('to', 26292),
 ('of', 25218),
 ('is', 18370),
 ('this', 18239),
 ('i', 17985),
 ('it', 15645),
 ('in', 15538)]

In [17]:
def classify(text, target_dict):
    text = regex(text)
    
    for word in text.split():
        if not word in target_dict:
            target_dict[word] = 1
            
    sum_of_all = sum(target_dict.values())
    
    ppd = 0
    for word in text.split():
        ppd += np.log2(float(target_dict[word]) / sum_of_all)
        
    return ppd
    
    
def predict(text):
    ppd_good = classify(text, GOOD_WORDS)
    ppd_bad = classify(text, BAD_WORDS)

    
    all_ppd = np.array([ppd_good, ppd_bad])
    target = ['10', '1'][np.argmax(all_ppd)]
    
    return target

### 3.1 Checking train accuracy

In [32]:
correct, wrong = 0, 0
real_targets, predictions = [], []

for index, row in tqdm(bayes_df_train.iterrows(), desc='Checking train accuracy', position=0):
    text, rating = row['text'], row['rating']
    
    pred = predict(text)
    real_targets.append(rating)
    predictions.append(pred)
    if pred == rating:
        correct += 1
    else:
        wrong += 1
        
    if pred == '10' and rating == '1':
        FP += 1

Checking train accuracy: 9832it [00:23, 423.25it/s]


In [33]:
print(f'Correct: {correct}, Wrong: {wrong}')
print(f'Accuracy: {correct / (wrong + correct) * 100}%')

M = metrics.confusion_matrix(predictions, real_targets)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

Correct: 9311, Wrong: 521
Accuracy: 94.70097640358016%

Confusion matrix:
[[4976  397]
 [ 124 4335]]

True negative (rating = 1): 4976
True positive (rating = 10): 4335
False negative: 397
False positive: 124


*So we are rather sceptic and most of our mistakes are movies which are good, but we classify them as bad.

### 3.2 Checking test accuracy

In [27]:
correct, wrong = 0, 0
real_targets, predictions = [], []

for index, row in tqdm(bayes_df_test.iterrows(), desc='Checking test accuracy', position=0):
    text, rating = row['text'], row['rating']
    
    pred = predict(text)
    real_targets.append(rating)
    predictions.append(pred)

    if pred == rating:
        correct += 1
    else:
        wrong += 1

Checking test accuracy: 10021it [00:23, 424.32it/s]


In [28]:
print(f'Correct: {correct}, Wrong: {wrong}')
print(f'Accuracy: {correct / (wrong + correct) * 100}%')

M = metrics.confusion_matrix(predictions, real_targets)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

Correct: 8877, Wrong: 1144
Accuracy: 88.58397365532382%

Confusion matrix:
[[4690  812]
 [ 332 4187]]

True negative (rating = 1): 4690
True positive (rating = 10): 4187
False negative: 812
False positive: 332


*Again we are sceptic

# 4. stemmed Naive Bayes without stop words