In [27]:
# set up
import numpy as np
import pandas as pd
import glob
import re
import string 

from tqdm import tqdm

# 1. Preparing data (IMDb movies' reviews)

In [2]:
train_pos_path = 'data_sets/movies/train/pos/*'
train_neg_path = 'data_sets/movies/train/neg/*'

train_pos = glob.glob(train_pos_path)
train_neg = glob.glob(train_neg_path)


test_pos_path = 'data_sets/movies/test/pos/*'
test_neg_path = 'data_sets/movies/test/neg/*'

test_pos = glob.glob(test_pos_path)
test_neg = glob.glob(test_neg_path)

In [3]:
train_pos[:3]

['data_sets/movies/train/pos\\0_9.txt',
 'data_sets/movies/train/pos\\10000_8.txt',
 'data_sets/movies/train/pos\\10001_10.txt']

In [4]:
train_df = []
test_df = []

In [5]:
for path in tqdm(train_pos, desc='Getting positive train data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        beg, end = path.find('\\'), path.find('.')
        idx, rating = path[beg+1:end].split('_')
        train_df.append([text, rating])
        
        

for path in tqdm(train_neg, desc='Getting negative train data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        beg, end = path.find('\\'), path.find('.')
        idx, rating = path[beg+1:end].split('_')
        train_df.append([text, rating])
        
        
        
for path in tqdm(test_pos, desc='Getting positive test data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        beg, end = path.find('\\'), path.find('.')
        idx, rating = path[beg+1:end].split('_')
        test_df.append([text, rating])
        
        
        
for path in tqdm(test_neg, desc='Getting negative test data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        beg, end = path.find('\\'), path.find('.')
        idx, rating = path[beg+1:end].split('_')
        test_df.append([text, rating])

Getting positive train data: 100%|██████████████████████████████████████████████| 12500/12500 [00:41<00:00, 301.96it/s]
Getting negative train data: 100%|██████████████████████████████████████████████| 12500/12500 [00:41<00:00, 298.66it/s]
Getting positive test data: 100%|███████████████████████████████████████████████| 12500/12500 [00:40<00:00, 305.23it/s]
Getting negative test data: 100%|███████████████████████████████████████████████| 12500/12500 [00:37<00:00, 336.68it/s]


In [6]:
train_df = pd.DataFrame(train_df, columns=['text', 'rating'])
test_df = pd.DataFrame(test_df, columns=['text', 'rating'])

In [74]:
print('Records: ', train_df.size)
train_df.head()

Records:  50000


Unnamed: 0,text,rating
0,bromwell high is a cartoon comedy it ran at th...,9
1,homelessness or houselessness as george carlin...,8
2,brilliant overacting by lesley ann warren best...,10
3,this is easily the most underrated film inn th...,7
4,this is not the typical mel brooks film it was...,8


In [76]:
for i in range(1, 11):
    print(f'Number of reviews with rating {i}: {train_df[train_df.rating == str(i)].size}')

Number of reviews with rating 1: 10200
Number of reviews with rating 2: 4568
Number of reviews with rating 3: 4840
Number of reviews with rating 4: 5392
Number of reviews with rating 5: 0
Number of reviews with rating 6: 0
Number of reviews with rating 7: 4992
Number of reviews with rating 8: 6018
Number of reviews with rating 9: 4526
Number of reviews with rating 10: 9464


### *We might consider (?or not?) only movies with reviews 1(terrible) and 10(perfect)

# 1.1 Preparing data (Stanford Sentiment)
### (or not because this DS doesn't make sense) 
**for eg. sentences:** <br/>
' ( the cockettes ) provides a window into a subculture hell-bent on expressing itself in every way imaginable|5 <br/>
' ( the cockettes ) provides a window into a subculture hell-bent on expressing itself in every way imaginable .|6 <br/>
' ( the cockettes ) provides a window into a subculture hell-bent on expressing itself in every way imaginable . '|7 <br/>
**Have sentiment:** <br/>
5|0.375 <br/>
6|0.41667 <br/> 
7|0.54167 <br/>

In [9]:
with open('data_sets/stanfordSentiment/sentiment.txt') as f:
    for line in f:
        text, idx = line.split('|')

# 2. Clean and Preprocess

In [41]:
def regex(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text

In [46]:
# Remove punctuaction and lower all texts
train_df.text = train_df.text.apply(lambda row: regex(row))

In [58]:
train_df.head()

Unnamed: 0,text,rating
0,bromwell high is a cartoon comedy it ran at th...,9
1,homelessness or houselessness as george carlin...,8
2,brilliant overacting by lesley ann warren best...,10
3,this is easily the most underrated film inn th...,7
4,this is not the typical mel brooks film it was...,8


# 3. pure Naive Bayes

In [101]:
# consider only rating 1 and 10
bayes_df = train_df[(train_df.rating == '1') |  (train_df.rating == '10')]

In [102]:
for index, row in tqdm(bayes_df.head().iterrows(), desc='Creating Bayes dictionaries', position=0):
    text, rating = row['text'], row['rating']
    
    if rating == '10':
        pass
    else:
        pass

Creating Bayes dictionaries: 5it [00:00, 5008.72it/s]
