In [15]:
# set up
import numpy as np
import pandas as pd
import glob
import re
import string 
from collections import defaultdict
from sklearn import metrics
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import scipy.optimize as sopt

# 1. Preparing data (IMDb movies' reviews)

In [2]:
train_pos_path = 'data_sets/movies/train/pos/*'
train_neg_path = 'data_sets/movies/train/neg/*'

train_pos = glob.glob(train_pos_path)
train_neg = glob.glob(train_neg_path)


test_pos_path = 'data_sets/movies/test/pos/*'
test_neg_path = 'data_sets/movies/test/neg/*'

test_pos = glob.glob(test_pos_path)
test_neg = glob.glob(test_neg_path)

In [3]:
train_pos[:3]

['data_sets/movies/train/pos/2239_7.txt',
 'data_sets/movies/train/pos/3291_8.txt',
 'data_sets/movies/train/pos/11920_9.txt']

In [5]:
train_df = []
test_df = []

In [6]:
for path in tqdm(train_pos, desc='Getting positive train data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        # na windowsie to -> beg = path.find('\\') 
        beg = re.search(r"\d",path).start()-1
        end = path.find('.')
        idx, rating = path[beg+1:end].split('_')
        train_df.append([text, rating])
        
        

for path in tqdm(train_neg, desc='Getting negative train data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        # beg, end = path.find('\\'), path.find('.')
        beg = re.search(r"\d",path).start()-1
        end = path.find('.')
        idx, rating = path[beg+1:end].split('_')
        train_df.append([text, rating])
        
        
        
for path in tqdm(test_pos, desc='Getting positive test data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        #beg, end = path.find('\\'), path.find('.')
        beg = re.search(r"\d",path).start()-1
        end = path.find('.')
        idx, rating = path[beg+1:end].split('_')
        test_df.append([text, rating])
        
        
        
for path in tqdm(test_neg, desc='Getting negative test data', position=0):
    with open(path, encoding="utf8") as f:
        text = f.read()
        #beg, end = path.find('\\'), path.find('.')
        beg = re.search(r"\d",path).start()-1
        end = path.find('.')
        idx, rating = path[beg+1:end].split('_')
        test_df.append([text, rating])

Getting positive train data: 100%|██████████| 12500/12500 [00:03<00:00, 3971.50it/s]
Getting negative train data: 100%|██████████| 12500/12500 [00:02<00:00, 4195.53it/s]
Getting positive test data: 100%|██████████| 12500/12500 [00:03<00:00, 3903.77it/s]
Getting negative test data: 100%|██████████| 12500/12500 [00:03<00:00, 3969.23it/s]


In [7]:
train_df = pd.DataFrame(train_df, columns=['text', 'rating'])
test_df = pd.DataFrame(test_df, columns=['text', 'rating'])

In [8]:
print('Records: ', train_df.size)
train_df.head()

Records:  50000


Unnamed: 0,text,rating
0,I guess this movie will only work on people wh...,7
1,"A great film this, and a shame that it will re...",8
2,Christopher Lloyd is funny and really believab...,9
3,This is a thinking man's silly movie. If you d...,10
4,Daniel Day Lewis is one of the best actors of ...,10


In [9]:
for i in range(1, 11):
    print(f'Number of reviews with rating {i}: {train_df[train_df.rating == str(i)].shape[0]}')

Number of reviews with rating 1: 5100
Number of reviews with rating 2: 2284
Number of reviews with rating 3: 2420
Number of reviews with rating 4: 2696
Number of reviews with rating 5: 0
Number of reviews with rating 6: 0
Number of reviews with rating 7: 2496
Number of reviews with rating 8: 3009
Number of reviews with rating 9: 2263
Number of reviews with rating 10: 4732


### *We might consider (?or not?) only movies with reviews 1(terrible) and 10(perfect)

# 2. Clean and Preprocess

In [10]:
def regex(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text

In [11]:
%%time
# Remove punctuaction and lower all texts
train_df.text = train_df.text.apply(lambda row: regex(row))
test_df.text = test_df.text.apply(lambda row: regex(row))

CPU times: user 2.74 s, sys: 7.95 ms, total: 2.75 s
Wall time: 2.75 s


In [12]:
train_df.head()

Unnamed: 0,text,rating
0,i guess this movie will only work on people wh...,7
1,a great film this and a shame that it will rec...,8
2,christopher lloyd is funny and really believab...,9
3,this is a thinking mans silly movie if you don...,10
4,daniel day lewis is one of the best actors of ...,10


In [13]:
print(train_df.text)

0        i guess this movie will only work on people wh...
1        a great film this and a shame that it will rec...
2        christopher lloyd is funny and really believab...
3        this is a thinking mans silly movie if you don...
4        daniel day lewis is one of the best actors of ...
                               ...                        
24995    i understand what this movie was trying to por...
24996    a friend of mine bought this very cheaply and ...
24997    my friends and i rented this for bad movie nig...
24998    moron and girlfriend conduct some ritual to re...
24999    ok people honestly this gotta be one of the wo...
Name: text, Length: 25000, dtype: object


In [14]:
consider only rating 1 and 10
bayes_df_train = train_df[(train_df.rating == '1') | (train_df.rating == '10')]
bayes_df_test = test_df[(test_df.rating == '1') | (test_df.rating == '10')]
GOOD_WORDS = defaultdict(int)
BAD_WORDS = defaultdict(int)

In [15]:
for index, row in tqdm(bayes_df_train.iterrows(), desc='Creating Bayes dictionaries', position=0):
    text, rating = row['text'], row['rating']
    
    for word in text.split():
        if rating == '10':
            GOOD_WORDS[word] += 1
        else:
            BAD_WORDS[word] += 1

In [16]:
# most frequent GOOD words
list(sorted(GOOD_WORDS.items(), key=lambda x: x[1], reverse=True))[:10]

In [17]:
# most frequent BAD words
list(sorted(BAD_WORDS.items(), key=lambda x: x[1], reverse=True))[:10]

In [18]:
def classify(text, target_dict):
    text = regex(text)
    
    for word in text.split():
        if not word in target_dict:
            target_dict[word] = 1
            
    sum_of_all = sum(target_dict.values())
    
    ppd = 0
    for word in text.split():
        ppd += np.log2(float(target_dict[word]) / sum_of_all)
        
    return ppd
    
    
def predict(text):
    ppd_good = classify(text, GOOD_WORDS)
    ppd_bad = classify(text, BAD_WORDS)

    
    all_ppd = np.array([ppd_good, ppd_bad])
    target = ['10', '1'][np.argmax(all_ppd)]
    
    return target

### 3.1 Checking train accuracy

In [19]:
correct, wrong = 0, 0
real_targets, predictions = [], []

for index, row in tqdm(bayes_df_train.iterrows(), desc='Checking train accuracy', position=0):
    text, rating = row['text'], row['rating']
    
    pred = predict(text)
    real_targets.append(rating)
    predictions.append(pred)
    if pred == rating:
        correct += 1
    else:
        wrong += 1
        
   if pred == '10' and rating == '1':
       FP += 1

In [20]:
print(f'Correct: {correct}, Wrong: {wrong}')
print(f'Accuracy: {correct / (wrong + correct) * 100}%')

M = metrics.confusion_matrix(predictions, real_targets)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

*So we are rather sceptic and most of our mistakes are movies which are good, but we classify them as bad.

### 3.2 Checking test accuracy

In [21]:
correct, wrong = 0, 0
real_targets, predictions = [], []

for index, row in tqdm(bayes_df_test.iterrows(), desc='Checking test accuracy', position=0):
    text, rating = row['text'], row['rating']
    
    pred = predict(text)
    real_targets.append(rating)
    predictions.append(pred)

    if pred == rating:
        correct += 1
    else:
        wrong += 1

In [22]:
print(f'Correct: {correct}, Wrong: {wrong}')
print(f'Accuracy: {correct / (wrong + correct) * 100}%')

M = metrics.confusion_matrix(predictions, real_targets)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

*Again we are sceptic

# 5. sklearn built in Logistic Regression

### 5.1 Data vectorization (one-hot encoding)

In [12]:
reviews_train_clean = np.array(train_df.text)
reviews_test_clean = np.array(test_df.text)
cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)     

In [14]:
target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:    
    lr = LogisticRegression(C=c,max_iter=300) # pobawic sie parametrami znalezc najlepsze
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.86768
Accuracy for C=0.05: 0.87632
Accuracy for C=0.25: 0.8768
Accuracy for C=0.5: 0.87568
Accuracy for C=1: 0.87296


# 6. pure Logistic Regression

In [16]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [18]:
def logreg_loss(Theta, X, Y):
    Z = Theta.T.dot(X)
    nll = -np.sum([
                    y * np.log2(sigmoid(Z)) \
                    + (1-y) * np.log2(1 - sigmoid(Z)) \
                    for y in Y
                    ])
    
    grad = X.dot((sigmoid(Z) - Y).T)
    return nll, grad.reshape(Theta.shape)


### 6.1. Data vectorization (one-hot encoding)

In [19]:
reviews_train_clean = np.array(train_df.text)
reviews_test_clean = np.array(test_df.text)
cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)   

In [20]:

target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

In [21]:
def logreg_classify(Theta,x):
    return (Theta.T.dot(x) >= 0)

def logreg_predict(Theta,Xs):
    return [logreg_classify(Theta,x) for x in Xs]

In [34]:
print(X_train.shape)

(18750, 121004)


In [35]:
Theta0 = np.ones((X_train.shape[1]))
ThetaOpt = sopt.fmin_l_bfgs_b(lambda Theta: logreg_loss(Theta, X_train, y_train), np.array(Theta0),maxiter=100,maxfun=100,factr=1e12)[0] #liczy w nieskonczonosc albo bardzo długo

In [23]:
print ("Accuracy: %s"  % accuracy_score(y_val, logreg_predict(ThetaOpt,X_val)))


