In [2]:
# set up
import numpy as np
import pandas as pd
import glob
import re
import string 
from collections import defaultdict
from sklearn import metrics
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import scipy.optimize as sopt

## 0. Macro for windows / mac users

In [7]:
system = 'Mac'
# system = 'Win'

# 1. Preparing data (IMDb movies' reviews)

In [8]:
train_pos_path = 'data_sets/aclImdb/train/pos/*'
train_neg_path = 'data_sets/aclImdb/train/neg/*'

train_pos = glob.glob(train_pos_path)
train_neg = glob.glob(train_neg_path)


test_pos_path = 'data_sets/aclImdb/test/pos/*'
test_neg_path = 'data_sets/aclImdb/test/neg/*'

test_pos = glob.glob(test_pos_path)
test_neg = glob.glob(test_neg_path)

In [13]:
train_df = []
test_df = []

for path in tqdm(train_pos, desc='Getting positive train data', position=0, leave=False):
    with open(path, encoding="utf8") as f:
        text = f.read()
        
#         For win users
        if system == 'Win':
            beg, end = path.find('\\'), path.find('.')
        
#         For mac users
        if system == 'Mac':
            beg = re.search(r"\d",path).start()-1
            end = path.find('.')
            
        idx, rating = path[beg+1:-4].split('_')
        train_df.append([text, rating])
        
for path in tqdm(train_neg, desc='Getting negative train data', position=0, leave=False):
    with open(path, encoding="utf8") as f:
        text = f.read()
        
#         For win users
        if system == 'Win':
            beg, end = path.find('\\'), path.find('.')
        
#         For mac users
        if system == 'Mac':
            beg = re.search(r"\d",path).start()-1
            end = path.find('.')

        idx, rating = path[beg+1:-4].split('_')
        train_df.append([text, rating])
         
for path in tqdm(test_pos, desc='Getting positive test data', position=0, leave=False):
    with open(path, encoding="utf8") as f:
        text = f.read()
        
#         For win users
        if system == 'Win':
            beg, end = path.find('\\'), path.find('.')
        
#         For mac users
        if system == 'Mac':
            beg = re.search(r"\d",path).start()-1
            end = path.find('.')
            
        idx, rating = path[beg+1:-4].split('_')
        test_df.append([text, rating])
   
for path in tqdm(test_neg, desc='Getting negative test data', position=0, leave=False):
    with open(path, encoding="utf8") as f:
        text = f.read()

#         For win users
        if system == 'Win':
            beg, end = path.find('\\'), path.find('.')
        
#         For mac users
        if system == 'Mac':
            beg = re.search(r"\d",path).start()-1
            end = path.find('.')
            
        idx, rating = path[beg+1:-4].split('_')
        test_df.append([text, rating])

                                                                                    

In [4]:
train_df = pd.DataFrame(train_df, columns=['text', 'rating'])
test_df = pd.DataFrame(test_df, columns=['text', 'rating'])

In [5]:
print('Records: ', train_df.size)
train_df.head()

Records:  50000


Unnamed: 0,text,rating
0,Bromwell High is a cartoon comedy. It ran at t...,9
1,Homelessness (or Houselessness as George Carli...,8
2,Brilliant over-acting by Lesley Ann Warren. Be...,10
3,This is easily the most underrated film inn th...,7
4,This is not the typical Mel Brooks film. It wa...,8


In [6]:
for i in range(1, 11):
    print(f'Number of reviews with rating {i}: {train_df[train_df.rating == str(i)].shape[0]}')

Number of reviews with rating 1: 5100
Number of reviews with rating 2: 2284
Number of reviews with rating 3: 2420
Number of reviews with rating 4: 2696
Number of reviews with rating 5: 0
Number of reviews with rating 6: 0
Number of reviews with rating 7: 2496
Number of reviews with rating 8: 3009
Number of reviews with rating 9: 2263
Number of reviews with rating 10: 4732


### *We might consider (?or not?) only movies with reviews 1(terrible) and 10(perfect)

# 2. Clean and Preprocess

In [7]:
def regex(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text

In [8]:
%%time
# Remove punctuaction and lower all texts
train_df.text = train_df.text.apply(lambda row: regex(row))
test_df.text = test_df.text.apply(lambda row: regex(row))

Wall time: 2.24 s


In [9]:
train_df.head()

Unnamed: 0,text,rating
0,bromwell high is a cartoon comedy it ran at th...,9
1,homelessness or houselessness as george carlin...,8
2,brilliant overacting by lesley ann warren best...,10
3,this is easily the most underrated film inn th...,7
4,this is not the typical mel brooks film it was...,8


In [10]:
print(train_df.text)

0        bromwell high is a cartoon comedy it ran at th...
1        homelessness or houselessness as george carlin...
2        brilliant overacting by lesley ann warren best...
3        this is easily the most underrated film inn th...
4        this is not the typical mel brooks film it was...
                               ...                        
24995    towards the end of the movie i felt it was too...
24996    this is the kind of movie that my enemies cont...
24997    i saw descent last night at the stockholm film...
24998    some films that you pick up for a pound turn o...
24999    this is one of the dumbest films ive ever seen...
Name: text, Length: 25000, dtype: object


In [11]:
# consider only rating 1 and 10
bayes_df_train = train_df[(train_df.rating == '1') | (train_df.rating == '10')]
bayes_df_test = test_df[(test_df.rating == '1') | (test_df.rating == '10')]
GOOD_WORDS = defaultdict(int)
BAD_WORDS = defaultdict(int)

In [12]:
for index, row in tqdm(bayes_df_train.iterrows(), desc='Creating Bayes dictionaries', position=0):
    text, rating = row['text'], row['rating']
    
    for word in text.split():
        if rating == '10':
            GOOD_WORDS[word] += 1
        else:
            BAD_WORDS[word] += 1

Creating Bayes dictionaries: 9832it [00:02, 4182.13it/s]


In [13]:
# most frequent GOOD words
list(sorted(GOOD_WORDS.items(), key=lambda x: x[1], reverse=True))[:10]

[('the', 56972),
 ('and', 30301),
 ('a', 26016),
 ('of', 25791),
 ('to', 22249),
 ('is', 19380),
 ('in', 16257),
 ('i', 14729),
 ('it', 14595),
 ('this', 13341)]

In [14]:
# most frequent BAD words
list(sorted(BAD_WORDS.items(), key=lambda x: x[1], reverse=True))[:10]

[('the', 58427),
 ('a', 28386),
 ('and', 26617),
 ('to', 26292),
 ('of', 25218),
 ('is', 18370),
 ('this', 18239),
 ('i', 17985),
 ('it', 15645),
 ('in', 15538)]

In [15]:
def classify(text, target_dict):
    text = regex(text)
    
    for word in text.split():
        if not word in target_dict:
            target_dict[word] = 1
            
    sum_of_all = sum(target_dict.values())
    
    ppd = 0
    for word in text.split():
        ppd += np.log2(float(target_dict[word]) / sum_of_all)
        
    return ppd
    
    
def predict(text):
    ppd_good = classify(text, GOOD_WORDS)
    ppd_bad = classify(text, BAD_WORDS)
    
    all_ppd = np.array([ppd_good, ppd_bad])
    target = ['10', '1'][np.argmax(all_ppd)]
    
    return target

### 3.1 Checking train accuracy

In [16]:
correct, wrong = 0, 0
real_targets, predictions = [], []

for index, row in tqdm(bayes_df_train.iterrows(), desc='Checking train accuracy', position=0):
    text, rating = row['text'], row['rating']
    
    pred = predict(text)
    real_targets.append(rating)
    predictions.append(pred)
    if pred == rating:
        correct += 1
    else:
        wrong += 1

Checking train accuracy: 9832it [00:25, 378.44it/s]


In [17]:
print(f'Correct: {correct}, Wrong: {wrong}')
print(f'Accuracy: {correct / (wrong + correct) * 100}%')

M = metrics.confusion_matrix(predictions, real_targets)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

Correct: 9346, Wrong: 486
Accuracy: 95.05695687550855%

Confusion matrix:
[[4922  308]
 [ 178 4424]]

True negative (rating = 1): 4922
True positive (rating = 10): 4424
False negative: 308
False positive: 178


*So we are rather sceptic and most of our mistakes are movies which are good, but we classify them as bad.

### 3.2 Checking test accuracy

In [18]:
correct, wrong = 0, 0
real_targets, predictions = [], []

for index, row in tqdm(bayes_df_test.iterrows(), desc='Checking test accuracy', position=0):
    text, rating = row['text'], row['rating']
    
    pred = predict(text)
    real_targets.append(rating)
    predictions.append(pred)

    if pred == rating:
        correct += 1
    else:
        wrong += 1

Checking test accuracy: 10021it [00:33, 297.44it/s]


In [19]:
print(f'Correct: {correct}, Wrong: {wrong}')
print(f'Accuracy: {correct / (wrong + correct) * 100}%')

M = metrics.confusion_matrix(predictions, real_targets)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

Correct: 8901, Wrong: 1120
Accuracy: 88.82347071150583%

Confusion matrix:
[[4689  787]
 [ 333 4212]]

True negative (rating = 1): 4689
True positive (rating = 10): 4212
False negative: 787
False positive: 333


*Again we are sceptic

# 4.5 Class Naive Bayes

In [111]:
class Naive_Bayes:
    def __init__(self,alpha=0,fit_prior=True,class_prior=None):
        self.alpha = alpha
        self.fit_prior = fit_prior
        self.class_prior_array = class_prior
        if class_prior:
            self.fit_prior = False
    
    
    def fit(self,X,y):
        self.classes,prior = np.unique(y,return_counts=True)
        self.N = len(y)
        
        # Setting class prior
        if self.fit_prior:
            self.class_prior = {class_ : np.log(prior[i]/self.N + 1e-100) for i,class_ in enumerate(self.classes)}
        elif self.class_prior_array:
            self.class_prior = {class_ : np.log(self.class_prior_array[i] + 1e-100) for i,class_ in enumerate(self.classes)}
        else:
            self.class_prior = {class_ : np.log(1/len(self.classes) + 1e-100) for class_ in self.classes}
            
        # Creating words dictionaries
        self.class_words_counts = {class_ : defaultdict(lambda: 0) for class_ in self.classes}
        for i,text in enumerate(X):
            target = y[i]
            for word in text.split():
                self.class_words_counts[target][word] += 1
        
        # Creating probabilities dictionaries
        self.class_words_probs = {class_ : defaultdict(lambda: np.log(self.alpha + 1e-100)) for class_ in self.classes}
        for class_,dict_ in self.class_words_counts.items():
            for word,count in dict_.items():
                self.class_words_probs[class_][word] = np.log(count + 1e-100)
    
        self.class_words_amount = {class_ : np.log(sum(self.class_words_counts[class_].values())) for class_ in self.classes}
    

    def get_class_log_probabilities(self,text):
        probs = {class_ : 0 for class_ in self.classes}
        for class_ in self.classes:
            for word in text.split():
                probs[class_] += self.class_words_probs[class_][word]
                probs[class_] -= self.class_words_amount[class_]
            probs[class_] += self.class_prior[class_]
        return probs
    
    
    def predict(self,X,return_probabilities = False):
        preds = []
        preds_probs = []
        for text in X:
            prob = self.get_class_log_probabilities(text)
            #prob = {class_ : np.exp(pbb) for class_,pbb in prob.items()}
            preds_probs.append(prob)
            pred = max(prob,key = prob.get)
            preds.append(pred)
        
        if return_probabilities:
            return preds,preds_probs
        return preds
            

In [61]:
bayes_df_train = train_df[(train_df.rating == '1') | (train_df.rating == '10')]
bayes_df_test = test_df[(test_df.rating == '1') | (test_df.rating == '10')]

In [62]:
X_train,y_train = np.array(bayes_df_train['text']),np.array(bayes_df_train['rating'])
X_test,y_test = np.array(bayes_df_test['text']),np.array(bayes_df_test['rating'])

In [126]:
alpha = 1.0
NB = Naive_Bayes(fit_prior = False,alpha=alpha)

In [127]:
NB.fit(X_train,y_train)

In [128]:
predictions,ppb = NB.predict(X_train,return_probabilities=True)

In [129]:
print(f"TRAIN, alpha : {alpha}")
print(f"Acc: {np.mean(predictions == y_train)}")
M = metrics.confusion_matrix(predictions, y_train)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

TRAIN, alpha : 1.0
Acc: 0.9491456468673718

Confusion matrix:
[[4963  363]
 [ 137 4369]]

True negative (rating = 1): 4963
True positive (rating = 10): 4369
False negative: 363
False positive: 137


In [130]:
predictions,ppb = NB.predict(X_test,return_probabilities=True)

In [131]:
print(f"TEST, alpha : {alpha}")
print(f"Acc: {np.mean(predictions == y_test)}")
M = metrics.confusion_matrix(predictions, y_test)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

TEST, alpha : 1.0
Acc: 0.8884342879952101

Confusion matrix:
[[4660  756]
 [ 362 4243]]

True negative (rating = 1): 4660
True positive (rating = 10): 4243
False negative: 756
False positive: 362


In [132]:
alpha = 0.0
NB = Naive_Bayes(fit_prior = False,alpha=alpha)

In [133]:
NB.fit(X_train,y_train)

In [134]:
predictions,ppb = NB.predict(X_train,return_probabilities=True)

In [135]:
print(f"TRAIN, alpha : {alpha}")
print(f"Acc: {np.mean(predictions == y_train)}")
M = metrics.confusion_matrix(predictions, y_train)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

TRAIN, alpha : 0.0
Acc: 0.9899308380797396

Confusion matrix:
[[5092   91]
 [   8 4641]]

True negative (rating = 1): 5092
True positive (rating = 10): 4641
False negative: 91
False positive: 8


In [136]:
predictions,ppb = NB.predict(X_test,return_probabilities=True)

In [137]:
print(f"TEST, alpha : {alpha}")
print(f"Acc: {np.mean(predictions == y_test)}")
M = metrics.confusion_matrix(predictions, y_test)
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

TEST, alpha : 0.0
Acc: 0.7240794331903003

Confusion matrix:
[[3932 1675]
 [1090 3324]]

True negative (rating = 1): 3932
True positive (rating = 10): 3324
False negative: 1675
False positive: 1090


In [139]:
for alpha in [0.0,0.25,0.5,0.75,1.0,1.25,1.5,1.75,2.0,3.0,4.0,5.0,10.0]:
    NB = Naive_Bayes(fit_prior = False,alpha=alpha)
    NB.fit(X_train,y_train)
    predictions,ppb = NB.predict(X_test,return_probabilities=True)
    acc = np.mean(predictions == y_test)
    print(f'Alpha : {alpha}, test acc: {acc}')

Alpha : 0.0, test acc: 0.7240794331903003
Alpha : 0.25, test acc: 0.8820476998303562
Alpha : 0.5, test acc: 0.8868376409539966
Alpha : 0.75, test acc: 0.8882347071150584
Alpha : 1.0, test acc: 0.8884342879952101
Alpha : 1.25, test acc: 0.8881349166749826
Alpha : 1.5, test acc: 0.8893324019558926
Alpha : 1.75, test acc: 0.8884342879952101
Alpha : 2.0, test acc: 0.8882347071150584
Alpha : 3.0, test acc: 0.8872368027142999
Alpha : 4.0, test acc: 0.8862388983135415
Alpha : 5.0, test acc: 0.8830456042311147
Alpha : 10.0, test acc: 0.8759604829857299


Now we can consider if stemming or removing stop words can imporove accuracy

For stemming we will use Snowball Stemming (Porter2)

In [143]:
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

In [149]:
def stem_(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

In [150]:
def remove_stop_words(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

In [151]:
def stem_and_remove_stop_words(text):
    return ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])

In [155]:
stemmed_train_df = bayes_df_train.copy()
stemmed_test_df = bayes_df_test.copy()
stemmed_train_df.text = stemmed_train_df.text.apply(lambda row: stem_(row))
stemmed_test_df.text = stemmed_test_df.text.apply(lambda row: stem_(row))

In [156]:
swr_train_df = bayes_df_train.copy()
swr_test_df = bayes_df_test.copy()
swr_train_df.text = swr_train_df.text.apply(lambda row: remove_stop_words(row))
swr_test_df.text = swr_test_df.text.apply(lambda row: remove_stop_words(row))

In [157]:
stemmed_swr_train_df = bayes_df_train.copy()
stemmed_swr_test_df = bayes_df_test.copy()
stemmed_swr_train_df.text = stemmed_swr_train_df.text.apply(lambda row: stem_(row))
stemmed_swr_test_df.text = stemmed_swr_test_df.text.apply(lambda row: stem_(row))

In [164]:
alpha = 1.5

In [165]:
NB = Naive_Bayes(fit_prior = False,alpha=alpha)
NB.fit(np.array(stemmed_train_df['text']),np.array(stemmed_train_df['rating']))
predictions = NB.predict(np.array(stemmed_test_df['text']))
print(f"TEST, alpha : {alpha}, stemmed")
print(f"Acc: {np.mean(predictions == np.array(stemmed_test_df['rating']))}")
M = metrics.confusion_matrix(predictions, np.array(stemmed_test_df['rating']))
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

TEST, alpha : 1.5, stemmed
Acc: 0.8827462329108872

Confusion matrix:
[[4623  776]
 [ 399 4223]]

True negative (rating = 1): 4623
True positive (rating = 10): 4223
False negative: 776
False positive: 399


In [166]:
NB = Naive_Bayes(fit_prior = False,alpha=alpha)
NB.fit(np.array(swr_train_df['text']),np.array(swr_test_df['rating']))
predictions = NB.predict(np.array(swr_test_df['text']))
print(f"TEST, alpha : {alpha}, stop words removed")
print(f"Acc: {np.mean(predictions == np.array(swr_test_df['rating']))}")
M = metrics.confusion_matrix(predictions, np.array(swr_test_df['rating']))
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

TEST, alpha : 1.5, stop words removed
Acc: 0.8995110268436284

Confusion matrix:
[[4656  641]
 [ 366 4358]]

True negative (rating = 1): 4656
True positive (rating = 10): 4358
False negative: 641
False positive: 366


In [167]:
NB = Naive_Bayes(fit_prior = False,alpha=alpha)
NB.fit(np.array(stemmed_swr_train_df['text']),np.array(stemmed_swr_train_df['rating']))
predictions = NB.predict(np.array(stemmed_swr_test_df['text']))
print(f"TEST, alpha : {alpha}, stemmed and stop words removed")
print(f"Acc: {np.mean(predictions == np.array(stemmed_swr_test_df['rating']))}")
M = metrics.confusion_matrix(predictions, np.array(stemmed_swr_test_df['rating']))
print('\nConfusion matrix:')
print(M)
print(f'\nTrue negative (rating = 1): {M[0][0]}')
print(f'True positive (rating = 10): {M[1][1]}')
print(f'False negative: {M[0][1]}')
print(f'False positive: {M[1][0]}')

TEST, alpha : 1.5, stemmed and stop words removed
Acc: 0.8827462329108872

Confusion matrix:
[[4623  776]
 [ 399 4223]]

True negative (rating = 1): 4623
True positive (rating = 10): 4223
False negative: 776
False positive: 399


# 5. sklearn built in Logistic Regression

### 5.1 Data vectorization (one-hot encoding)

In [22]:
reviews_train_clean = np.array(train_df.text)
reviews_test_clean = np.array(test_df.text)
cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)     

In [23]:
target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:    
    lr = LogisticRegression(C=c,max_iter=300) # pobawic sie parametrami znalezc najlepsze
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.86576
Accuracy for C=0.05: 0.87856
Accuracy for C=0.25: 0.8784
Accuracy for C=0.5: 0.87648
Accuracy for C=1: 0.87504


# Text Preprocessing
Removing stop words, stemming, encode review as a vector of words occurences

In [24]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [25]:
# We will be using reviews with rating equal 1 or 10 from training set. 
# Test samples rating will be also classified as 1 or 10
LR_train_df = train_df[(train_df.rating == '1') | (train_df.rating == '10')]
LR_test_df = test_df[(test_df.rating == '1') | (test_df.rating == '10')]

In [26]:
LR_train_target = np.array([int(r) for r in LR_train_df['rating']]) // 10
LR_test_target = np.array([int(r) for r in LR_test_df['rating']]) // 10

In [27]:
LR_train_df.shape

(9832, 2)

In [28]:
stemmer = SnowballStemmer("english")
stopwords_set = set(stopwords.words('english'))

In [29]:
def preprocess(sample):
    words = sample.split()
    words = [ stemmer.stem(word) for word in words if word not in stopwords_set ]
    return words

In [30]:
# sample from train set
sample = '''brilliant overacting by lesley ann warren best dramatic hobo lady i 
have ever seen and love scenes in clothes warehouse are second to none the corn 
on face is a classic as good as anything in blazing saddles the take on lawyers 
is also superb after being accused of being a turncoat selling out his boss and 
being dishonest the lawyer of pepto bolt shrugs indifferently im a lawyer he says 
three funny words jeffrey tambor a favorite from the later larry sanders show is 
fantastic here too as a mad millionaire who wants to crush the ghetto his character 
is more malevolent than usual the hospital scene and the scene where the homeless 
invade a demolition site are alltime classics look for the legs scene and the two 
big diggers fighting one bleeds this movie gets better each time i see it which is 
quite often'''

In [31]:
" ".join(preprocess(sample))

'brilliant overact lesley ann warren best dramat hobo ladi ever seen love scene cloth warehous second none corn face classic good anyth blaze saddl take lawyer also superb accus turncoat sell boss dishonest lawyer pepto bolt shrug indiffer im lawyer say three funni word jeffrey tambor favorit later larri sander show fantast mad millionair want crush ghetto charact malevol usual hospit scene scene homeless invad demolit site alltim classic look leg scene two big digger fight one bleed movi get better time see quit often'

In [32]:
words = {}

In [33]:
%%time
for i in LR_train_df.index:
    sample = LR_train_df.loc[i]['text']
    #words |= set(preprocess(sample)) <- for set of words
    for word in preprocess(sample):
        words[word] = words.get(word,0) + 1

Wall time: 32.1 s


In [34]:
# from all stemmed words we consider only those, which occurs more than k times
min_occur = 3
all_train_words = list(filter(lambda w: words[w] >= min_occur,words))
n_w = len(all_train_words)
n_w

17385

In [35]:
all_train_words_indexes = {w:i for i,w in enumerate(all_train_words)}

In [36]:
def vectorize(df):
    data = []
    for i in df.index:
        v = np.zeros(n_w+1, dtype=np.int32)
        v[0] = 1
        sample = df.loc[i]['text']
        words = preprocess(sample)
        for word in words:
            idx = all_train_words_indexes.get(word,-1)
            if idx >= 0:
                v[idx+1] += 1
        data.append(v)
    return np.array(data)

In [37]:
vectorized_train = vectorize(LR_train_df)

In [38]:
vectorized_train.shape

(9832, 17386)

In [39]:
Theta0 = np.ones(vectorized_train.shape[1])

In [41]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [42]:
def logreg_loss(Theta, X, Y):
    print(f"Loss calculating... ",end="")
    Z = np.dot(Theta,X.T)
    print(f" Z done... ",end="")
    SZ = sigmoid(Z)
    nll = -np.sum([
                    y * np.log2(SZ + 1e-7) \
                    + (1-y) * np.log2(1 - SZ + 1e-7) \
                    for y in Y
                    ])
    print(f" nll done... ",end="")
    grad = np.dot(X.T, (SZ - Y).T )
    print(f" grad done... done ")
    return nll, grad.reshape(Theta.shape)

In [72]:
ThetaOpt = sopt.fmin_l_bfgs_b(lambda Theta: logreg_loss(Theta, vectorized_train, LR_train_target), Theta0, maxiter=500)[0]

Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done...  grad done... done 
Loss calculating...  Z done...  nll done

In [73]:
ThetaOpt

array([-3.89831833,  1.87011138,  0.52321257, ...,  0.96498814,
        0.89387323,  0.9807871 ])

In [76]:
Theta_df = pd.DataFrame(backup)
Theta_df

Unnamed: 0,0
0,-3.898318
1,1.870111
2,0.523213
3,0.974915
4,0.851720
...,...
17381,0.929919
17382,0.903717
17383,0.964988
17384,0.893873


In [78]:
export_csv = Theta_df.to_csv(r'LR_ThetaOpt.csv', index = True, header=None) 

In [80]:
words_pd = pd.DataFrame(all_train_words)
words_pd

Unnamed: 0,0
0,brilliant
1,overact
2,lesley
3,ann
4,warren
...,...
17380,hooten
17381,frewer
17382,wishbon
17383,ajax


In [81]:
export_csv2 = words_pd.to_csv(r'LR_all_words.csv', index = True, header=None)

In [46]:
vectorized_test = vectorize(LR_test_df)

In [43]:
def logreg_classify(Theta,x):
    return (Theta.T.dot(x) >= 0)

def logreg_predict(Theta,Xs):
    return [logreg_classify(Theta,x) for x in Xs]

In [74]:
print("Accuracy: %s"  % accuracy_score(LR_test_target, logreg_predict(ThetaOpt,vectorized_test)))

Accuracy: 0.8216744835844726


# 6. pure Logistic Regression

### 6.1. Data vectorization (one-hot encoding)

In [26]:
reviews_train_clean = np.array(train_df.text)
reviews_test_clean = np.array(test_df.text)
cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)   

In [60]:
np.mean((np.array([int(e) for e in train_df['rating']]) > 5) == np.array([True]*12500 + [False]*12500))

1.0

In [65]:
X

<25000x121004 sparse matrix of type '<class 'numpy.int64'>'
	with 3465481 stored elements in Compressed Sparse Row format>

In [43]:

target = [1 if i < 12500 )else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

In [29]:
print(X_train.shape)

(18750, 121004)


In [30]:
Theta0 = np.ones((X_train.shape[1]))
#ThetaOpt = sopt.fmin_l_bfgs_b(lambda Theta: logreg_loss(Theta, X_train, y_train), np.array(Theta0),maxiter=100,maxfun=100,factr=1e12)[0] #liczy w nieskonczonosc albo bardzo długo

In [31]:
print ("Accuracy: %s"  % accuracy_score(y_val, logreg_predict(ThetaOpt,X_val)))

NameError: name 'ThetaOpt' is not defined