Logistic Regression

In [1]:
import numpy as np
from tqdm import tqdm
from tensorflow import keras
import tensorflow

### Importing IMDB sentiment dataset

In [2]:
import keras
from keras.datasets import imdb

In [3]:
# LOAD IMDB DATA

(x_train, y_train), (x_test, y_test) = tensorflow.keras.datasets.imdb.load_data(
    path="imdb.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3
)

In [4]:
print("train_data ", x_train.shape)
print("train_labels ", y_train.shape)
print("_"*100)
print("test_data ", x_test.shape)
print("test_labels ", y_test.shape)
print("_"*100)
print("Maximum value of a word index ")
print(max([max(sequence) for sequence in x_train]))
print("Maximum length num words of review in train ")
print(max([len(sequence) for sequence in x_train]))

train_data  (25000,)
train_labels  (25000,)
____________________________________________________________________________________________________
test_data  (25000,)
test_labels  (25000,)
____________________________________________________________________________________________________
Maximum value of a word index 
88586
Maximum length num words of review in train 
2494


In [5]:
# Retrieve the word index file mapping words to indices
word_index = keras.datasets.imdb.get_word_index()

In [6]:
# Reverse the word index to obtain a dict mapping indices to words
inverted_word_index = dict((i, word) for (word, i) in word_index.items())
# Decode the first sequence in the dataset
decoded_sequence = " ".join(inverted_word_index[i] for i in x_train[0])

### Logistic Regression

In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [8]:
def import_lexicon(path):
    data = pd.read_csv(path, sep='\t', names=[0, 1, 2, 3])
    df = pd.DataFrame()
    df['token'] = data[0]
    df['sentiment'] = data[1]
    return df

In [9]:
def does_no_appear(review) -> int:
    if word_index["no"] in review:
        return 1
    return 0

In [10]:
def count_first_and_second_pro(review) -> int:
    count = 0
    for word in review:
        if word in [word_index['you'], word_index['i'], word_index['yours']]:
                    count += 1
    return count

In [11]:
try:
    word_index['!']
except KeyError:
    print("! not found, let's see if we find it in a word")

! not found, let's see if we find it in a word


Let's search for all the words finishing with !

As it would make no sense that the ! character appears before the end of the word

In [12]:
found = False
for key, value in word_index.items():
    if key.endswith('!') or key.endswith("! "):
        found = True
        #displaying the key finishing by ! and its value
        print(key, value)
print(found)

False


it appears no words ends with ! in the training set

Thus, there is no need to check for ! in the documents

In [13]:
def log_word_count_in_doc(review):
    return np.log(len(review))

In [14]:
def filter_lexicon(lexicon, word_index):
    # only the int value of the words present in index
    filtered_lexicon, i = pd.DataFrame([], columns=['token_int', 'token_string', 'sentiment']), 0
    for index in range(lexicon.shape[0]):
        token = lexicon['token'][index]
        if token in word_index.keys():
            mapped_token = word_index[token]
            filtered_lexicon.loc[i] = [mapped_token, lexicon.token[index], lexicon.sentiment[index]]
            i += 1
    return filtered_lexicon

In [15]:
def split_lexicon(lexicon):
    positive_df = lexicon[lexicon.sentiment > 0]
    positive_words = positive_df['token_int'].to_numpy().tolist()
    negative_df = lexicon[lexicon.sentiment < 0]
    negative_words = negative_df['token_int'].to_numpy().tolist()
    return positive_words, negative_words

In [16]:
def number_of_words_pos(review, positive_words):
    posi = np.isin(positive_words, review)
    return sum(posi)

def number_of_words_neg(review, negative_words):
    nega = np.isin(negative_words, review)
    return sum(nega)

In [17]:
def LoRegression(X_train, y_train):
    lexicon = import_lexicon("vader_lexicon.txt")
    word_index = keras.datasets.imdb.get_word_index()
    lexicon = filter_lexicon(lexicon, word_index)
    positive_words, negative_words = split_lexicon(lexicon)
    
    X_features = []
    for review in tqdm(x_train):
        #as said earlier, there is no ! characters in word index
        #thus it doesnt exist in the review
        feature = np.zeros(5)
        feature[0] = does_no_appear(review)
        feature[1] = count_first_and_second_pro(review)
        feature[2] = log_word_count_in_doc(review)
        feature[3] = number_of_words_neg(review, negative_words)
        feature[4] = number_of_words_pos(review, positive_words)

        X_features.append(feature)
    return np.asarray(X_features)

In [18]:
X_train_features = LoRegression(x_train, y_train)

100%|██████████| 25000/25000 [03:58<00:00, 104.91it/s]


In [19]:
X_train_features.shape

(25000, 5)

In [50]:
clf = LogisticRegression(random_state=0).fit(X_train_features, y_train)

In [36]:
#create X_test using loRegression to have a usable informations
X_test_features = LoRegression(x_test, y_test)

100%|██████████| 25000/25000 [04:03<00:00, 102.51it/s]


In [51]:
y_pred = clf.predict(X_test_features)
print(X_test_features.shape, y_pred.shape)

(25000, 5) (25000,)


In [52]:
clf.score(X_test_features, y_test)

0.49396

In [47]:
from sklearn.metrics import classification_report

target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.51      0.52      0.52     12500
     class 1       0.51      0.49      0.50     12500

    accuracy                           0.51     25000
   macro avg       0.51      0.51      0.51     25000
weighted avg       0.51      0.51      0.51     25000

