Logistic Regression

In [1]:
import numpy as np
from tqdm import tqdm
from tensorflow import keras
import tensorflow

### Importing IMDB sentiment dataset

In [2]:
import keras
from keras.datasets import imdb

In [4]:
# LOAD IMDB DATA

(x_train, y_train), (x_test, y_test) = tensorflow.keras.datasets.imdb.load_data(
    path="imdb.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3
)

In [5]:
print("train_data ", x_train.shape)
print("train_labels ", y_train.shape)
print("_"*100)
print("test_data ", x_test.shape)
print("test_labels ", y_test.shape)
print("_"*100)
print("Maximum value of a word index ")
print(max([max(sequence) for sequence in x_train]))
print("Maximum length num words of review in train ")
print(max([len(sequence) for sequence in x_train]))

train_data  (25000,)
train_labels  (25000,)
____________________________________________________________________________________________________
test_data  (25000,)
test_labels  (25000,)
____________________________________________________________________________________________________
Maximum value of a word index 
88586
Maximum length num words of review in train 
2494


In [6]:
# Retrieve the word index file mapping words to indices
word_index = keras.datasets.imdb.get_word_index()

In [7]:
# Reverse the word index to obtain a dict mapping indices to words
inverted_word_index = dict((i, word) for (word, i) in word_index.items())
# Decode the first sequence in the dataset
decoded_sequence = " ".join(inverted_word_index[i] for i in x_train[0])

### Logistic Regression

In [8]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [9]:
def import_lexicon(path):
    data = pd.read_csv(path, sep='\t', names=[0, 1, 2, 3])
    df = pd.DataFrame()
    df['token'] = data[0]
    df['sentiment'] = data[1]
    return df

In [10]:
def get_reviews_from_class(x_train, y_train, _class, V):
    reviews = x_train[y_train == _class]
    
    for i in range (len(reviews)):
        decode_sentence = reviews[i]
        reviews[i] = decode_sentence
    return reviews

In [11]:
def does_no_appear(review) -> int:
    if "no" in review:
        return 1
    return 0

In [12]:
def count_first_and_second_pro(review) -> int:
    count = 0
    for char in review:
        if char in ["I", "i", "you", "yours"]:
            count += 1
    return count

In [13]:
def exclamation_in_doc(review):
    if "!" in review:
            return 1
    return 0

In [14]:
def log_word_count_in_doc(review):
    return np.log(len(review))

In [15]:
def number_of_words_pos(review, lexicon):
    columns = ['token']
    tmp = lexicon[lexicon.sentiment > 0]
    positive_words = tmp[columns].to_numpy().tolist()
    number_of_pos = np.in1d(positive_words, review)
    return sum(number_of_pos)

In [16]:
def number_of_words_neg(review, lexicon):
    columns = ['token']
    tmp = lexicon[lexicon.sentiment < 0]
    negative_words = tmp[columns].to_numpy().tolist()
    number_of_neg = np.in1d(negative_words, review)
    return sum(number_of_neg)

In [17]:
def LoRegression(X_train, y_train):
    nb_class = 2
    lexicon = import_lexicon("vader_lexicon.txt")
    X_features_of_all_the_class = []
    V = keras.datasets.imdb.get_word_index()

    # Preprocessing

    #to refacto
    for _class in range (nb_class):
        #try to re-write the functions without calling this
        #staying on dicts to lower the order of complexity
        reviews = get_reviews_from_class(X_train, y_train, _class, V)
        features = []

        for review in tqdm(reviews):
            feature = []
            feature.append(does_no_appear(review))
            feature.append(count_first_and_second_pro(review))
            feature.append(exclamation_in_doc(review))
            feature.append(log_word_count_in_doc(review))
            feature.append(number_of_words_neg(review, lexicon))
            feature.append(number_of_words_pos(review, lexicon))
            features.append(feature)

        X_features_of_all_the_class.append(features)
    return np.asarray(X_features_of_all_the_class)

In [18]:
X_features_of_all_the_class = LoRegression(x_train, y_train)

  mask |= (ar1 == a)
100%|██████████| 12500/12500 [06:53<00:00, 30.22it/s]
100%|██████████| 12500/12500 [06:42<00:00, 31.07it/s]


In [19]:
print(X_features_of_all_the_class.reshape(2 * 12500, 6))
X_features = X_features_of_all_the_class.reshape(X_features_of_all_the_class.shape[0] * X_features_of_all_the_class.shape[1], X_features_of_all_the_class.shape[2])

[[0.         0.         0.         5.24174702 0.         0.        ]
 [0.         0.         0.         4.94875989 1.         0.        ]
 [0.         0.         0.         4.99043259 0.         1.        ]
 ...
 [0.         0.         0.         5.21493576 1.         0.        ]
 [0.         0.         0.         5.19295685 1.         1.        ]
 [0.         0.         0.         5.01063529 0.         0.        ]]


In [21]:
X_features[0].shape

(6,)

In [22]:
clf = LogisticRegression(random_state=0).fit(X_features, y_train)

In [None]:
#create X_test using loRegression to have a similar shape

In [23]:
from sklearn.metrics import classification_report
#y_test = np.resize(y_test, 150000).reshape(25000,6)
y_test = y_test.reshape(25000, 6)
print(y_test.shape)

ValueError: cannot reshape array of size 25000 into shape (25000,6)

In [32]:
y_pred = clf.predict(x_test)
print(y_pred.shape)

ValueError: setting an array element with a sequence.

In [None]:
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))