Logistic Regression

In [65]:
import numpy as np
from tqdm import tqdm
from tensorflow import keras
import tensorflow

### Importing IMDB sentiment dataset

In [66]:
import keras
from keras.datasets import imdb

In [136]:
# LOAD IMDB DATA

(x_train, y_train), (x_test, y_test) = tensorflow.keras.datasets.imdb.load_data(
    path="imdb.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3
)

In [44]:
print("train_data ", x_train.shape)
print("train_labels ", y_train.shape)
print("_"*100)
print("test_data ", x_test.shape)
print("test_labels ", y_test.shape)
print("_"*100)
print("Maximum value of a word index ")
print(max([max(sequence) for sequence in x_train]))
print("Maximum length num words of review in train ")
print(max([len(sequence) for sequence in x_train]))

train_data  (25000,)
train_labels  (25000,)
____________________________________________________________________________________________________
test_data  (25000,)
test_labels  (25000,)
____________________________________________________________________________________________________
Maximum value of a word index 
88586
Maximum length num words of review in train 
2494


In [45]:
# Retrieve the word index file mapping words to indices
word_index = keras.datasets.imdb.get_word_index()

In [46]:
# Reverse the word index to obtain a dict mapping indices to words
inverted_word_index = dict((i, word) for (word, i) in word_index.items())
# Decode the first sequence in the dataset
decoded_sequence = " ".join(inverted_word_index[i] for i in x_train[0])

### Logistic Regression

In [68]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [69]:
def import_lexicon(path):
    data = pd.read_csv(path, sep='\t', names=[0, 1, 2, 3])
    df = pd.DataFrame()
    df['token'] = data[0]
    df['sentiment'] = data[1]
    return df

In [70]:
def get_reviews_from_class(x_train, y_train, _class, V):
    reviews = x_train[y_train == _class]
    
    for i in range (len(reviews)):
        decode_sentence = reviews[i]
        reviews[i] = decode_sentence
    return reviews

In [71]:
def does_no_appear(review) -> int:
    if "no" in review:
        return 1
    return 0

In [72]:
def count_first_and_second_pro(review) -> int:
    count = 0
    for char in review:
        if char in ["I", "i", "you", "yours"]:
            count += 1
    return count

In [73]:
def exclamation_in_doc(review):
    if "!" in review:
            return 1
    return 0

In [74]:
def log_word_count_in_doc(review):
    return np.log(len(review))

In [75]:
def number_of_words_pos(review, lexicon):
    columns = ['token']
    tmp = lexicon[lexicon.sentiment > 0]
    positive_words = tmp[columns].to_numpy().tolist()
    number_of_pos = np.in1d(positive_words, review)
    return sum(number_of_pos)

In [76]:
def number_of_words_neg(review, lexicon):
    columns = ['token']
    tmp = lexicon[lexicon.sentiment < 0]
    negative_words = tmp[columns].to_numpy().tolist()
    number_of_neg = np.in1d(negative_words, review)
    return sum(number_of_neg)

In [77]:
def LoRegression(X_train, y_train):
    nb_class = 2
    lexicon = import_lexicon("vader_lexicon.txt")
    X_features_of_all_the_class = []
    V = keras.datasets.imdb.get_word_index()

    # Preprocessing

    for _class in range (nb_class):
        reviews = get_reviews_from_class(X_train, y_train, _class, V)
        features = []

        for review in tqdm(reviews):
            feature = []
            feature.append(does_no_appear(review))
            feature.append(count_first_and_second_pro(review))
            feature.append(exclamation_in_doc(review))
            feature.append(log_word_count_in_doc(review))
            feature.append(number_of_words_neg(review, lexicon))
            feature.append(number_of_words_pos(review, lexicon))
            features.append(feature)

        X_features_of_all_the_class.append(features)
    return np.asarray(X_features_of_all_the_class)

In [79]:
X_features_of_all_the_class = LoRegression(x_train, y_train)

  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)


  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
100%|██████████| 12500/12500 [06:19<00:00, 32.96it/s]
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)


  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)


  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
  mask |= (ar1 == a)
100%|██████████| 12500/12500 [06:14<00:00, 33.41it/s]


In [148]:
print(X_features_of_all_the_class.reshape(2 * 12500, 6))
X_features = X_features_of_all_the_class.reshape(X_features_of_all_the_class.shape[0] * X_features_of_all_the_class.shape[1], X_features_of_all_the_class.shape[2])

[[0.         0.         0.         5.24174702 0.         0.        ]
 [0.         0.         0.         4.94875989 1.         0.        ]
 [0.         0.         0.         4.99043259 0.         1.        ]
 ...
 [0.         0.         0.         5.21493576 1.         0.        ]
 [0.         0.         0.         5.19295685 1.         1.        ]
 [0.         0.         0.         5.01063529 0.         0.        ]]


In [146]:
clf = LogisticRegression(random_state=0).fit(X_features, y_train)

(25000, 6)
(25000,)
(25000,)


In [144]:
from sklearn.metrics import classification_report
#y_test = np.resize(y_test, 150000).reshape(25000,6)
y_test = y_test.reshape(25000, 6)
print(y_test.shape)
y_pred = clf.predict(y_test)
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))

(25000,)


ValueError: Expected 2D array, got 1D array instead:
array=[0 1 1 ... 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.