Logistic Regression

In [1]:
from logistic_regression import *

### Loading the IMDB dataset from HuggingFace

In [2]:
# Load the 'train' and 'test' splits of the 'imdb' dataset.
# NOTE(review): no explicit import of load_dataset is visible in this notebook;
# presumably it is re-exported by `from logistic_regression import *` - confirm.
train_dataset = load_dataset('imdb', split='train')
test_dataset = load_dataset('imdb', split='test')

Reusing dataset imdb (/home/surenis/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Reusing dataset imdb (/home/surenis/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [3]:
# Split each dataset into model inputs (x_*) and labels (y_*).
# NOTE(review): generate_x_and_y is not defined in this notebook; presumably it
# comes from the star import of logistic_regression - confirm what it returns.
x_train, y_train = generate_x_and_y(train_dataset)
x_test, y_test = generate_x_and_y(test_dataset)

### Logistic Regression

In [None]:
import re

import pandas as pd
from sklearn.linear_model import LogisticRegression
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Inspect the Python type of the first training input; the feature helpers
# below are annotated with np.str_ for their `review` argument.
type(x_train[0])

In [None]:
def import_lexicon(path: str) -> pd.core.frame.DataFrame:
    """Load a tab-separated sentiment lexicon and keep its first two columns.

    The file is read without a header row as four positional columns. Only
    the first one (exposed as ``token``) and the second one (exposed as
    ``sentiment``) are kept in the result.

    Args:
        path: Location of the tab-separated lexicon file.

    Returns:
        A DataFrame with the two columns ``token`` and ``sentiment``.
    """
    raw_columns = pd.read_csv(path, sep='\t', names=[0, 1, 2, 3])
    return pd.DataFrame({'token': raw_columns[0], 'sentiment': raw_columns[1]})

In [None]:
def does_no_appear(review: np.str_) -> int:
    """Return 1 if the word "no" occurs in the review, otherwise 0.

    Args:
        review: Either the raw review text (``str`` / ``np.str_``) or an
            already tokenised review (any container of word strings).

    Returns:
        1 when "no" is present as a whole word / token, else 0. The match is
        case-sensitive, exactly like the token comparison.
    """
    if isinstance(review, str):
        # A plain substring test would also fire on "not", "know",
        # "another", ... so require word boundaries on both sides.
        return int(re.search(r"\bno\b", review) is not None)
    # Tokenised input: membership already compares whole tokens.
    return int("no" in review)

In [None]:
def count_first_and_second_pro(review: np.str_) -> int:
    """Count occurrences of the tracked first/second-person pronouns.

    The tracked pronouns are "I", "i", "you" and "yours"; every occurrence
    is counted (not just distinct ones).

    Args:
        review: Either the raw review text (``str`` / ``np.str_``) or an
            already tokenised review (an iterable of word strings).

    Returns:
        Number of words in the review that are one of the tracked pronouns.
    """
    pronouns = ("I", "i", "you", "yours")
    if isinstance(review, str):
        # Iterating a string yields single characters, which would count
        # every letter "i" and never match "you"/"yours"; split into words.
        words = re.findall(r"\w+", review)
    else:
        words = review
    return sum(1 for word in words if word in pronouns)

Let's search for all the words ending with `!`.

We only look at the end of each word, since it would make no sense for the `!` character to appear anywhere earlier in a word.

In [None]:
def does_exclamation_appear(review: np.str_) -> int:
    """Flag whether "!" is contained in the review.

    Containment is evaluated with Python's ``in`` operator: for a string any
    "!" character counts, for a container of tokens only an element equal
    to "!" counts.

    Args:
        review: The review to inspect.

    Returns:
        1 if "!" is contained in ``review``, otherwise 0.
    """
    return 1 if "!" in review else 0

It appears that no word ends with `!` in the training set.

Thus, there is no need to check for `!` in the documents.

In [None]:
def log_word_count_in_doc(review: np.str_) -> float:
    """Return the natural logarithm of the number of words in the review.

    Args:
        review: Either the raw review text (``str`` / ``np.str_``), which is
            split on whitespace, or an already tokenised review (any sized
            container of words).

    Returns:
        ``log(word_count)`` as a float. A review without any word yields
        ``0.0`` instead of ``-inf``.
    """
    # len() of a string counts characters, not words, so tokenise it first.
    words = review.split() if isinstance(review, str) else review
    # log(0) is -inf, which would poison the feature matrix handed to the
    # classifier; clamp the count to 1 so empty reviews map to 0.0.
    return np.log(max(len(words), 1))

In [None]:
def split_lexicon(lexicon: pd.core.frame.DataFrame) -> tuple:
    """Separate a lexicon into its positive and negative entries.

    Args:
        lexicon: DataFrame exposing a numeric ``sentiment`` column.

    Returns:
        A ``(positive, negative)`` tuple: the rows whose sentiment is
        strictly greater than zero, and the rows whose sentiment is strictly
        less than zero. Rows with a sentiment of exactly zero are in neither.
    """
    scores = lexicon.sentiment
    positive_entries = lexicon[scores > 0]
    negative_entries = lexicon[scores < 0]
    return positive_entries, negative_entries

In [None]:
def _lexicon_hits(review, lexicon_df: pd.core.frame.DataFrame) -> tuple:
    """Find which lexicon entries occur in a review (shared by both counters).

    Args:
        review: Either the raw review text (``str`` / ``np.str_``), which is
            split on whitespace, or an already tokenised review.
        lexicon_df: DataFrame with a ``token`` and a ``sentiment`` column.

    Returns:
        ``(count, total)``: the number of lexicon entries whose token occurs
        in the review, and the sum of the sentiment values of those entries.
        Each lexicon entry is counted once, however often it is repeated in
        the review.
    """
    # A bare string would be treated as one single "token" and never match a
    # lexicon entry, so split it into words first.
    tokens = review.split() if isinstance(review, str) else review
    present = lexicon_df.token.isin(tokens).to_numpy()
    matched_scores = lexicon_df.sentiment.to_numpy()[present]
    # Vectorised sums avoid a Python-level loop over the whole lexicon.
    return int(present.sum()), float(matched_scores.sum())


def positivity_counter(review: np.str_ , positive_df: pd.core.frame.DataFrame) -> tuple:
    """Count the positive lexicon entries present in a review.

    Args:
        review: Raw review text or a tokenised review.
        positive_df: Positive part of the lexicon (``token``/``sentiment``).

    Returns:
        ``(number of positive entries found, sum of their sentiment values)``.
    """
    return _lexicon_hits(review, positive_df)


def negativity_counter(review : np.str_, negative_df: pd.core.frame.DataFrame) -> tuple:
    """Count the negative lexicon entries present in a review.

    Args:
        review: Raw review text or a tokenised review.
        negative_df: Negative part of the lexicon (``token``/``sentiment``).

    Returns:
        ``(number of negative entries found, sum of their sentiment values)``.
    """
    return _lexicon_hits(review, negative_df)

In [None]:
# Show the first preprocessed training review.
# NOTE(review): x_train_preprocessed is not assigned in any cell visible in this
# notebook; unless the star import of logistic_regression provides it, this cell
# raises NameError on a fresh kernel (Restart & Run All) - confirm its origin.
print(x_train_preprocessed[0])

In [None]:
def LoRegression(x_train: list, y_train: list = None,
                 lexicon_path: str = "vader_lexicon.txt"):
    """Build the hand-crafted feature matrix for a collection of reviews.

    Despite its name this function does not fit a model; it only turns each
    review into a vector of 8 features:

        0: 1 if "no" appears, else 0
        1: 1 if "!" appears, else 0
        2: count of first/second-person pronouns
        3: log of the word count
        4: number of negative lexicon entries found
        5: sum of the sentiment of those negative entries
        6: number of positive lexicon entries found
        7: sum of the sentiment of those positive entries

    Args:
        x_train: Reviews to featurise.
        y_train: Unused; accepted only so existing two-argument calls keep
            working.
        lexicon_path: Tab-separated sentiment lexicon to load. Defaults to
            the previously hard-coded "vader_lexicon.txt".

    Returns:
        A float array of shape ``(len(x_train), 8)``; an empty input yields
        shape ``(0, 8)``.
    """
    n_features = 8
    lexicon = import_lexicon(lexicon_path)
    positive_df, negative_df = split_lexicon(lexicon)

    rows = []
    for review in tqdm(x_train):
        negative_count, negative_sum = negativity_counter(review, negative_df)
        positive_count, positive_sum = positivity_counter(review, positive_df)
        rows.append([
            does_no_appear(review),
            does_exclamation_appear(review),
            count_first_and_second_pro(review),
            log_word_count_in_doc(review),
            negative_count,
            negative_sum,
            positive_count,
            positive_sum,
        ])
    # reshape keeps the result two-dimensional even when there are no reviews.
    return np.asarray(rows, dtype=float).reshape(-1, n_features)

In [None]:
# Build the 8-column feature matrix for the training reviews.
# NOTE(review): relies on x_train_preprocessed, which no visible cell assigns.
X_train_features = LoRegression(x_train_preprocessed, y_train)

In [None]:
# Only the last expression of a cell is echoed, so return both values as one
# tuple instead of silently dropping the shape.
X_train_features.shape, type(X_train_features)

In [None]:
# Fit a logistic-regression classifier on the hand-crafted training features;
# random_state is fixed to 0 for reproducibility.
clf = LogisticRegression(random_state=0).fit(X_train_features, y_train)

In [None]:
# Build the test feature matrix with LoRegression so the test reviews are in
# the same 8-feature representation the classifier was trained on.
# NOTE(review): x_test_preprocessed is not assigned in any visible cell - confirm.
X_test_features = LoRegression(x_test_preprocessed, y_test)

In [None]:
# Predict a label for every test review and print the shapes of the feature
# matrix and of the prediction vector as a sanity check.
y_pred = clf.predict(X_test_features)
print(X_test_features.shape, y_pred.shape)

In [None]:
# Evaluate the fitted classifier on the test features against y_test
# (for scikit-learn classifiers, score() reports mean accuracy).
clf.score(X_test_features, y_test)

In [None]:
from sklearn.metrics import classification_report

# Display names for the report rows, in label order.
# NOTE(review): assumes label 0 is a negative review and label 1 a positive
# one - TODO confirm against the labels produced by generate_x_and_y.
target_names = ['negative review', 'positive review']
print(classification_report(y_test, y_pred, target_names=target_names))