In [28]:
from datasets import load_dataset
from datasets import get_dataset_split_names
import pandas as pd
import numpy as np

In [29]:
dataset = load_dataset("imdb")
dataset

Found cached dataset imdb (/Users/quentinfisch/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 92.93it/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Features

### Preprocessing

In [None]:
from string import punctuation
import re

def preprocess(dataset: pd.DataFrame) -> pd.DataFrame :
    """
    Preprocess the dataset by lowercasing the text and removing the punctuation manually

    Parameters
    ----------
    dataset : pd.DataFrame
        The dataset to preprocess

    Returns
    -------
    pd.DataFrame
        The preprocessed dataset
    """
    # First lower the case
    dataset["document"] = dataset["document"].apply(lambda x: x.lower())
    # Replace the punctuation with spaces. We keep the ' - that may give revelant informations
    # Replace HTML tag <br />
    punctuation_to_remove = '|'.join(map(re.escape, sorted(list(filter(lambda p: p != "'" and p != '-' and p != "!", punctuation)), reverse=True)))
    print(f"Deleting all these punctuation: {punctuation_to_remove}")
    dataset["document"] = dataset["document"].apply(lambda x: re.sub(punctuation_to_remove, " ", x.replace('<br />', "")))
    return dataset

In [None]:
train_raw = pd.DataFrame(dataset["train"], columns=["text", "label"]).rename(columns={"text": "document", "label": "class"})
preprocessed_train = preprocess(train_raw)
preprocessed_train

test_raw = pd.DataFrame(dataset["test"], columns=["text", "label"]).rename(columns={"text": "document", "label": "class"})
preprocessed_test = preprocess(test_raw)
preprocessed_test

Load lexicon and keep only interesting tokens (one above the treshold)

In [24]:
treshhold = 1
lexicon = pd.read_csv("vader_lexicon.txt", sep="\t", names=['Token', "Score", "Std", "Vector"]).drop(columns=["Std", "Vector"]).set_index("Token")
lexicon = lexicon[(lexicon["Score"] <= -treshhold) | (lexicon["Score"] >= treshhold)]
lexicon

Unnamed: 0_level_0,Score
Token,Unnamed: 1_level_1
$:,-1.5
%-),-1.5
( '}{' ),1.6
('-:,2.2
(':,2.3
...,...
|^:,-1.1
||-:,-2.3
}:,-2.1
}:(,-2.0


Generate the following features:
- 1 if "no" appears in the document, 0 otherwise.
- The count of first and second pronouns in the document.
- 1 if "!" is in the document, 0 otherwise.
- Log(word count in the document).
- Number of words in the document which are in the positive lexicon.
- Number of words in the document which are in the negative lexicon.
- [Bonus] Add another feature of your choice.

In [42]:
# Generate the following features:
# - 1 if "no" appears in the document, 0 otherwise.
# - The count of first and second pronouns in the document.
# - 1 if "!" is in the document, 0 otherwise.
# - Log(word count in the document).
# - Number of words in the document which are in the positive lexicon.
# - Number of words in the document which are in the negative lexicon.
# - [Bonus] Add another feature of your choice.

def generate_features(dataset: pd.DataFrame) -> pd.DataFrame :
    """
    Generate the features for the dataset

    Parameters
    ----------
    dataset : pd.DataFrame
        The dataset to generate the features for

    Returns
    -------
    pd.DataFrame
        The dataset with the features
    """
    # First lower the case
    dataset["no"] = dataset["document"].apply(lambda x: 1 if "no" in x.split(" ") else 0)
    dataset["pronouns"] = dataset["document"].apply(lambda x: x.count("i") + x.count("me") + x.count("my") + x.count("mine") + x.count("we") + x.count("us") + x.count("our") + x.count("ours") + x.count("you") + x.count("your") + x.count("yours") + x.count("u") + x.count("ur") + x.count("urs"))
    dataset["exclamation"] = dataset["document"].apply(lambda x: 1 if "!" in x else 0)
    dataset["log_word_count"] = dataset["document"].apply(lambda x: np.log(len(x.split(" "))))
    dataset["positive_lexicon"] = dataset["document"].apply(lambda x: len(list(filter(lambda w: w in lexicon.index and lexicon.at[w, "Score"] >= 1, x.split(" ")))))
    dataset["negative_lexicon"] = dataset["document"].apply(lambda x: len(list(filter(lambda w: w in lexicon.index and lexicon.at[w, "Score"] <= -1, x.split(" ")))))
    return dataset

In [43]:
features_train = generate_features(preprocessed_train)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [37]:
w = "no"
w in lexicon.index and lexicon.loc[w, "Score"] >= 1

False

In [44]:
sentence = preprocessed_train.iloc[0]["document"]
len(list(filter(lambda w: w in lexicon.index and lexicon.loc[w, "Score"] >= 1, sentence.split(" "))))

7