In [7]:
from datasets import load_dataset
from datasets import get_dataset_split_names
import pandas as pd
import numpy as np

In [8]:
dataset = load_dataset("imdb")
dataset

Found cached dataset imdb (/Users/quentinfisch/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 96.89it/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Features

### Preprocessing

In [9]:
from string import punctuation
import re

def preprocess(dataset: pd.DataFrame) -> pd.DataFrame :
    """
    Preprocess the dataset by lowercasing the text and removing the punctuation manually

    Parameters
    ----------
    dataset : pd.DataFrame
        The dataset to preprocess

    Returns
    -------
    pd.DataFrame
        The preprocessed dataset
    """
    # First lower the case
    dataset["document"] = dataset["document"].apply(lambda x: x.lower())
    # Replace the punctuation with spaces. We keep the ' - that may give revelant informations
    # Replace HTML tag <br />
    punctuation_to_remove = '|'.join(map(re.escape, sorted(list(filter(lambda p: p != "'" and p != '-' and p != "!", punctuation)), reverse=True)))
    print(f"Deleting all these punctuation: {punctuation_to_remove}")
    dataset["document"] = dataset["document"].apply(lambda x: re.sub(punctuation_to_remove, " ", x.replace('<br />', "")))
    return dataset

In [15]:
train_raw = pd.DataFrame(dataset["train"], columns=["text", "label"]).rename(columns={"text": "document", "label": "class"})
preprocessed_train = preprocess(train_raw)
preprocessed_train

test_raw = pd.DataFrame(dataset["test"], columns=["text", "label"]).rename(columns={"text": "document", "label": "class"})
preprocessed_test = preprocess(test_raw)
preprocessed_test

Deleting all these punctuation: \~|\}|\||\{|`|_|\^|\]|\\|\[|@|\?|>|=|<|;|:|/|\.|,|\+|\*|\)|\(|\&|%|\$|\#|"
Deleting all these punctuation: \~|\}|\||\{|`|_|\^|\]|\\|\[|@|\?|>|=|<|;|:|/|\.|,|\+|\*|\)|\(|\&|%|\$|\#|"


Unnamed: 0,document,class
0,i love sci-fi and am willing to put up with a ...,0
1,worth the entertainment value of a rental esp...,0
2,its a totally average film with a few semi-alr...,0
3,star rating saturday night friday ...,0
4,first off let me say if you haven't enjoyed a...,0
...,...,...
24995,just got around to seeing monster man yesterda...,1
24996,i got this as part of a competition prize i w...,1
24997,i got monster man in a box set of three films ...,1
24998,five minutes in i started to feel how naff th...,1


Load lexicon and keep only interesting tokens (one above the treshold)

In [12]:
treshhold = 1
lexicon = pd.read_csv("vader_lexicon.txt", sep="\t", names=['Token', "Score", "Std", "Vector"]).drop(columns=["Std", "Vector"]).set_index("Token")
lexicon = lexicon[(lexicon["Score"] <= -treshhold) | (lexicon["Score"] >= treshhold)]
lexicon

Unnamed: 0_level_0,Score
Token,Unnamed: 1_level_1
$:,-1.5
%-),-1.5
( '}{' ),1.6
('-:,2.2
(':,2.3
...,...
|^:,-1.1
||-:,-2.3
}:,-2.1
}:(,-2.0


Generate the following features:
- 1 if "no" appears in the document, 0 otherwise.
- The count of first and second pronouns in the document.
- 1 if "!" is in the document, 0 otherwise.
- Log(word count in the document).
- Number of words in the document which are in the positive lexicon.
- Number of words in the document which are in the negative lexicon.
- [Bonus] Add another feature of your choice.

In [43]:
# Generate the following features:
# - 1 if "no" appears in the document, 0 otherwise.
# - The count of first and second pronouns in the document.
# - 1 if "!" is in the document, 0 otherwise.
# - Log(word count in the document).
# - Number of words in the document which are in the positive lexicon.
# - Number of words in the document which are in the negative lexicon.
# - [Bonus] Add another feature of your choice.

def is_in_lexicon(word: str, positive: bool):
    try:
        score = lexicon.at[word, "Score"].item()
        if positive:
            return score >= treshhold
        else:
            return score <= -treshhold
    except:
        return False

def generate_features(dataset: pd.DataFrame) -> pd.DataFrame :
    """
    Generate the features for the dataset

    Parameters
    ----------
    dataset : pd.DataFrame
        The dataset to generate the features for

    Returns
    -------
    pd.DataFrame
        The dataset with the features
    """
    dataset["no"] = dataset["document"].apply(lambda x: 1 if "no" in x.split(" ") else 0)
    dataset["pronouns"] = dataset["document"].apply(lambda x: x.split(" ")).apply(lambda x: x.count("i") + x.count("me") + x.count("my") + x.count("mine") + x.count("we") + x.count("us") + x.count("our") + x.count("ours") + x.count("you") + x.count("your") + x.count("yours") + x.count("u") + x.count("ur") + x.count("urs"))
    dataset["exclamation"] = dataset["document"].apply(lambda x: 1 if "!" in x else 0)
    dataset["log_word_count"] = dataset["document"].apply(lambda x: np.log(len(x.split(" "))))
    dataset["positive_lexicon"] = dataset["document"].apply(lambda x: len(list(filter(lambda w: is_in_lexicon(w, True), x.split(" ")))))
    dataset["negative_lexicon"] = dataset["document"].apply(lambda x: len(list(filter(lambda w: is_in_lexicon(w, False), x.split(" ")))))

    # add feature vector column
    dataset["feature_vector"] = dataset.apply(lambda x: [x["no"], x["pronouns"], x["exclamation"], x["log_word_count"], x["positive_lexicon"], x["negative_lexicon"]], axis=1)
    # drop the other columns
    dataset = dataset.drop(columns=["no", "pronouns", "exclamation", "log_word_count", "positive_lexicon", "negative_lexicon"])

    return dataset

In [44]:
reduced_train = preprocessed_train.iloc[::50].copy()

reduced_train = generate_features(reduced_train)
reduced_train

Unnamed: 0,document,class,feature_vector
0,i rented i am curious-yellow from my video sto...,0,"[1, 11, 0, 5.739792912179234, 7, 6]"
50,i saw this film opening weekend in australia ...,0,"[0, 8, 0, 5.402677381872279, 15, 4]"
100,terrible movie nuff said these lines are just...,0,"[0, 10, 0, 5.123963979403259, 3, 13]"
150,this film is a calculated attempt to cash in t...,0,"[0, 2, 0, 5.209486152841421, 5, 4]"
200,this is an action western james steart leads ...,0,"[0, 0, 0, 5.19295685089021, 14, 4]"
...,...,...,...
24750,this could be well have been the definitive fi...,1,"[0, 4, 1, 6.792344427470809, 45, 22]"
24800,you'd be forgiven to think a finnish director ...,1,"[1, 5, 1, 5.308267697401205, 5, 6]"
24850,a tragically wonderful movie brings us to a...,1,"[0, 5, 0, 5.081404364984463, 6, 1]"
24900,i can't remember many films where a bumbling i...,1,"[0, 2, 0, 4.189654742026425, 6, 2]"


## Logistic regression classifier