Logistic Regression

In [1]:
import numpy as np
from tqdm import tqdm
from tensorflow import keras
import tensorflow

### Importing HuggingFace

In [2]:
!pip install datasets
!pip install contractions
!pip install beautifulsoup4



In [3]:
from datasets import load_dataset

In [4]:
train_dataset = load_dataset('imdb', split='train')
test_dataset = load_dataset('imdb', split='test')

Reusing dataset imdb (/home/surenis/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Reusing dataset imdb (/home/surenis/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [12]:
import contractions
from bs4 import BeautifulSoup
import unicodedata
import numpy as np
import re

def strip_html_tags(text : str) -> str:
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text: str) -> str:
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def expand_contractions(text: str) -> str:
    return contractions.fix(text)

def remove_special_characters(text : str, remove_digits : bool=False) -> str:
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

def pre_process_document(document : str) -> str:
    # strip HTML
    document = strip_html_tags(document)
    # lower case
    document = document.lower()
    # remove extra newlines (often might be present in really noisy text)
    document = document.translate(document.maketrans("\n\t\r", "   "))
    # remove accented characters
    document = remove_accented_chars(document)
    # expand contractions    
    document = expand_contractions(document)  
    # remove special characters and\or digits    
    # insert spaces between special characters to isolate them    
    #special_char_pattern = re.compile(r'([{.(-)}])')
    #document = special_char_pattern.sub(" \\1 ", document)
    #document = remove_special_characters(document, remove_digits=True)  
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    
    return document


pre_process_corpus = np.vectorize(pre_process_document)

In [14]:
x_train = pre_process_corpus(train_dataset['text'])
x_train_preprocessed = []

for elt in x_train:
    x_train_preprocessed.append(np.array(elt.split(" ")))
    
y_train = train_dataset['label']



x_test_preprocessed = []

x_test = pre_process_corpus(test_dataset['text'])
for elt in x_test:
    x_test_preprocessed.append(np.array(elt.split(" ")))
                            
y_test = test_dataset['label']

In [15]:
print(x_train_preprocessed[0])
print(x_test_preprocessed[0])

['bromwell' 'high' 'is' 'a' 'cartoon' 'comedy.' 'it' 'ran' 'at' 'the'
 'same' 'time' 'as' 'some' 'other' 'programs' 'about' 'school' 'life,'
 'such' 'as' '"teachers".' 'my' '35' 'years' 'in' 'the' 'teaching'
 'profession' 'lead' 'me' 'to' 'believe' 'that' 'bromwell' "high's"
 'satire' 'is' 'much' 'closer' 'to' 'reality' 'than' 'is' '"teachers".'
 'the' 'scramble' 'to' 'survive' 'financially,' 'the' 'insightful'
 'students' 'who' 'can' 'see' 'right' 'through' 'their' 'pathetic'
 "teachers'" 'pomp,' 'the' 'pettiness' 'of' 'the' 'whole' 'situation,'
 'all' 'remind' 'me' 'of' 'the' 'schools' 'i' 'knew' 'and' 'their'
 'students.' 'when' 'i' 'saw' 'the' 'episode' 'in' 'which' 'a' 'student'
 'repeatedly' 'tried' 'to' 'burn' 'down' 'the' 'school,' 'i' 'immediately'
 'recalled' '.........' 'at' '..........' 'high.' 'a' 'classic' 'line:'
 'inspector:' 'I' 'am' 'here' 'to' 'sack' 'one' 'of' 'your' 'teachers.'
 'student:' 'welcome' 'to' 'bromwell' 'high.' 'i' 'expect' 'that' 'many'
 'adults' 'of' 

### Logistic Regression

In [31]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [38]:
type(x_train[0])

numpy.str_

In [42]:
def import_lexicon(path: str) -> pd.core.frame.DataFrame: 
    data = pd.read_csv(path, sep='\t', names=[0, 1, 2, 3])
    df = pd.DataFrame()
    df['token'] = data[0]
    df['sentiment'] = data[1]
    return df

In [44]:
def does_no_appear(review: np.str_) -> int:
    if "no" in review:
        return 1
    return 0

In [45]:
def count_first_and_second_pro(review: np.str_) -> int:
    count = 0
    for word in review:
        if word in ["I", "i", "you", "yours"]:
            count += 1
    return count

Let's search for all the words finishing with !

As it would make no sense that the ! character appears before the end of the word

In [46]:
def does_exclamation_appear(review: np.str_) -> int:
    if "!" in review:
        return 1
    return 0

it appears no words ends with ! in the training set

Thus, there is no need to check for ! in the documents

In [47]:
def log_word_count_in_doc(review: np.str_) -> int:
    return np.log(len(review))

In [51]:
def split_lexicon(lexicon: pd.core.frame.DataFrame) -> tuple:
    return lexicon[lexicon.sentiment > 0], lexicon[lexicon.sentiment < 0]

In [54]:
def positivity_counter(review: np.str_ , positive_df: pd.core.frame.DataFrame) -> tuple:
    posi = np.isin(positive_df.token, review)
    #return the # of positive words and their sum
    return sum(posi), sum(positive_df.sentiment[posi])

def negativity_counter(review : np.str_, negative_df: pd.core.frame.DataFrame) -> tuple:
    nega = np.isin(negative_df.token, review)
    #return the # of negative words and their sum
    return sum(nega), sum(negative_df.sentiment[nega])

In [55]:
print(x_train_preprocessed[0])

['bromwell' 'high' 'is' 'a' 'cartoon' 'comedy.' 'it' 'ran' 'at' 'the'
 'same' 'time' 'as' 'some' 'other' 'programs' 'about' 'school' 'life,'
 'such' 'as' '"teachers".' 'my' '35' 'years' 'in' 'the' 'teaching'
 'profession' 'lead' 'me' 'to' 'believe' 'that' 'bromwell' "high's"
 'satire' 'is' 'much' 'closer' 'to' 'reality' 'than' 'is' '"teachers".'
 'the' 'scramble' 'to' 'survive' 'financially,' 'the' 'insightful'
 'students' 'who' 'can' 'see' 'right' 'through' 'their' 'pathetic'
 "teachers'" 'pomp,' 'the' 'pettiness' 'of' 'the' 'whole' 'situation,'
 'all' 'remind' 'me' 'of' 'the' 'schools' 'i' 'knew' 'and' 'their'
 'students.' 'when' 'i' 'saw' 'the' 'episode' 'in' 'which' 'a' 'student'
 'repeatedly' 'tried' 'to' 'burn' 'down' 'the' 'school,' 'i' 'immediately'
 'recalled' '.........' 'at' '..........' 'high.' 'a' 'classic' 'line:'
 'inspector:' 'I' 'am' 'here' 'to' 'sack' 'one' 'of' 'your' 'teachers.'
 'student:' 'welcome' 'to' 'bromwell' 'high.' 'i' 'expect' 'that' 'many'
 'adults' 'of' 

In [56]:
def LoRegression(x_train: list, y_train: list):
    lexicon = import_lexicon("vader_lexicon.txt")
    positive_df, negative_df = split_lexicon(lexicon)
    X_features = []
    for review in tqdm(x_train):
        feature = np.zeros(8)
        feature[0] = does_no_appear(review)
        feature[1] = does_exclamation_appear(review)
        feature[2] = count_first_and_second_pro(review)
        feature[3] = log_word_count_in_doc(review)
        feature[4], feature[5] = negativity_counter(review, negative_df)
        feature[6], feature[7] = positivity_counter(review, positive_df)
        X_features.append(feature)
    return np.asarray(X_features)

In [33]:
X_train_features = LoRegression(x_train_preprocessed, y_train)

100%|██████████| 25000/25000 [15:00<00:00, 27.76it/s]


In [35]:
X_train_features.shape
type(X_train_features)

numpy.ndarray

In [24]:
clf = LogisticRegression(random_state=0).fit(X_train_features, y_train)

In [25]:
#create X_test using loRegression to have a usable informations
X_test_features = LoRegression(x_test_preprocessed, y_test)

100%|██████████| 25000/25000 [14:47<00:00, 28.16it/s]


In [26]:
y_pred = clf.predict(X_test_features)
print(X_test_features.shape, y_pred.shape)

(25000, 8) (25000,)


In [27]:
clf.score(X_test_features, y_test)

0.71532

In [32]:
from sklearn.metrics import classification_report

target_names = ['negative review', 'positive review']
print(classification_report(y_test, y_pred, target_names=target_names))

                 precision    recall  f1-score   support

negative review       0.71      0.73      0.72     12500
positive review       0.72      0.70      0.71     12500

       accuracy                           0.72     25000
      macro avg       0.72      0.72      0.72     25000
   weighted avg       0.72      0.72      0.72     25000

