# Logistic Regression and Boosting Algorithms

© Data Trainers LLC. GPL v 3.0.

**Author:** Axel Sirota


## Predicting a Single Categorical Response
---



### Installing stuff

In [None]:
!pip install --upgrade textblob spacy 'gensim==4.2.0' swifter keras_preprocessing

In [None]:
!python -m textblob.download_corpora lite
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

import spacy
import gensim
import warnings
import nltk
warnings.filterwarnings('ignore')
nltk.download('punkt')
textblob_tokenizer = lambda x: TextBlob(x).words


In [None]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget -O yelp.csv https://www.dropbox.com/s/xds4lua69b7okw8/yelp.csv?dl=0
fi

In [None]:
!bash get_data.sh

In [7]:
# Read yelp.csv into a DataFrame.
path = './yelp.csv'
yelp = pd.read_csv(path)
# Create a new DataFrame that only contains the 5-star and 1-star reviews.
yelp_best_worst = yelp[ (yelp.stars == 1) | (yelp.stars == 5) ]

# Define X and y.
X = yelp_best_worst.text
y = yelp_best_worst.stars

# Split the new DataFrame into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)

<a id="using-logistic-regression-for-classification"></a>
## Using Logistic Regression for Classification
---



In [None]:
# Fit a logistic regression model to predict stars from text

logreg = LogisticRegression(solver='lbfgs')

logreg.fit(X,y)


Of course this simply fails, we need to preprocess the text, convert it into a Tensor format and then and only then we can use models!

### Converting text to vectors

In [9]:
import re
nltk.download('stopwords')
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'


def preprocess_text(text, should_join=True):
    text = ' '.join(word.lower() for word in textblob_tokenizer(text))
    text = re.sub(r'http\S+', '', text) # remove http links
    text = re.sub(r'bit.ly/\S+', '', text) # rempve bitly links
    text = text.strip('[link]') # remove [links]
    text = re.sub('['+my_punctuation + ']+', ' ', text) # remove punctuation
    text = re.sub('\s+', ' ', text) #remove double spacing
    text = re.sub(r"[^a-zA-Z.,&!?]+", r" ", text) # only normal characters
    text_token_list = [word for word in text.split(' ')
                            if word not in my_stopwords] # remove stopwords
    text_token_list = [word_rooter(word) if '#' not in word else word
                        for word in text_token_list] # apply word rooter
    text = ' '.join(text_token_list)
    if should_join:
      return ' '.join(gensim.utils.simple_preprocess(text))
    else:
      return gensim.utils.simple_preprocess(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Apply the preprocessing to the dataset
import swifter
X_preprocessed = X.swifter.apply(preprocess_text)

Pandas Apply:   0%|          | 0/4086 [00:00<?, ?it/s]

In [15]:
X[0]

'My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!'

In [14]:

X_preprocessed[0]

'wife took birthday breakfast excel weather perfect made sit outsid overlook ground absolut pleasur waitress excel food arriv quickli semi busi saturday morn look like place fill pretti quickli earlier get better favor get bloodi mari phenomen simpli best ever pretti sure use ingredi garden blend fresh order amaz everyth menu look excel white truffl scrambl egg veget skillet tasti delici came piec griddl bread amaz absolut made meal complet best toast ever anyway ca wait go bac'

How do we pass from text to numbers? With tokenizers. We will use Tensorflow ones!

In [16]:
# Find a set named vocab that has all unique words
# prompt: # Find a set named vocab that has all unique words

vocab = set()
for text in X_preprocessed:
  for word in text.split():
    vocab.add(word)

In [17]:
print(f'{len(vocab)} unique words')

13140 unique words


In [22]:
def get_maximum_review_length(srs):
    maximum = 0
    for text in srs:
      maximum = max(maximum, len(text.split()))
    return maximum


maximum = get_maximum_review_length(X_preprocessed)

In [23]:
print(f'The maximum review was {maximum} words long')

The maximum review was 477 words long


In [24]:
from tensorflow.keras.layers.experimental import preprocessing
ids_from_words = preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)

In [25]:
words_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_words.get_vocabulary(), invert=True, mask_token=None)

In [26]:
import tensorflow as tf
def text_from_ids(ids):
  return tf.strings.reduce_join(words_from_ids(ids), axis=-1, separator=' ')

In [27]:
ids = ids_from_words(preprocess_text('Only you can prevent forest fires', should_join=False))
ids

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([7362, 6802, 9842])>

In [53]:
vocab_list = list(vocab)
print(vocab_list[7361])


prevent


In [28]:
preprocess_text('Only you can prevent forest fires', should_join=False)

['prevent', 'forest', 'fire']

In [29]:
text_from_ids(ids)


<tf.Tensor: shape=(), dtype=string, numpy=b'prevent forest fire'>

In [30]:
def pad_sequence_of_tokens(x, maxlen, unk_token='[UNK]'):
  if len(x)<maxlen:
    x.extend([unk_token]*(maxlen-len(x)))
  return x

In [31]:
from keras_preprocessing.sequence import pad_sequences
# Very useful method to keep in mind
def get_ids_tensor(srs):

  processed = srs.swifter.apply(lambda x: pad_sequence_of_tokens(preprocess_text(x, should_join=False), maxlen=maximum)).to_list()
  return tf.squeeze(tf.constant(pad_sequences(ids_from_words(processed), maxlen=maximum, padding='post'), dtype='int32'))



In [32]:
all_ids = get_ids_tensor(srs=X_preprocessed.reset_index(drop=True))
all_ids

Pandas Apply:   0%|          | 0/4086 [00:00<?, ?it/s]

<tf.Tensor: shape=(4086, 477), dtype=int32, numpy=
array([[ 6028, 11595, 12216, ...,     0,     0,     0],
       [10326,   537,  5877, ...,     0,     0,     0],
       [12616,  1200,  1188, ...,     0,     0,     0],
       ...,
       [10900,  3070,  7452, ...,     0,     0,     0],
       [ 8354,  3464,  2911, ...,     0,     0,     0],
       [  833, 11069, 10151, ...,     0,     0,     0]], dtype=int32)>

In [33]:
all_ids.shape

TensorShape([4086, 477])

In [55]:
X_preprocessed[0]

'wife took birthday breakfast excel weather perfect made sit outsid overlook ground absolut pleasur waitress excel food arriv quickli semi busi saturday morn look like place fill pretti quickli earlier get better favor get bloodi mari phenomen simpli best ever pretti sure use ingredi garden blend fresh order amaz everyth menu look excel white truffl scrambl egg veget skillet tasti delici came piec griddl bread amaz absolut made meal complet best toast ever anyway ca wait go bac'

In [56]:
print(vocab_list[6027])

wife


In [54]:
all_ids[0]

<tf.Tensor: shape=(477,), dtype=int32, numpy=
array([ 6028, 11595, 12216,  9081,  6610,  4859,  7930,  8985,  2666,
        8296,  7101,  5099,  6915,  9302,   792,  6610,  3240,  7243,
        2194, 12717,  9163, 10052, 12496,  7514,  2911,  4828,  3418,
        7813,  2194, 10254, 12339,   413, 11438, 12339,  2008,  5904,
        9610,  9760,  6645,  9497,  7813,  1742,  6940,  4408, 13053,
        5591,  5253,  8487,  4795, 10524,  9852,  7514,  6610,  8999,
        3790,  1538,  6983,  3351,  7380,  7022,   244,  2302,  3740,
        7315,  1372,  4795,  6915,  8985, 12847,  1516,  6645,  4618,
        9497, 12234,   797,  3432,  5947,  2875,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

In [35]:
all_ids_np = all_ids.numpy()

In [36]:
# Split the all_ids into.a train a test sets
X_train, X_test, y_train, y_test = train_test_split(all_ids_np, y, test_size=0.2, random_state=42)

### Using Logistic Regression

In [41]:

# Train a Logistic Regression on X_train and give the accuracy
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))


0.7787286063569682


## Using Boosting Algorithms and other things

In [45]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.7640586797066015


In [46]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=50, learning_rate=0.5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.7836185819070904


In [47]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.793398533007335


## Multiclass Classification

Just check in the estimators, most support multiclass classification.

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0, multi_class='multinomial').fit(X, y)
clf.predict(X[:2, :])
clf.predict_proba(X[:2, :])
clf.score(X, y)

### **Homework**: Try to perform the stars classification with Logistic Regression but without filtering only for 5 and 1 stars.