# Class - Natural Language Processing 2 : classification

In [None]:
#-----------------------------
# Natural Language Toolkit 
!pip install "nltk==3.4.5"
import nltk
nltk.download('punkt') #word tokenizer
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, WordPunctTokenizer
import requests #web
from collections import Counter #counting words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string 
import re
#-----------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.cm as cm
import seaborn as sns; 
# set_style() applies the style globally; axes_style() only *returns* the style
# parameters (useful in a `with` block) and changes nothing when called bare.
sns.set_style("whitegrid")
sns.set_context("talk")
# seed NumPy's global generator so random sampling is repeatable across runs
np.random.seed(0)
#-----------------------------
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
#from nltk.lm.preprocessing import padded_everygram_pipeline
#from nltk.lm import MLE
import time
#-----------------------------

## Text classification: Is it Spam?


The SMS Spam Collection is a set of tagged SMS messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, each tagged according to whether it is ham (legitimate) or spam.

The files contain one message per line. Each line is composed by two columns: v1 contains the label (ham or spam) and v2 contains the raw text.


#### Approach
* Load the Data
* Split/Tokenize words
* Lower case
* Stem and handle Stop Words
* Applying Classifier

In [None]:
# Read the SMS spam CSV, keep only the label (v1) and message (v2) columns,
# and give them descriptive names.
webaddr = 'http://public.gettysburg.edu/~jpuckett/ds325/data/'
df = (
    pd.read_csv(webaddr + "spam.csv", encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'class', 'v2': 'text'})
)
#df.dropna(inplace=True,axis=1)
class_names = ['ham', 'spam']

In [None]:
df.head(3)

In [None]:
df.shape

In [None]:
plt.rcParams["figure.figsize"] = (7,5)

In [None]:
print(df['class'].value_counts())
ax =sns.countplot(x=df['class']); 
plt.show()

### Encode the classes

In [None]:
df['target'] = df['class'].map( {'spam': 1, 'ham': 0})

In [None]:
target_count = df.target.value_counts()
print('Proportion:', round(target_count[1] / target_count.sum(), 2))

This is the threshold for our classifier.  If our classifier just picked the largest class every time, then it would have **87%** accuracy.  We need to do better than that.  In the following example, we show one strategy on how to mitigate un-balanced datasets.

### Find the length of text

In [None]:
df['length'] = df['text'].apply(len)

In [None]:
# One density curve per class; common_norm=False normalises each class on its own,
# so the smaller class is not flattened by the larger one.
# `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(data=df, x='length', hue='class', fill=True, common_norm=False)
plt.xlim(0,220)
plt.show()

It can be seen that ham messages are shorter than spam messages as the distribution of ham and spam message lengths are centered around 30-40 and 155-160 characters, respectively.



In [None]:
df.head()

### classify with Naive Bayes on length



In [None]:
# Split the single "length" feature into train and test sets.
# reshape(-1, 1) turns the 1-D array of lengths into a one-column 2-D array (one row per message).
X = df['length'].values.reshape(-1, 1)
Y = df['target']
# hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=.3, random_state=325)
print(X_train.shape)
print(y_train.shape)

In [None]:
clf = MultinomialNB()
clf.fit(X_train, y_train);

In [None]:
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print("Accuracy : {} %".format(round(acc*100, 2)))

In [None]:
fig,ax = plt.subplots(1,1,figsize=(5, 5))
cmd=ConfusionMatrixDisplay(cm, display_labels=class_names) 
cmd.plot(ax=ax)
plt.title("Accuracy : {} %".format(round(acc*100, 2)))
plt.show()

Just using length, we were able to get **87%** accuracy, let's see if we can do better with NLP tools.  

* You can see the NB classifier is just classifying everything as **ham**.

### Use Natural Language Processing

### define our tokenizer

In [None]:
#define function to convert raw text into tokens as detailed above
    
# Built once, outside the function: cleanText is mapped over every message, and
# re-reading the stop-word list or rebuilding the stemmer on each call is wasted work.
_STEM_STOP_WORDS = set(nltk.corpus.stopwords.words('english'))
_PORTER_STEMMER  = nltk.stem.PorterStemmer()


def cleanText(raw_text): #tokenize, lowercase, remove stopwords, remove punctuation, stem
    """Turn one raw message into a space-joined string of stemmed tokens.

    Steps, in order: tokenize with NLTK's word_tokenize, lowercase, drop English
    stop words, drop tokens that are not purely alphabetic (punctuation, numbers,
    mixed tokens), then apply the Porter stemmer.

    Parameters
    ----------
    raw_text : str
        The raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces ('' if none survive).
    """
    tokens         = nltk.tokenize.word_tokenize(raw_text)                   #step 1 split into tokens
    tokens         = [ word.lower() for word in tokens ]                     #step 2 make all tokens lowercase
    tokens         = [ w for w in tokens if w not in _STEM_STOP_WORDS ]      #step 3 remove stop words
    tokens         = [ w for w in tokens if w.isalpha() ]                    #step 4 remove non-alpha tokens
    # step 5: stem. The original referenced an undefined name `porter` here,
    # which raised NameError on the first call.
    tokens         = [ _PORTER_STEMMER.stem( t ) for t in tokens ]
    return ' '.join(tokens)

In [None]:
# Run every message through cleanText and store the result next to the raw text.
df['clean'] = df['text'].map(cleanText)
df.head()

### Train test split

In [None]:
# splitting training data into train and test
Y = df['target']
X = df['clean']
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=.3, random_state=325)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

### TF-IDF

Machine learning algorithms cannot work with raw text directly. The text must be converted into numbers—more specifically, vectors of numbers.  We'll use TF-IDF which stands for Term Frequency-Inverse Document Frequency. 


##### max_df is used for removing terms that appear too frequently
* also known as "corpus-specific stop words". 

* **max_df** = 0.50 means "ignore terms that appear in more than 50% of the documents".
* **max_df** = 25 means "ignore terms that appear in more than 25 documents".
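* The default is **max_df=1.0** (a float), which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms. (An integer **max_df=1** would instead ignore every term that appears in more than one document.)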


##### **min_df** is used for removing terms that appear too infrequently. 

* **min_df** = 0.01 means "ignore terms that appear in less than 1% of the documents".
* **min_df** = 5 means "ignore terms that appear in less than 5 documents".
* The default **min_df=1**, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

In [None]:
tfidf        = TfidfVectorizer(min_df=0.01) #ignore in-frequently used terms
tfidf.fit(X_train);

In [None]:
X_train_vec  = tfidf.transform(X_train).toarray()

### classify with Naive Bayes



In [None]:
clf = MultinomialNB()
clf.fit(X_train_vec, y_train);

In [None]:
X_test_vec  = tfidf.transform(X_test).toarray()
y_pred = clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print("Accuracy : {} %".format(round(acc*100, 2)))

In [None]:
fig,ax = plt.subplots(1,1,figsize=(5, 5))
cmd=ConfusionMatrixDisplay(cm, display_labels=class_names) 
cmd.plot(ax=ax)
plt.title("Accuracy : {} %".format(round(acc*100, 2)))
plt.show()

NB is handling this spam filter with over **96%** accuracy!

### Find the Important features

How to determine the important features?

In [None]:
def getMostImportantFeaturesNB(clf,tfidf,N=20):
    """Print, for each class, the N features with the highest class-conditional probability.

    Parameters
    ----------
    clf : fitted naive Bayes classifier
        Must expose ``feature_log_prob_`` with one row per class and one column
        per feature.
    tfidf : fitted vectorizer
        Must expose ``get_feature_names_out()`` (or the legacy
        ``get_feature_names()``), giving names in the same column order.
    N : int, default 20
        Number of features to list per class; if fewer features exist, all are listed.

    Notes
    -----
    ``feature_log_prob_`` holds log-probabilities, which are <= 0, so the most
    probable features are the *largest* values. Ranking by absolute value, as the
    previous version did, listed the least probable features instead. Words that
    are frequent in every class can appear in more than one list.
    """
    # get_feature_names() was replaced by get_feature_names_out(); support both.
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    nclasses      = clf.feature_log_prob_.shape[0]
    features      = []
    for i in range(nclasses):
        feature_prob  = clf.feature_log_prob_[i] #class i
        indices       = np.argsort(feature_prob)[::-1] #most probable first
        # slicing (rather than range(N)) avoids IndexError when N > number of features;
        # str() keeps the printed list plain even when names come back as a NumPy array
        features.append([str(feature_names[j]) for j in indices[:N]])
    for i in range(nclasses):
        print("class %d important features"%i)
        print(features[i])

In [None]:
getMostImportantFeaturesNB(clf,tfidf)

# Example 2: amazon review sentiment

This dataset consists of reviews from amazon. The data span a period of 18 years and refer to cell phone and accessories reviewed by users.


* [http://snap.stanford.edu/data/web-Amazon.html](http://snap.stanford.edu/data/web-Amazon.html)
* J. McAuley and J. Leskovec. Hidden factors and hidden topics: understanding rating dimensions with review text. RecSys, 2013.

In [None]:
# Load the Amazon cell-phone review data from the course web server.
webaddr='http://public.gettysburg.edu/~jpuckett/ds325/data/'
# NOTE(review): the file name ends in .zip but it is read with compression='gzip' — confirm the archive's real format
df = pd.read_csv(webaddr+'amazon-cellphone.csv.zip',compression='gzip')
df.head(2)

In [None]:
df.isnull().sum()

Drop missing reviews.

In [None]:
df.dropna(inplace=True)

Rename our columns.

In [None]:
df.rename({'overall': 'class', 'reviewText': 'text'}, axis=1, inplace=True)

In [None]:
print(df['class'].value_counts())

In [None]:
ax =sns.countplot(x=df['class']); 
plt.show()

### Coarse-graining

The labels for the reviews are “fine-grained” sentiment labels ranging from 1 to 5: highly negative, negative, neutral, positive, and highly positive.

We are tackling a simplified version of this task which frequently appears in the literature: positive/negative
binary sentiment classification of sentences, with neutral sentences discarded from the dataset

* We remove the 3-star reviews as neutral to focus on 'positive' and 'negative' sentiment.
* Do we really expect 1-star and 2-star reviews to be very different?

In [None]:
#remove neutral 3's
# .copy() makes the filtered frame independent of the original, so adding new
# columns to it later does not trigger pandas' SettingWithCopyWarning.
df = df[df['class'] != 3].copy()

### encode the review into positive and negative based on rating

In [None]:
# Encoding 4s and 5s as 1 (positive sentiment) and 1s and 2s as 0 (negative sentiment)
df['target'] = np.where(df['class'] > 3, 1, 0)
class_names=['negative','positive']
print(df['target'].value_counts())

In [None]:
ax =sns.countplot(x=df['target']); 
ax.set_xticklabels(class_names)
plt.show()

In [None]:
target_count = df.target.value_counts()
print('Proportion:', round(target_count[1] / target_count.sum(), 2))

### This dataset is very unbalanced.

* For instance, if our model predicted every reviewer's response to be **positive**, the accuracy would be **86%**. 

### Resampling
A widely adopted technique for dealing with highly unbalanced datasets is called resampling. It consists of removing samples from the majority class (under-sampling) and / or adding more examples from the minority class (over-sampling).

* under-sampling
* over-sampling

<img src='https://raw.githubusercontent.com/rafjaa/machine_learning_fecib/master/src/static/img/resampling.png'></img>


Despite the advantage of balancing classes, these techniques also have their weaknesses (there is no free lunch). The simplest implementation of over-sampling is to duplicate random records from the minority class, which can cause overfitting. In under-sampling, the simplest technique involves removing random records from the majority class, which can cause loss of information.

Let's implement a basic example, which uses the DataFrame.sample method to get random samples from each class:


## Random under-sampling

In [None]:
# Divide by class
df_class_0    = df[df['target'] == 0] #class with fewer samples
df_class_1    = df[df['target'] == 1] #class with more samples
count_class_0 = len(df_class_0)
count_class_1 = len(df_class_1)

In [None]:
df_class_1_under = df_class_1.sample(count_class_0)
df_under         = pd.concat([df_class_1_under, df_class_0], axis=0)
print('Random under-sampling:')
print(df_under.target.value_counts())
#plot
ax = sns.countplot(x=df_under['target']); 
ax.set_xticklabels(class_names)
plt.show()

## Random over-sampling

In [None]:
df_class_0_over = df_class_0.sample(count_class_1, replace=True)
df_over         = pd.concat([df_class_0_over, df_class_1], axis=0)
print('Random over-sampling:')
print(df_over.target.value_counts())
#plot
ax = sns.countplot(x=df_over['target']); 
ax.set_xticklabels(class_names)
plt.show()

### define our tokenizer, clean the text, prepare for TFIDF

In [None]:
#define function to convert raw text into tokens as detailed above
# These resources are built once here and shared by every cleanText call;
# rebuilding them inside the function re-reads the stop-word list for each review,
# which adds up when mapping over the whole dataset.
tokenizer      = nltk.tokenize.word_tokenize
stop_words     = set(nltk.corpus.stopwords.words('english'))
wnl            = nltk.WordNetLemmatizer()


def cleanText(raw_text): #tokenize, lowercase, remove stopwords, removePunctuation, lemmatize
    """Turn one raw review into a space-joined string of lemmatized tokens.

    Steps, in order: tokenize, lowercase, drop English stop words, drop tokens
    that are not purely alphabetic, then lemmatize with WordNet. Uses the
    module-level ``tokenizer``, ``stop_words`` and ``wnl`` defined just above.

    Note: this definition replaces any earlier ``cleanText`` in the notebook.

    Parameters
    ----------
    raw_text : str
        The raw review text (must not be NaN).

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    tokens         = tokenizer(raw_text)                                #step 1 split into tokens
    tokens         = [ word.lower() for word in tokens ]                #step 2 lowercase
    tokens         = [ w for w in tokens if w not in stop_words ]       #step 3 remove stop words
    tokens         = [ w for w in tokens if w.isalpha() ]               #step 4 remove non-alpha tokens
    tokens         = [ wnl.lemmatize ( t ) for t in tokens ]            #step 5 lemmatize
    return ' '.join(tokens)

### re-check to make sure no NaN values

our NLTK tokenizer can't handle nans

In [None]:
df_under.isnull().sum()

In [None]:
# Cleaning every review is slow; expect this cell to take a few minutes.
df_under['clean']  = df_under['text'].map(cleanText)
# character count of each raw (uncleaned) review
df_under['length'] = df_under['text'].map(len)

In [None]:
df_under.head()

### train-test split

In [None]:
# splitting training data into train and test
Y = df_under['target']
X = [d for d in df_under['clean']]
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=.3, random_state=325)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

### TF-IDF

Machine learning algorithms cannot work with raw text directly. The text must be converted into numbers—more specifically, vectors of numbers.  We'll use TF-IDF which stands for Term Frequency-Inverse Document Frequency. 


In [None]:
tfidf        = TfidfVectorizer(min_df=0.001)
tfidf.fit(X_train);

In [None]:
X_train_vec  = tfidf.transform(X_train).toarray()
X_train_vec.shape

### classify with Naive Bayes



In [None]:
clf = MultinomialNB()
clf.fit(X_train_vec, y_train);

In [None]:
X_test_vec  = tfidf.transform(X_test).toarray()
y_pred = clf.predict(X_test_vec)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print("Accuracy : {} %".format(round(acc*100, 2)))

In [None]:
fig,ax = plt.subplots(1,1,figsize=(5, 5))
cmd=ConfusionMatrixDisplay(cm, display_labels=class_names) 
cmd.plot(ax=ax)
plt.title("Accuracy : {} %".format(round(acc*100, 2)))
plt.show()

#### Important features

In [None]:
def getMostImportantFeaturesNB(clf,tfidf,N=20):
    """Print, for each class, the N features with the highest class-conditional probability.

    Parameters
    ----------
    clf : fitted naive Bayes classifier
        Must expose ``feature_log_prob_`` with one row per class and one column
        per feature.
    tfidf : fitted vectorizer
        Must expose ``get_feature_names_out()`` (or the legacy
        ``get_feature_names()``), giving names in the same column order.
    N : int, default 20
        Number of features to list per class; if fewer features exist, all are listed.

    Notes
    -----
    ``feature_log_prob_`` holds log-probabilities, which are <= 0, so the most
    probable features are the *largest* values. Ranking by absolute value, as the
    previous version did, listed the least probable features instead. Words that
    are frequent in every class can appear in more than one list.
    """
    # get_feature_names() was replaced by get_feature_names_out(); support both.
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    nclasses      = clf.feature_log_prob_.shape[0]
    features      = []
    for i in range(nclasses):
        feature_prob  = clf.feature_log_prob_[i] #class i
        indices       = np.argsort(feature_prob)[::-1] #most probable first
        # slicing (rather than range(N)) avoids IndexError when N > number of features;
        # str() keeps the printed list plain even when names come back as a NumPy array
        features.append([str(feature_names[j]) for j in indices[:N]])
    for i in range(nclasses):
        print("class %d important features"%i)
        print(features[i])

In [None]:
getMostImportantFeaturesNB(clf,tfidf)

## Introducing the pipeline

### Use Bi-grams with TF-IDF

##### max_df is used for removing terms that appear too frequently
* also known as "corpus-specific stop words". 

* **max_df** = 0.50 means "ignore terms that appear in more than 50% of the documents".
* **max_df** = 25 means "ignore terms that appear in more than 25 documents".
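* The default is **max_df=1.0** (a float), which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms. (An integer **max_df=1** would instead ignore every term that appears in more than one document.)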


##### **min_df** is used for removing terms that appear too infrequently. 

* **min_df** = 0.01 means "ignore terms that appear in less than 1% of the documents".
* **min_df** = 5 means "ignore terms that appear in less than 5 documents".
* The default **min_df=1**, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

In [None]:
# Chain the vectorizer and the classifier so the grid search re-fits TF-IDF
# inside every cross-validation fold together with the model.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), #define our vectorizer
    ('clf', MultinomialNB()),      #define our classifier
])

# Keys are '<step name>__<parameter>'; every combination of the values is tried.
parameters = {
    # An integer min_df is a document count and must be >= 1; 1 (the default) keeps
    # every term. The previous integer 0 is outside the documented range and is
    # rejected by scikit-learn versions that validate parameters.
    'tfidf__min_df': (1,   0.001),
    'tfidf__max_df': (0.5, 1.0),
    'tfidf__max_features': [1000, None],
    'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
}

# 3-fold cross-validation over the grid, using all available cores
grid = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=3);

In [None]:
start_time = time.time()
grid.fit(X_train,y_train);
print("GridSearch took %3.2f seconds " % (time.time() - start_time)) #took about 72s

In [None]:
print("Best Score: ", grid.best_score_)
print("Best Params: ", grid.best_params_)

In [None]:
model = grid.best_estimator_  #should already be trained/fit
#model.fit(X_train,y_train)

In [None]:
y_pred=model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print("Accuracy : {} %".format(round(acc*100, 2)))


In [None]:
fig,ax = plt.subplots(1,1,figsize=(5, 5))
cmd=ConfusionMatrixDisplay(cm, display_labels=class_names) 
cmd.plot(ax=ax)
plt.title("Accuracy : {} %".format(round(acc*100, 2)))
plt.show()

## Application

Let's now use our model to make a function that will determine if a review is positive or negative.


#### Label probabilities for a sentence
The classification is essentially $$P(\text{class}_1|\text{review})$$. 

To find out, we need to use the **.predict_proba** method instead of the usual .predict. 

The cells below demonstrate how to find the probability estimates assigned to either label for some reviews I scraped from Amazon.


In [None]:
def findSentiment(raw_text,tfidf,clf): #tfidf is our vectorizer, clf is our classifier
    """Print the predicted sentiment and its probability for each review.

    Parameters
    ----------
    raw_text : str or iterable of str
        One review, or a collection of reviews. A single string is treated as
        one review rather than being iterated character by character.
    tfidf : fitted vectorizer
        Used with ``transform`` only (never re-fitted here).
    clf : fitted classifier
        Must provide ``predict`` and ``predict_proba``; label 1 is reported as
        'positive', anything else as 'negative'.

    Prints one ``(label, probability)`` tuple per review, where the probability
    is the largest class probability rounded to 4 decimals. Returns None.
    """
    if isinstance(raw_text, str):
        raw_text = [raw_text]
    clean_text = [cleanText(rt) for rt in raw_text] #important to prepare data the same way
    tfidf_data = tfidf.transform(clean_text).toarray() #must just be transform, don't re-fit
    y_pred     = clf.predict(tfidf_data)
    y_prob     = clf.predict_proba(tfidf_data)
    for label, probs in zip(y_pred, y_prob):
        sentiment = 'positive' if label==1 else 'negative'
        print((sentiment, round(probs.max(),4)))

In [None]:
# Pull the fitted steps out of the pipeline by the names they were registered under.
tfidf      = model.named_steps['tfidf']    #vectorizer
clf        = model.named_steps['clf']      #classifier

In [None]:
#=============================   manually pulled from website
review1="It fits perfectly with my iPhone 12 and the quality looks good too!"
review2="Sturdy case as always. It's not small, and adds a fair bit to the tablet, but it protects extremely well and the stand that is built into the top is extremely handy. Gives you options on what angle to hold the tablet as well as horizontal and vertical positions. The screen protector built in has not affect the touch screen at all either."
review3="I really love it. The dust absorber helps a lot and remove all dust, and the screen protector is super easy to put on without any bubbles. Finally I had a glass-like screen and protect. See the pics attached, I love it and strongly recommend."
review4="One word: impressive. I like the mint green color.I did not attach the strap bc I don’t plan on walking around with an iPad on my shoulder. I think for the price you will not be disappointed."
goodReviews=[review1,review2,review3,review4]
findSentiment(goodReviews,tfidf,clf) #pass our model, vectorizer and classifier

In [None]:
#=============================   manually pulled from website   
review1="This is was a complete waste of money. The protective screen on the cover is super flimsy and wobbles and bubbles when you touch it. The rubber pay off the case doesn't sir on the plastic case right and you can't get at the volume and power button. Threw it in the trash."
review2="It looked good for a few days. But broke in a few days for a simple fall."
review3="The hand strap tore up after a day, terrible quality. Not worth the money."
review4="I believe this case will fit my needs, however when I rec'd the package, I noticed a couple of things were missing which makes me wonder if I rec'd a used item. Still waiting for a reply back. I was told by the manufacture when I reached out to them to contact the seller. So not to sure about the seller not taking responsibility with their product? I'm sure this will get worked out, but have to be honest with my review."
badReviews=[review1,review2,review3,review4]
findSentiment(badReviews,tfidf,clf) #pass our model, vectorizer and classifier

# Maximal likelihood estimates and text generation

We will use MLE to generate fake text.

We don't want to use the cleaned text above that was 'stemmed' or 'lemmatized', and we'd like to keep the stop words.

In [None]:
# lets just clean up the raw reviews by remove special characters and making every lower case.
X_mle = [re.sub("[()!#]", " ", d.lower() ) for d in df_under['text']]
X_mle[0]

In [None]:
#tokenize the words of each review
tokenized_text = [list(word_tokenize(x)) for x in X_mle]

In [None]:
n=2 #bigrams
train_data, padded_vocab = nltk.lm.preprocessing.padded_everygram_pipeline(n, tokenized_text)

Construct model from the n-gram padded vocabulary.

In [None]:
# NOTE(review): this rebinds `model`, which earlier held the fitted sklearn pipeline
# (grid.best_estimator_); cells that call model.predict or index model[0] will not
# work if re-run after this point.
model = nltk.lm.MLE(n)
# fit with the padded vocabulary so that the model knows the tokens added by padding (<s>, </s>, UNK etc)
model.fit(train_data, padded_vocab) 

In [None]:
model.counts['one'] # i.e. Count('one')

In [None]:
model.score("one")

In [None]:
model.counts[['one']]['click'] # i.e. Count('click'|'one'): how often 'click' follows 'one'

In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenize = TreebankWordDetokenizer().detokenize
def generate_sent(model, num_words, random_seed=42):
    """Generate text from `model` and return it as a single detokenized string.

    Iterates over ``model.generate(num_words, random_seed=random_seed)``,
    ignores every '<s>' token, stops at the first '</s>' token, and joins the
    remaining tokens with the module-level ``detokenize`` function.
    """
    kept = []
    generated = model.generate(num_words, random_seed=random_seed)
    for tok in generated:
        if tok == '</s>':   # end-of-sentence marker: stop here
            break
        if tok != '<s>':    # start-of-sentence padding is not part of the text
            kept.append(tok)
    return detokenize(kept)

Generate a sentence based on the text:

In [None]:
generate_sent(model, num_words=20, random_seed=0)

### Advantages of N-grams
1. It gives insight at different levels. (bigram, trigram, N-gram).
2. Simple and conceptually easy to understand.

### Disadvantages of N-grams
1. We may need to remove stop words to avoid noise in the results.
2. A count may not necessarily indicate importance to text or entity.