<img src="https://bdaaosu.org/img/Logo.png" width="60%"/>

# Text Classification Workshop

A note from the editor:

If you are new to Python, this may seem like a lot _all at once_. Don't worry if you don't understand everything right now! If you do not understand what a function does, simply run the following code:

`help(<FUNCTION NAME, WITHOUT PARENTHESES>)`

And you should get some helpful documentation back!

In [None]:
# Run these lines if you do not have sklearn and/or pandas installed!
!pip install sklearn
!pip install pandas

In [None]:
# Helper function - Run this, but don't necessarily worry about how it works!
def most_associated_words(tfidf, features, labels):
    """
    Print the most associated unigrams and bigrams for each class label.

    For every distinct value in `labels`, a chi-squared test is run between
    each feature column and the indicator "label == value". The N (5) terms
    with the highest chi-squared statistic are printed, lowest to highest.

    Params
    ------
    tfidf : fitted TfidfVectorizer object from sklearn.feature_extraction.text
    features : feature matrix returned from tfidf.fit_transform(...).toarray()
    labels : class labels from input data (one per row of `features`)

    Returns
    -------
    None - the results are printed.
    """
    from sklearn.feature_selection import chi2
    import numpy as np
    N = 5

    # Collect the distinct classes before converting, then use an array so the
    # element-wise comparison below also works when `labels` is a plain list.
    conditions = list(set(labels))
    label_array = np.asarray(labels)

    # The vocabulary does not change between classes, so look it up once.
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only for old versions that lack get_feature_names_out().
    if hasattr(tfidf, 'get_feature_names_out'):
        vocabulary = np.asarray(tfidf.get_feature_names_out())
    else:
        vocabulary = np.asarray(tfidf.get_feature_names())

    for cond in conditions:
        # chi2 returns (statistics, p-values); only the statistics are needed
        features_chi2 = chi2(features, label_array == cond)
        # argsort is ascending, so the strongest associations come last
        indices = np.argsort(features_chi2[0])
        feature_names = vocabulary[indices]
        unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
        bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
        print("# '{}':".format(cond))
        print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
        print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))
        print('\n')

## How important is a word?

### Term Frequency, Inverse Document Frequency
<img src="https://i.imgur.com/WkddTVo.png"/>

In [None]:
# Two tiny example "documents":
#   1. a document we might classify as being about "food"
#   2. a document we would probably classify as not being about food - "not food"
example = [
    'Perfectly cooked and seasoned',
    'Illustrious and a symbol of strength',
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer # Import the "TfidfVectorizer" from scikit-learn
import pandas as pd # Allows us to view data as a nicely formatted table!

# Make a TF-IDF vectorizer object
tfidf = TfidfVectorizer(
    lowercase=True, # Make every word in our documents lowercase
    stop_words='english' # Remove common words like "a", "is", "they", "with", etc.
)

# Transform the "documents" we have to a matrix of TF-IDF values
# (one row per document, one column per term)
features = tfidf.fit_transform(example).toarray()

print(
    # Create a dataframe from the matrix we got, with term labels
    pd.DataFrame(
        features,
        # Gets the names of the features (terms) from the tfidf object.
        # get_feature_names() was removed in scikit-learn 1.2;
        # get_feature_names_out() is its replacement.
        columns = tfidf.get_feature_names_out()
    )
)

### Let's put on our math hats

Why does the word "illustrious" have a TF-IDF value of 0.57735?

In [None]:
import numpy as np

# Reproduce by hand what TfidfVectorizer does (with its default settings)
# for one term, e.g. "illustrious", in our two-document example.

num_documents = 2 # We have two documents in our example
num_words = 3 # There are three unique/meaningful words in each document (does not include the meaningless "stopwords")
document_frequency = 1 # Each word appears in exactly one of the two documents

# scikit-learn uses the raw count of the term in the document as its "term frequency"
term_frequency = 1

# The "inverse document frequency", with scikit-learn's default smoothing:
# one is added to the document count and to the document frequency
inverse_document_frequency = np.log((1 + num_documents)/(1 + document_frequency))+1

# The un-normalized weight of one term in one document
raw_weight = term_frequency*inverse_document_frequency

# Finally, each document's row is scaled to have a length (L2 norm) of 1.
# All three words in a document have the same raw weight, so the length of the row is:
document_length = np.sqrt(num_words*raw_weight**2)

# Note: These values are only accurate to the example we have given,
# because each word is unique in each document.

tf_idf = raw_weight/document_length
print(tf_idf) # Equals 1/sqrt(3), about 0.57735 - exactly what we got back before!

### Okay, _cool math dude_, so now we have a huge matrix. 
### So, how do we use it to make classifications?

# What the f@!k is Multinomial Naive Bayes?

## Enter Bayes' Rule
<img src="https://miro.medium.com/max/512/0*EfYTXtTJ9X-Ua9Nh.png" />

## From our first (small) example
<img src="https://i.imgur.com/WnKCeD3.png" width="80%"/>

## Food or not food?
<img src="https://i.imgur.com/a83Evsd.png" width="55%"/>

## We're going to need some training data...

<img src="https://i.imgur.com/qXcsZPi.png" width="50%"/>

## Let's do some classification

In [87]:
# Load the listing descriptions together with the classifications we came up with
descriptions = pd.read_csv(filepath_or_buffer='iPhone Listing Descriptions.csv')

In [None]:
# How many descriptions did we put into each condition class?
descriptions['Condition'].value_counts()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer for the real descriptions
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2), # Consider both one word and two word combinations
    lowercase=True,
    min_df=0.05,
)

# Class labels, and the dense TF-IDF matrix built from the description text
labels = descriptions['Condition']
features = tfidf.fit_transform(descriptions['ItemDescription']).toarray()

In [None]:
# Print the unigrams and bigrams most associated with each condition category
most_associated_words(tfidf=tfidf, features=features, labels=labels)

## Train, train, train

In [98]:
from sklearn.model_selection import train_test_split # Split a dataset into training and test datasets
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer # Word counts, and word counts -> TF-IDF values
from sklearn.naive_bayes import MultinomialNB # Make a Multinomial Naive Bayes model

# Split the descriptions (inputs) and conditions (labels) into training and
# testing datasets; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    descriptions.ItemDescription,
    descriptions.Condition,
    random_state=0,
)

# Step 1: learn the vocabulary and turn the training text into word counts
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Step 2: re-weight those counts as TF-IDF values
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Step 3: train the Multinomial Naive Bayes model!
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

## Classify the real data!

In [None]:
# We need some data with values (prices) in it!
# Malformed rows are skipped with a warning instead of raising an error.
# (on_bad_lines replaces error_bad_lines, which was removed in pandas 2.0.)
iphone_listings = pd.read_csv('iPhone Listings.csv', on_bad_lines='warn')

In [None]:
# Peek at the first five rows of the listings
iphone_listings.head(n=5)

In [None]:
print(
    # Use the model we trained to predict labels for the first 100 listings!
    clf.predict(
        # The model was trained on TF-IDF values, so new text has to go
        # through the same two steps: word counts first, then TF-IDF values
        tfidf_transformer.transform(
            # Transform the listing descriptions into a matrix of counts
            count_vect.transform(
                iphone_listings.head(100).ItemDescription.tolist()
            )
        )
    )
)

In [None]:
# Make a new column in the real data with the predicted Condition
iphone_listings['Condition'] = clf.predict(
    # The model was trained on TF-IDF values, so new text has to go
    # through the same two steps: word counts first, then TF-IDF values
    tfidf_transformer.transform(
        # Transform the listing descriptions into a matrix of counts
        count_vect.transform(
            iphone_listings.ItemDescription.tolist()
        )
    )
)

In [None]:
# Count the listings we predicted for each condition class
iphone_listings['Condition'].value_counts()

# Let's try valuing a used iPhone

How about the:

 - Model: iPhone X
 - Storage Capacity: 256 GB
 - Color: Space Gray
 - Carrier: AT&T

In [None]:
# A description we might write for our own iPhone listing
iphone_listing_description = (
    "Very good - I've only had it since June. "
    "No scratches anywhere! "
    "It's been in a screen protector so you know it has to be in great condition!!!"
)

In [None]:
# Predict the condition for the iPhone listing description we made
prediction = clf.predict(
    # The model was trained on TF-IDF values, so new text has to go
    # through the same two steps: word counts first, then TF-IDF values
    tfidf_transformer.transform(
        # Transform the listing description into a matrix of counts
        # (wrapped in a list because transform expects a collection of documents)
        count_vect.transform(
            [iphone_listing_description]
        )
    )
)

prediction

In [None]:
# Remind ourselves what the first five rows of the listings look like
iphone_listings.head(5)

In [None]:
# Subset the listings to the phone we described, in the condition we predicted.
# `prediction` is an array holding a single label, so take its first element
# (calling int() on an array is deprecated in NumPy and fails for text labels).
# NOTE(review): the comparisons are case-sensitive; presumably the dataset stores
# carrier and color in lowercase - confirm against the data.
iphone_matches = iphone_listings[
    (iphone_listings.ModelName == 'iPhone X')
    & (iphone_listings.Carrier == 'at&t')
    & (iphone_listings.Storage == '256 GB')
    & (iphone_listings.Color == 'space gray')
    & (iphone_listings.Condition == prediction[0])
]

iphone_matches.Value.mean() # Get the "Value" column from the dataframe and then get the mean of the column