In [1]:
from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from collections import Counter
import spacy
from sklearn.model_selection import train_test_split
import tqdm
import re
import numpy as np
import matplotlib.pyplot as plt

In [22]:
def get_unique(text_in, feature):
    """Returns the items that occur for only one of the two target classes.

    # Arguments
        text_in: dict-like, read at the keys `'successful_' + feature` and
            `'failed_' + feature`; each value is an iterable of hashable items.
        feature: str, suffix used to build the two lookup keys.

    # Returns
        A tuple `(success_out, fail_out)` of lists. `success_out` holds the
        items found only under the "successful" key, `fail_out` the items
        found only under the "failed" key. Both lists are de-duplicated and
        keep the order in which the items first appear in the input, so the
        result is reproducible and any ranking of the input is preserved.
    """
    target_class = ['successful', 'failed']
    success_key = target_class[0] + "_" + feature
    fail_key = target_class[1] + "_" + feature

    # dict.fromkeys de-duplicates like set() but remembers insertion order;
    # iterating a set of strings would give a different order on every run.
    success = dict.fromkeys(text_in[success_key])
    fail = dict.fromkeys(text_in[fail_key])

    success_out = [item for item in success if item not in fail]
    fail_out = [item for item in fail if item not in success]

    return success_out, fail_out

In [1]:
def plot_frequency_distribution_of_ngrams(sample_texts,
                                          ngram_range=(1, 2),
                                          num_ngrams=50,
                                          title='Frequency distribution of n-grams'):
    """Plots the frequency distribution of n-grams.

    # Arguments
        sample_texts: list, sample texts.
        ngram_range: tuple (min, max), The range of n-gram values to consider.
            Min and max are the lower and upper bound values for the range.
        num_ngrams: int, number of n-grams to plot.
            Top `num_ngrams` frequent n-grams will be plotted.
        title: str, title drawn above the bar chart.

    # Returns
        A tuple `(ngrams, counts)` of two lists of equal length: the plotted
        n-grams, most frequent first, and their total counts over
        `sample_texts`. Ties in count are ordered by n-gram, descending.
    """
    # Create args required for vectorizing.
    kwargs = {
        'ngram_range': ngram_range,
        'dtype': 'int32',
        'stop_words': 'english',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': 'word',  # Split text into word tokens.
    }
    vectorizer = CountVectorizer(**kwargs)

    # This creates a vocabulary (dict, where keys are n-grams and values are
    # indices). This also converts every text to an array the length of
    # vocabulary, where every element indicates the count of the n-gram
    # corresponding at that index in vocabulary.
    vectorized_texts = vectorizer.fit_transform(sample_texts)

    # This is the list of all n-grams in the index order from the vocabulary.
    # `get_feature_names` was removed in scikit-learn 1.2; use its replacement
    # when it exists and fall back only on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        all_ngrams = list(vectorizer.get_feature_names_out())
    else:
        all_ngrams = list(vectorizer.get_feature_names())
    num_ngrams = min(num_ngrams, len(all_ngrams))

    # Add up the counts per n-gram ie. column-wise. Flatten through ndarray so
    # the result is a plain list whether the sum comes back 1-D or as a
    # (1, n_features) matrix.
    all_counts = np.asarray(vectorized_texts.sum(axis=0)).ravel().tolist()

    # Sort n-grams and counts by frequency and get top `num_ngrams` ngrams.
    ranked = sorted(zip(all_counts, all_ngrams), reverse=True)[:num_ngrams]
    ngrams = [ngram for _, ngram in ranked]
    counts = [total for total, _ in ranked]

    idx = np.arange(num_ngrams)
    plt.figure(figsize=(14, 6))
    plt.bar(idx, counts, width=0.8, color='b')
    plt.xlabel('N-grams')
    plt.ylabel('Frequencies')
    plt.title(title)
    plt.xticks(idx, ngrams, rotation=45)
    plt.show()
    return ngrams, counts


def plot_sample_length_distribution(sample_texts):
    """Draws a 50-bin histogram of the lengths of the samples.

    # Arguments
        sample_texts: iterable of sized items (e.g. strings); `len()` of
            every item is what gets plotted.
    """
    lengths = list(map(len, sample_texts))

    plt.hist(lengths, bins=50)
    plt.title('Sample length distribution')
    plt.xlabel('Length of a sample')
    plt.ylabel('Number of samples')
    plt.show()

In [None]:
def count(tokens):
    """
    Summarises how often each token occurs across a tokenized corpus.

    `tokens` is a sized collection of documents, each document being an
    iterable of tokens. The result is a DataFrame sorted by `rank` with the
    columns word, appears_in, count, rank, pct_total, cul_pct_total and
    appears_in_pct.
    """
    n_docs = len(tokens)
    occurrences = Counter()  # every appearance of a token
    doc_freq = Counter()  # number of documents a token shows up in

    for doc in tokens:
        occurrences.update(doc)
        # a set collapses repeats, so a document adds at most 1 per token
        doc_freq.update(set(doc))

    # per-token totals, ranked from most to least frequent
    # (method='first' breaks ties by order of first appearance)
    freq = pd.DataFrame(list(occurrences.items()), columns=['word', 'count'])
    freq['rank'] = freq['count'].rank(method='first', ascending=False)

    # share of all token occurrences that belongs to each token
    grand_total = freq['count'].sum()
    freq['pct_total'] = freq['count'].apply(lambda n: n / grand_total)

    # running share, accumulated in rank order
    freq = freq.sort_values(by='rank')
    freq['cul_pct_total'] = freq['pct_total'].cumsum()

    # per-token document counts, joined onto the frequency table
    docs = pd.DataFrame(list(doc_freq.items()), columns=['word', 'appears_in'])
    stats = pd.merge(docs, freq, on='word')

    # fraction of documents that contain the token
    stats['appears_in_pct'] = stats['appears_in'].apply(lambda n: n / n_docs)

    return stats.sort_values(by='rank')


def my_tokenizer(text):
    """Lower-cases `text` and splits it into purely alphabetic tokens.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (so "don't" becomes "dont"), then the text is lower-cased and split on
    runs of whitespace.

    # Arguments
        text: str, the raw text to tokenize.

    # Returns
        list of str, the lower-cased tokens; empty for an empty string.
    """
    # Keep every whitespace character, not only ' ': deleting newlines or tabs
    # would glue the neighbouring words into one token ("foo\nbar" -> "foobar").
    clean_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return clean_text.lower().split()


In [None]:
def plot_and_add_hi_freq_feature(X_train_in, X_val_in, X_test_in, y_train, y_val, feature):
    """Plots per-class n-gram frequencies and runs `make_feature` on all splits.

    For each target label (0 and 1) the rows of `X_train_in` carrying that
    label are selected and the most frequent n-grams of column `feature` are
    plotted. The n-grams seen for only one of the two classes are collected
    with `get_unique`, and `make_feature` is then applied to the three splits
    concatenated together so that all of them are processed by the same call.

    # Arguments
        X_train_in, X_val_in, X_test_in: pandas objects (presumably
            DataFrames) that contain the column `feature`.
        y_train: labels for `X_train_in`; compared against 0 and 1 to build a
            boolean row mask, so it must align with `X_train_in`.
        y_val: not used; kept so that existing callers keep working.
        feature: str, name of the text column to analyse.

    # Returns
        A tuple `(X_train, X_val, X_test)`: the row slices of the
        `make_feature` output that correspond to each input split.
    """
    target_dict = {0: 'Successful', 1: 'Failed'}
    blurb = {}
    for label, target_rating in target_dict.items():
        idx = y_train == label
        X_i = X_train_in.loc[idx]
        title = 'Frequency Distribution of ngrams for ' + target_rating
        ngrams, counts = plot_frequency_distribution_of_ngrams(X_i[feature], title=title)
        # get_unique looks the classes up in lower case ('successful_ngrams',
        # 'failed_ngrams'), so the keys must be stored in lower case as well.
        key_prefix = target_rating.lower()
        blurb[key_prefix + "_ngrams"] = ngrams
        blurb[key_prefix + "_counts"] = counts

    # Get the unique ngrams with the highest frequency associated with each class and create numerical feature
    # with how often the highest frequency, unique ngram appeared in the description
    success_n_grams, failed_n_grams = get_unique(blurb, "ngrams")
    ngram_range = (1, 2)
    kwargs = {
        'ngram_range': ngram_range,
        'dtype': 'int32',
        'stop_words': 'english',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': 'word',  # Split text into word tokens.
    }

    vect = CountVectorizer(**kwargs)
    unique_ngrams = success_n_grams + failed_n_grams

    # Concatenation order is train, test, val; the slices below rely on it.
    n1, n2, n3 = len(X_train_in), len(X_test_in), len(X_val_in)
    X_combined = pd.concat([X_train_in, X_test_in, X_val_in])

    # NOTE(review): excel_uni_ngrams, good_uni_ngrams and poor_uni_ngrams are
    # not defined in this function; unless they exist as notebook globals this
    # call raises NameError. make_feature is defined outside this cell, so its
    # signature could not be checked - presumably it should receive
    # success_n_grams / failed_n_grams instead. TODO confirm and update.
    X_combined = make_feature(X_combined, vect, unique_ngrams, excel_uni_ngrams,
                              good_uni_ngrams, poor_uni_ngrams, feature)

    X_train = X_combined.iloc[0:n1]
    X_test = X_combined.iloc[n1:n1 + n2]
    X_val = X_combined.iloc[n1 + n2:n1 + n2 + n3]

    return X_train, X_val, X_test