In [1]:
import re
from collections import Counter

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from scipy.stats import randint
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Load the spaCy pipeline named "en_core_web_lg" into the module-level `nlp` object.
# NOTE(review): this presumably requires the model package to be installed
# separately (e.g. `python -m spacy download en_core_web_lg`) -- confirm for the
# target environment. `nlp` is not referenced by any of the cells visible here.
nlp = spacy.load("en_core_web_lg")

In [42]:
# Name of the label column that is split off from the feature frames.
target = 'state'

# Free-text columns.
nlp_cols = [
    'blurb',
    'name',
]

# Categorical columns.
cat_cols = [
    'country',
    'spotlight',
    'staff_pick',
    'currency',
]

# Numeric columns.
# NOTE(review): 'converted_pledged_amount' and 'backers_count' look like values
# that are only known once a campaign has run -- confirm they do not leak the
# target before using them as model features.
num_cols = [
    'converted_pledged_amount',
    'backers_count',
    'goal',
]

# Timestamp columns.
date_cols = [
    'launched_at',
    'deadline',
]

In [45]:
# Read the pickled train / test frames from the working directory.
# NOTE: unpickling can execute arbitrary code -- only load files you produced
# yourself or otherwise trust.
df_train = pd.read_pickle('df_train.pkl')
df_test = pd.read_pickle('df_test.pkl')

# Separate the label column (`target`) from the remaining feature columns.
X_train = df_train.drop(columns=[target])
y_train = df_train[target]

X_test = df_test.drop(columns=[target])
y_test = df_test[target]

In [46]:
# Preview the first rows of the training feature frame.
X_train.head()

Unnamed: 0,backers_count,blurb,converted_pledged_amount,country,country_displayable_name,created_at,currency,currency_symbol,currency_trailing_code,current_currency,...,is_superbacker,avatar,urls,avatar.thumb,avatar.small,avatar.medium,urls.web.user,urls.api.user,campaign_length,pledge_pct_goal
1,146,Help us finish a feature film about two strang...,10120,US,the United States,1327602001,USD,$,True,USD,...,,,,https://ksr-ugc.imgix.net/assets/005/910/605/a...,https://ksr-ugc.imgix.net/assets/005/910/605/a...,https://ksr-ugc.imgix.net/assets/005/910/605/a...,https://www.kickstarter.com/profile/1939666693,https://api.kickstarter.com/v1/users/193966669...,49 days 10:58:34,1.012
2,41,Considered playfully mischievous and outlaws i...,3785,US,the United States,1447448673,USD,$,True,USD,...,,,,https://ksr-ugc.imgix.net/assets/007/087/961/6...,https://ksr-ugc.imgix.net/assets/007/087/961/6...,https://ksr-ugc.imgix.net/assets/007/087/961/6...,https://www.kickstarter.com/profile/roguecodan...,https://api.kickstarter.com/v1/users/101884698...,30 days 00:00:00,1.261667
3,466,A fantastical love story about two New Yorkers...,21634,US,the United States,1359739327,USD,$,True,USD,...,,,,https://ksr-ugc.imgix.net/assets/005/987/612/0...,https://ksr-ugc.imgix.net/assets/005/987/612/0...,https://ksr-ugc.imgix.net/assets/005/987/612/0...,https://www.kickstarter.com/profile/488675737,https://api.kickstarter.com/v1/users/488675737...,29 days 23:00:00,1.0817
4,70,A story about a girl named Jane who is learnin...,3081,US,the United States,1330026300,USD,$,True,USD,...,,,,https://ksr-ugc.imgix.net/assets/005/925/157/3...,https://ksr-ugc.imgix.net/assets/005/925/157/3...,https://ksr-ugc.imgix.net/assets/005/925/157/3...,https://www.kickstarter.com/profile/shakyballoon,https://api.kickstarter.com/v1/users/220105304...,29 days 15:38:13,1.027
5,166,A short film portraying the despair of a resea...,6115,GB,the United Kingdom,1408536568,GBP,£,False,USD,...,,,,https://ksr-ugc.imgix.net/assets/008/591/512/2...,https://ksr-ugc.imgix.net/assets/008/591/512/2...,https://ksr-ugc.imgix.net/assets/008/591/512/2...,https://www.kickstarter.com/profile/1567687816,https://api.kickstarter.com/v1/users/156768781...,30 days 01:00:00,2.038333


In [48]:
# Preview the first rows of the test feature frame.
# NOTE(review): the saved output of this cell is row-for-row identical to the
# saved X_train.head() output -- verify that df_test.pkl is not a copy of the
# training data and that the output is not stale.
X_test.head()

Unnamed: 0,backers_count,blurb,converted_pledged_amount,country,country_displayable_name,created_at,currency,currency_symbol,currency_trailing_code,current_currency,...,is_superbacker,avatar,urls,avatar.thumb,avatar.small,avatar.medium,urls.web.user,urls.api.user,campaign_length,pledge_pct_goal
1,146,Help us finish a feature film about two strang...,10120,US,the United States,1327602001,USD,$,True,USD,...,,,,https://ksr-ugc.imgix.net/assets/005/910/605/a...,https://ksr-ugc.imgix.net/assets/005/910/605/a...,https://ksr-ugc.imgix.net/assets/005/910/605/a...,https://www.kickstarter.com/profile/1939666693,https://api.kickstarter.com/v1/users/193966669...,49 days 10:58:34,1.012
2,41,Considered playfully mischievous and outlaws i...,3785,US,the United States,1447448673,USD,$,True,USD,...,,,,https://ksr-ugc.imgix.net/assets/007/087/961/6...,https://ksr-ugc.imgix.net/assets/007/087/961/6...,https://ksr-ugc.imgix.net/assets/007/087/961/6...,https://www.kickstarter.com/profile/roguecodan...,https://api.kickstarter.com/v1/users/101884698...,30 days 00:00:00,1.261667
3,466,A fantastical love story about two New Yorkers...,21634,US,the United States,1359739327,USD,$,True,USD,...,,,,https://ksr-ugc.imgix.net/assets/005/987/612/0...,https://ksr-ugc.imgix.net/assets/005/987/612/0...,https://ksr-ugc.imgix.net/assets/005/987/612/0...,https://www.kickstarter.com/profile/488675737,https://api.kickstarter.com/v1/users/488675737...,29 days 23:00:00,1.0817
4,70,A story about a girl named Jane who is learnin...,3081,US,the United States,1330026300,USD,$,True,USD,...,,,,https://ksr-ugc.imgix.net/assets/005/925/157/3...,https://ksr-ugc.imgix.net/assets/005/925/157/3...,https://ksr-ugc.imgix.net/assets/005/925/157/3...,https://www.kickstarter.com/profile/shakyballoon,https://api.kickstarter.com/v1/users/220105304...,29 days 15:38:13,1.027
5,166,A short film portraying the despair of a resea...,6115,GB,the United Kingdom,1408536568,GBP,£,False,USD,...,,,,https://ksr-ugc.imgix.net/assets/008/591/512/2...,https://ksr-ugc.imgix.net/assets/008/591/512/2...,https://ksr-ugc.imgix.net/assets/008/591/512/2...,https://www.kickstarter.com/profile/1567687816,https://api.kickstarter.com/v1/users/156768781...,30 days 01:00:00,2.038333


In [39]:
# NOTE(review): `test` is not assigned in any cell visible here, and this cell's
# execution count (39) is lower than that of the cells above it. On a fresh
# kernel this presumably raises NameError -- define `test` explicitly or drop
# this cell.
test

Unnamed: 0,launched_at,deadline
1,1329076826,1333349940
2,1452796491,1455388491


datetime.datetime(2012, 2, 12, 15, 0, 26)

In [None]:
def count(tokens):
    """
    Calculates basic frequency statistics about the tokens in a corpus
    (a corpus being a collection of tokenized documents).

    Parameters
    ----------
    tokens : sized iterable of iterables
        One entry per document; each entry is the iterable of hashable tokens
        (e.g. a list of strings) found in that document. `len(tokens)` is used
        as the number of documents.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token, sorted by `rank`, with the columns:
        - word: the token
        - appears_in: number of documents containing the token at least once
        - count: total number of occurrences across all documents
        - rank: 1-based rank by descending count (ties broken by first occurrence)
        - pct_total: count divided by the total number of token occurrences
        - cul_pct_total: running sum of pct_total in rank order
        - appears_in_pct: appears_in divided by the number of documents
    """
    # total occurrences of each token across the corpus
    word_counts = Counter()
    # number of documents that each token appears in
    appears_in = Counter()
    total_docs = len(tokens)

    for doc_tokens in tokens:
        # every appearance of a token counts
        word_counts.update(doc_tokens)
        # set() collapses duplicates so each document contributes at most once
        appears_in.update(set(doc_tokens))

    # build the word-count frame from a concrete list of (word, count) pairs
    wc = pd.DataFrame(list(word_counts.items()), columns=['word', 'count'])

    # rank the word counts, most frequent first
    wc['rank'] = wc['count'].rank(method='first', ascending=False)

    # share of all token occurrences held by each token (vectorized division)
    wc['pct_total'] = wc['count'] / wc['count'].sum()

    # cumulative share, accumulated in rank order
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    # per-token document statistics
    ac = pd.DataFrame(list(appears_in.items()), columns=['word', 'appears_in'])

    # merge word-count stats with document stats
    wc = ac.merge(wc, on='word')
    wc['appears_in_pct'] = wc['appears_in'] / total_docs

    return wc.sort_values(by='rank')


def my_tokenizer(text):
    """
    Splits a string into lowercase, letters-only tokens.

    Every character that is neither an ASCII letter nor whitespace is removed
    (digits, punctuation, accented letters, ...); the remainder is lowercased
    and split on runs of whitespace.

    Parameters
    ----------
    text : str
        The raw text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance; an empty list for empty or
        letter-free input.
    """
    # Keep *all* whitespace (not only ' ') while stripping other characters:
    # otherwise words separated by a tab or newline would be glued together
    # ("a\nb" -> "ab") before split() gets a chance to separate them.
    clean_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return clean_text.lower().split()


In [None]:
def plot_frequency_distribution_of_ngrams(sample_texts,
                                          ngram_range=(1, 2),
                                          num_ngrams=50,
                                          title='Frequency distribution of n-grams'):
    """Plots the frequency distribution of n-grams.

    # Arguments
        sample_texts: list, sample texts.
        ngram_range: tuple (min, max), The range of n-gram values to consider.
            Min and max are the lower and upper bound values for the range.
        num_ngrams: int, number of n-grams to plot.
            Top `num_ngrams` frequent n-grams will be plotted.
        title: str, title shown above the bar chart.

    # Returns
        A tuple (ngrams, counts): the plotted n-grams, most frequent first,
        and their corpus-wide counts in the same order.
    """
    # Create args required for vectorizing.
    kwargs = {
        'ngram_range': ngram_range,
        'dtype': 'int32',
        'stop_words': 'english',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': 'word',  # Split text into word tokens.
    }
    vectorizer = CountVectorizer(**kwargs)

    # This creates a vocabulary (dict, where keys are n-grams and values are
    # indices). This also converts every text to an array the length of
    # vocabulary, where every element indicates the count of the n-gram
    # corresponding to that index in vocabulary.
    vectorized_texts = vectorizer.fit_transform(sample_texts)

    # This is the list of all n-grams in the index order from the vocabulary.
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        all_ngrams = vectorizer.get_feature_names_out().tolist()
    else:
        all_ngrams = list(vectorizer.get_feature_names())
    num_ngrams = min(num_ngrams, len(all_ngrams))

    # Add up the counts per n-gram, i.e. column-wise. Flattening through
    # np.asarray works whether the sum comes back as a 1 x V matrix or as a
    # 1-D array.
    all_counts = np.asarray(vectorized_texts.sum(axis=0)).ravel().tolist()

    # Sort (count, n-gram) pairs by descending frequency (ties fall back to
    # reverse alphabetical order of the n-gram) and keep the top `num_ngrams`.
    top_pairs = sorted(zip(all_counts, all_ngrams), reverse=True)[:num_ngrams]
    ngrams = [ngram for _, ngram in top_pairs]
    counts = [ngram_count for ngram_count, _ in top_pairs]

    idx = np.arange(num_ngrams)
    plt.figure(figsize=(14, 6))
    plt.bar(idx, counts, width=0.8, color='b')
    plt.xlabel('N-grams')
    plt.ylabel('Frequencies')
    plt.title(title)
    plt.xticks(idx, ngrams, rotation=45)
    plt.show()
    return ngrams, counts


def plot_sample_length_distribution(sample_texts):
    """Draws a 50-bin histogram of the lengths of the given samples.

    # Arguments
        sample_texts: list, sample texts. The length of a sample is `len(s)`,
            i.e. the number of characters when the samples are strings.
    """
    lengths = list(map(len, sample_texts))

    plt.hist(lengths, bins=50)
    plt.title('Sample length distribution')
    plt.xlabel('Length of a sample')
    plt.ylabel('Number of samples')
    plt.show()