# Kickstarter Trend Analysis

In [2]:
import pandas as pd
from time import time
from random import randint
from IPython.core.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline
import re
from random import randint
from datetime import datetime,date
from numpy import nan as Nan

### Load DataFrame from CSV

In [4]:
df = pd.read_csv('kickstarter_tech_db',sep='\t')
df.head(3)

Unnamed: 0,project,backers,pledged,goal,pct_funded,successful,funding_start_dt,funding_end_dt,live,location,category,tag,summary,description,url
0,"Fluent Forever, The App: Learn to *Think* in A...",4434,587785.0,250000.0,2.35114,True,2017-09-19,2017-10-19,False,"Chicago, IL",Apps,Project We Love,"Why learn to translate, when you can build flu...","Why learn to translate, when you can learn tot...",https://www.kickstarter.com/projects/gabrielwy...
1,Flag・free photo prints - forever!,5120,331949.0,10000.0,33.1949,True,2016-09-14,2016-10-28,False,"Venice, Los Angeles, CA",Apps,,An app that delivers 20 free photo prints a mo...,"Flag is currently available for iOS, you cando...",https://www.kickstarter.com/projects/flag/flag...
2,Devslopes - ANYONE Can Learn to Code,2149,192056.0,39500.0,4.862177,True,2016-04-19,2016-05-19,False,"Orem, UT",Apps,,Devslopes is the world's most effective and af...,Devslopes Game Development AcademyLater this y...,https://www.kickstarter.com/projects/912791163...


## NLP

In [5]:
import nltk
import string
#nltk.download_shell()

In [25]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Create 'corpus' column

In [6]:
for col in df[['project','summary','description']]:
    df[col] = df[col].apply(lambda x: str(x))

df['corpus'] = df['project'] + ' ' + df['summary'] + ' ' + df['description']
df['corpus'] = df['corpus'].apply(lambda x: x.replace('nan ',''))

In [7]:
# We'll also need to clean up messy punctuations (ie when a punctuation is not followed by a space)
def pad_punct(c):
    if c in string.punctuation:
        c = ' '+c+' '
    return c

df['corpus'] = df['corpus'].apply(lambda x: ''.join([pad_punct(c) for c in str(x)]))

### Tokenize corpus

In [10]:
#from nltk.tokenize import PunktSentenceTokenizer
df['corpus_tokenized'] = df['corpus'].apply(nltk.word_tokenize)

In [11]:
df['corpus_tokenized'].head()

0    [Fluent, Forever, ,, The, App, :, Learn, to, *...
1    [Flag・free, photo, prints, -, forever, !, An, ...
2    [Devslopes, -, ANYONE, Can, Learn, to, Code, D...
3    [Dwell, Scripture, Listening, App, Our, Kickst...
4    [The, Disaster, Prediction, App, The, Disaster...
Name: corpus_tokenized, dtype: object

### POS Tagging, Chunking & Chinking of tokens

In [12]:
def process_content(tokenized):
    
    tree,words = [],[]
    
    try:
        for i in tokenized:
            
            i = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(i)
            chunkGram = r"""Chunk: {<.*>+} 
                                    }<DT|WRB|TO|IN|PRP.*|.|CD|POS|PDT|W.*?|MD|CC|RB.*>+{"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            tree.append(chunked)
            
        for branch in tree:
            for leaf in branch:
                if type(leaf) != tuple:
                    word_ = str(leaf)
                    word_ = re.sub('Chunk ','',word_)
                    word_ = re.split('/',word_)[0][1:]
                    words.append(word_)
        
    except Exception as e:
        tree.append(str(e))
    
    return words

In [13]:
df['corpus_chinked'] = df['corpus_tokenized'].apply(process_content)

### Remove stop words

In [14]:
from nltk.corpus import stopwords

stops = stopwords.words('english')
df['corpus_chinked'] = df['corpus_chinked'].apply(lambda x: [word for word in x if word.lower() not in stops])

In [15]:
import string

df['corpus_chinked'] = df['corpus_chinked'].apply(lambda x: [word for word in x if word not in string.punctuation])

In [16]:
df['corpus_chinked'].iloc[0][:8]

['Fluent', 'App', 'Learn', 'Think', 'Language', 'learn', 'translate', 'build']

### Lemmatize tokens

In [17]:
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

def multi_lemmatizer(word):
    lemma = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(word)):
        if tag.startswith("NN"):
            return lemma.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            return lemma.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            return lemma.lemmatize(word, pos='a')
        else:
            return word

In [18]:
df['corpus_lemmatized'] = df['corpus_chinked'].apply(lambda x: [multi_lemmatizer(word) for word in x])

In [19]:
df['corpus_lemmatized'].iloc[0][:5]

['Fluent', 'App', 'Learn', 'Think', 'Language']

### Stem tokens

In [20]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

df['corpus_stemmed'] = df['corpus_lemmatized'].apply(lambda x: [ps.stem(word) for word in x])

### Remove common words found in Kickstarter listings

In [21]:
stops_2 = ['app','apps','software','program','website','get','month','year','add','pledge','beta','shipping','deliver',
          'subscription','com','org','kickstarter','offer','online','goal','target','go','make','use','``','iphone',
          'android','demand','need','people','person','let','free','support','something','take','application','etc',
          'order','version','mobile','funding','detail','launch','product','project','phase','fund','thing','want',
          'everything','next','start','us','we','you','new','web','able','fund','funding','funded']

df['corpus_lemmatized'] = df['corpus_lemmatized'].apply(lambda x: [word for word in x if word.lower() not in stops_2])
df['corpus_lemmatized'] = df['corpus_lemmatized'].apply(lambda x: [word for word in x if len(word)>1])

### Remove Named Entities

In [22]:
def namedEnt(tokenized):
    
    tree,words = [],[]
    
    try:
        for i in tokenized:
            i = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(i)
            namedEnt = nltk.ne_chunk(tagged,binary=True)
            tree.append(namedEnt)
            
        for branch in tree:
            for leaf in branch:
                if type(leaf) != tuple:
                    word_ = str(leaf)
                    word_ = re.sub('NE ','',word_)
                    word_ = re.split('/',word_)[0][1:]
                    words.append(word_)
            
    except Exception as e:
        print(str(e))

    return words

In [23]:
df['corpus_namedEnt'] = df['corpus_lemmatized'].apply(namedEnt)

In [24]:
# Remove named entities from corpus_lemmatized
new_lems = []

for i in range(len(df)):
    a = df['corpus_lemmatized'].iloc[i]
    b = df['corpus_namedEnt'].iloc[i]
    new_lem = [word for word in a if word not in b]
    new_lems.append(new_lem)

df['corpus_lemmatized_noNE'] = new_lems

In [25]:
# Convert word to word.lower() in corpus_lemmatized
df['corpus_lemmatized_noNE'] = df['corpus_lemmatized_noNE'].apply(lambda x: [word.lower() for word in x])

### Frequency Distribution of tokens

In [26]:
df['corpus_top10'] = df['corpus_lemmatized'].apply(lambda x: nltk.FreqDist(x).most_common(20))

In [27]:
df['corpus_top10'].head()

0    [(language, 35), (learn, 24), (word, 13), (sen...
1    [(print, 26), (Flag, 19), (photo, 16), (ad, 10...
2    [(Devslopes, 5), (Game, 3), (Bootcamp, 3), (wo...
3    [(listen, 18), (Bible, 15), (audio, 15), (Dwel...
4    [(earthquake, 5), (Disaster, 4), (storm, 4), (...
Name: corpus_top10, dtype: object

In [28]:
df['corpus_top10_clean'] = df['corpus_top10'].apply(lambda x: [word for word,freq in x])

In [33]:
df['corpus_top10_clean'].head()

0    [language, learn, word, sentence, flashcard, S...
1    [print, Flag, photo, ad, send, help, paper, cr...
2    [Devslopes, Game, Bootcamp, work, bootcamp, HT...
3    [listen, Bible, audio, Dwell, voice, Scripture...
4    [earthquake, Disaster, storm, time, solar, dis...
Name: corpus_top10_clean, dtype: object

### Create Document-Term Matrix

To run any mathematical model on text corpus, it is a good practice to convert it into a matrix representation. LDA model looks for repeating term patterns in the entire DT matrix.

https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

In [30]:
import gensim
from gensim import corpora

In [31]:
# Revector corpora into lists of lists
df['corpus_lemmatized_revect'] = df['corpus_lemmatized'].apply(lambda x: [x[i:i+randint(3,5)] 
                                                                          for i in range(0,len(x),randint(3,5))])

# Create the term dictionary of our corpus, where every unique term is assigned an index
df['dictionary'] = df['corpus_lemmatized_revect'].apply(lambda x: corpora.Dictionary(x))

# Convert list of documents (corpus) into Document-Term Matrix using dictionary prepared above
matrices = []

for i in range(len(df)):
    matrix = [df['dictionary'].iloc[i].doc2bow(doc) for doc in df['corpus_lemmatized_revect'].iloc[i]]
    matrices.append(matrix)

df['doc_term_matrix'] = matrices

### Topic Modelling with Latent Dirichlet Allocation (LDA)
Next step is to create an object for LDA model and train it on Document-Term matrix. The training also requires few parameters as input which are explained in the above section. The gensim module allows both LDA model estimation from a training corpus and inference of topic distribution on new, unseen documents.

In [34]:
# Create the object for LDA model using gensim library
lda = gensim.models.ldamodel.LdaModel

# Run & Train LDA model on the document-term matrix
ldas = []

for i in range(len(df)):
    #df['dictionary'].iloc[i].filter_extremes(no_below=2, no_above=0.1)
    try:
        lda_ = lda(df['doc_term_matrix'].iloc[i], num_topics=3, id2word = df['dictionary'].iloc[i], passes=50)
    except:
        lda_ = None
    ldas.append(lda_)

df['ldamodel'] = ldas

In [35]:
df['ldamodel'].head()

0    LdaModel(num_terms=336, num_topics=3, decay=0....
1    LdaModel(num_terms=208, num_topics=3, decay=0....
2    LdaModel(num_terms=68, num_topics=3, decay=0.5...
3    LdaModel(num_terms=390, num_topics=3, decay=0....
4    LdaModel(num_terms=93, num_topics=3, decay=0.5...
Name: ldamodel, dtype: object

In [36]:
# Add LDA topics to DataFrame
topics = []

for i in range(len(df)):
    topics_ = []
    try:
        for j in df['ldamodel'].iloc[i].print_topics(num_topics=2, num_words=5):
            st = j[1]
            topics__ = " ".join(re.findall("[a-zA-Z]+", st))
            topics__ = re.sub('nan','',topics__).split()
            topics_ += topics__
        topics.append(topics_)
    except:
        topics.append(None)

df['ldatopics'] = topics

In [37]:
df['ldatopics'].iloc[0]

['list',
 'sentence',
 'trainer',
 'Spanish',
 'user',
 'language',
 'learn',
 'review',
 'work',
 'include']

### pyLDAvis
https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

pyLDAvis is designed to help users interpret the topics in a topic model that has been fit to a corpus of text data. The package extracts information from a fitted LDA topic model to inform an interactive web-based visualization.

* Saliency: a measure of how much the term tells you about the topic.
* Relevance: a weighted average of the probability of the word given the topic and the word given the topic normalized by the probability of the topic.
* The size of the bubble measures the importance of the topics, relative to the data.

First, we got the most salient terms, means terms mostly tell us about what’s going on relative to the topics. We can also look at individual topic.

In [38]:
import pyLDAvis.gensim

# First let's test for our first project
i = 0

lda = df['ldamodel'].iloc[i]
docterm = df['doc_term_matrix'].iloc[i]
dictionary = df['dictionary'].iloc[i]

lda_display  = pyLDAvis.gensim.prepare(lda, docterm, dictionary, sort_topics=True, lambda_step=1)
pyLDAvis.display(lda_display)

In [39]:
import pyLDAvis.gensim

pyLDAvis_topics = []

for i in range(len(df)): 
    
    ldas_ = []
    
    try:
        lda = df['ldamodel'].iloc[i]
        corpus = df['doc_term_matrix'].iloc[i]
        dictionary = df['dictionary'].iloc[i]
        
        lda_display  = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
        lda_display_df = lda_display[1]
        # Extract top 5 terms per topic (15 total)
        topic1 = list(lda_display_df[lda_display_df['Category'] == 'Topic1'].sort_values(['Freq'],ascending=False)['Term'][:5])
        topic2 = list(lda_display_df[lda_display_df['Category'] == 'Topic2'].sort_values(['Freq'],ascending=False)['Term'][:5])
        topic3 = list(lda_display_df[lda_display_df['Category'] == 'Topic3'].sort_values(['Freq'],ascending=False)['Term'][:5])

        ldas_ = topic1 + topic2 + topic3
        pyLDAvis_topics.append(ldas_)
        
    except:
        try:
            pyLDAvis_topics.append(df['ldatopics'].iloc[i][:5])
        except:
            pyLDAvis_topics.append(None)

df['pyLDAvis_topics'] = pyLDAvis_topics

In [40]:
df['pyLDAvis_topics'].head()

0    [language, learn, word, choice, additional, li...
1    [Flag, photo, print, frame, printing, print, p...
2    [Devslopes, cover, Unity, Academy, course, Pla...
3    [Bible, love, come, verse, playlist, audio, li...
4    [ObservatoryProject, Ben, direct, SpaceWeather...
Name: pyLDAvis_topics, dtype: object

Next we'll try to reduce our topics to a list of 3 most relevant terms

In [41]:
# Merge pyLDAvis and top 10 most frequent terms
# later, we'll find the top 3 most recurring terms which will correspond to the intersection of pyLDAvis topics and most frequent terms
def topics_transform(x):
    try:
        return [word.lower() for word in x]
    except:
        return x
    
df['pyLDAvis_topics_'] = df['pyLDAvis_topics'] #+ df['corpus_top10_clean']
df['pyLDAvis_topics_'] = df['pyLDAvis_topics_'].apply(topics_transform)

# Return top 3 topics by frequency/relevance
def top3_topics(x):
    try:
        return nltk.FreqDist(x).most_common(5)
    except:
        return x
    
df['pyLDAvis_topics_top3'] = df['pyLDAvis_topics_'].apply(top3_topics)

# Clean up format
def cleanup_topics(x):
    try:
        return [word for word,freq in x]
    except:
        return x
df['pyLDAvis_topics_top3_clean'] = df['pyLDAvis_topics_top3'].apply(cleanup_topics)

In [42]:
df['pyLDAvis_topics_top3_clean']

0             [language, learn, word, choice, additional]
1                   [photo, print, flag, frame, printing]
2              [devslopes, cover, unity, academy, course]
3                      [bible, listen, love, come, verse]
4       [disaster, observatoryproject, ben, direct, sp...
5                       [kid, code, cod, show, character]
6                 [learn, language, bus, italian, membus]
7                 [horse, body, connect, care, equisense]
8                   [yoga, level, ultimate, yogi, mental]
9            [feature, church, scripture, ministry, voir]
10        [service, provider, disabled, help, assistance]
11                  [math, prep, prestige, sponsor, test]
12                [village, autism, service, ask, friend]
13                       [gps, map, outdoor, update, ios]
14           [class, developer, time, development, cocoa]
15                      [dog, slobbr, pet, boston, owner]
16                   [course, developer, time, full, way]
17            

### Save locally to CSV

In [43]:
df.to_csv('kickstarter_tech_post_nlp_db',sep='\t',index = False)