# Kickstarter Trend Analysis

In [1]:
from requests import get
from bs4 import BeautifulSoup
from bs4 import NavigableString
import pandas as pd
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
import matplotlib.pyplot as plt
%matplotlib inline
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from datetime import datetime
from random import randint
from datetime import datetime,date
from numpy import nan as Nan

### For Loop Progress Bar

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while showing a notebook progress widget.

    Parameters
    ----------
    sequence : iterable
        Items to iterate over. If it has no ``len()`` and ``size`` is not
        given, the total is unknown and ``every`` must be supplied.
    every : int, optional
        Refresh the widget on every ``every``-th item (the first item always
        refreshes). Defaults to 1 for up to 200 items, otherwise
        ``int(size / 200)``.
    size : int, optional
        Total number of items; defaults to ``len(sequence)`` when available.
    name : str
        Label shown in front of the counter.

    Yields
    ------
    Each item of ``sequence``, unchanged and in order.

    The bar is marked 'danger' if anything is raised while iterating (the
    exception is re-raised) and 'success' once the sequence is exhausted.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Work out whether the total is known.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    # Pick the refresh interval (or insist on one when the total is unknown).
    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    # Build the widget: a text label stacked above the bar.
    if unknown_length:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            refresh_due = index == 1 or index % every == 0
            if refresh_due and unknown_length:
                label.value = '{name}: {index} / ?'.format(name=name, index=index)
            elif refresh_due:
                progress.value = index
                label.value = u'{name}: {index} / {size}'.format(
                    name=name, index=index, size=size)
            yield record
    except BaseException:
        # Anything raised into the generator turns the bar red.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(name=name, index=str(index or '?'))

## 1) Web Scraping

### Declaring our URLs

In [3]:
# "Most funded" discover listings, one per Kickstarter category id.
# Each URL ends in page=1; the browsing function swaps that final character
# for the page number it is fetching.
tech_earth_mostfunded = 'https://www.kickstarter.com/discover/advanced?category_id=16&sort=most_funded&seed=2567796&page=1'
apps_earth_mostfunded = 'https://www.kickstarter.com/discover/advanced?category_id=332&sort=most_funded&seed=2567796&page=1'
software_earth_mostfunded = 'https://www.kickstarter.com/discover/advanced?category_id=51&sort=most_funded&seed=2567796&page=1'
web_earth_mostfunded = 'https://www.kickstarter.com/discover/advanced?category_id=342&sort=most_funded&seed=2567796&page=1'

# Listings that get scraped (tech_earth_mostfunded is defined above but not included).
urls = [
    apps_earth_mostfunded,
    software_earth_mostfunded,
    web_earth_mostfunded,
]
url = urls[0]

### Web scraping function

In [51]:
def _first_ok(*extractors):
    """Return the value of the first extractor that does not raise, else None."""
    for extract in extractors:
        try:
            return extract()
        except Exception:
            # Best-effort scraping: a missing tag or unparsable value just
            # means "try the next strategy". KeyboardInterrupt/SystemExit are
            # not Exception subclasses, so the run can still be interrupted.
            continue
    return None


def _regex_field(html_soup, pattern, index=0):
    """Return match number ``index`` of ``pattern`` in the serialised page HTML."""
    return re.findall(pattern, str(html_soup))[index]


def _parse_money(text):
    """Convert a scraped amount such as '$587,785' to an int.

    Commas are removed, then the currency prefix is skipped: its length is
    taken to be the number of alphabetic characters plus one symbol character
    (assumes amounts look like '$1,000' or 'CA$ 1,000' -- TODO confirm against
    live pages). Raises ValueError when what is left is not an integer.
    """
    cleaned = text.replace(',', '')
    prefix_len = sum(ch.isalpha() for ch in cleaned) + 1
    return int(cleaned[prefix_len:])


def _parse_count(text):
    """Return the first (possibly comma-grouped) integer found in ``text``."""
    return int(re.search(r'\d[\d,]*', text).group().replace(',', ''))


def scrape_kickstarter_project(url_project):
    """Scrape one Kickstarter project page into a flat list of fields.

    Every field is extracted on a best-effort basis: a primary lookup on the
    parsed HTML is tried first, then (where one exists) a regex fallback on
    the raw HTML, and the field is ``None`` when both fail.

    Parameters
    ----------
    url_project : str
        URL of the project page.

    Returns
    -------
    list
        ``[project, backers, pledged, goal, pct_funded, successful,
        funding_start_dt, funding_end_dt, live, location, category, tag,
        summary, description]``
    """
    # Stage HTML soup for scraping
    # NOTE(review): the request has no timeout, so a stalled connection blocks here.
    headers = {"Accept-Language": "en-US, en;q=0.5"}
    response = get(url_project, headers=headers)
    html_soup = BeautifulSoup(response.text, 'html.parser')

    # Project
    project = _first_ok(
        lambda: html_soup.find('a', class_='hero__link').text,
        lambda: _regex_field(html_soup, r'medium mb3">(.*?)</h2>'),
    )

    # Backers (both strategies now return an int, never the raw string)
    backers = _first_ok(
        lambda: _parse_count(
            html_soup.find('div', class_='NS_campaigns__spotlight_stats').b.text),
        lambda: _parse_count(
            _regex_field(html_soup, r'type-24-md medium soft-black"><span>(.*?)</span>')),
    )

    # Pledged (the fallback strips thousands separators too, so amounts
    # of 1,000 and above no longer fail int() and come back as None)
    pledged = _first_ok(
        lambda: _parse_money(html_soup.find('h3', class_='mb0').span.text),
        lambda: _parse_money(_regex_field(html_soup, r'class="soft-black">(.*?)</span>')),
    )

    # Goal
    goal = _first_ok(
        lambda: _parse_money(html_soup.find('div', class_='type-12').span.text),
        lambda: _parse_money(_regex_field(html_soup, r'class="money">(.*?)</span>')),
    )

    # Pct funded (None when either amount is missing or the goal is zero)
    try:
        pct_funded = float(pledged / goal)
    except (TypeError, ZeroDivisionError):
        pct_funded = None

    # Successful
    successful = None if pct_funded is None else pct_funded >= 1

    # Funding period
    try:
        times = html_soup.find('p', class_='f5').contents
        funding_start_dt = datetime.strptime(times[1].text, '%b %d, %Y')
        funding_end_dt = datetime.strptime(times[3].text, '%b %d, %Y')
    except Exception:
        funding_start_dt, funding_end_dt = None, None

    # Live: a project whose end date could not be read is treated as live
    if funding_end_dt is None:
        live = True
    else:
        live = funding_end_dt > datetime.now()

    # Location / Category / Tag all come from the same run of links.
    # With three links the first one is the tag, which shifts the other two by one.
    anchors = html_soup.find_all('a', class_='grey-dark')
    has_tag = len(anchors) == 3

    # Location
    location = _first_ok(
        lambda: anchors[1 if has_tag else 0].text.strip(),
        lambda: _regex_field(html_soup, r'class="ml1">(.*?)</span>', 1),
    )

    # Category
    category = _first_ok(
        lambda: anchors[2 if has_tag else 1].text.strip(),
        lambda: _regex_field(html_soup, r'class="ml1"><span>(.*?)</span>'),
    )

    # Tags
    tag = _first_ok(lambda: anchors[0].text.strip()) if has_tag else None

    # Summary
    summary = _first_ok(lambda: html_soup.find('span', class_='content').text.strip())

    # Description: join the text fragments with a space so that words from
    # adjacent elements are not glued together
    description = _first_ok(
        lambda: ' '.join(html_soup.find('div', class_='full-description').stripped_strings)
    )

    scrape = [project, backers, pledged, goal, pct_funded, successful, funding_start_dt,
              funding_end_dt, live, location, category, tag, summary, description]

    return scrape

### Web browsing function

In [66]:
def browse_kickstarter_results(urls):
    """Walk Kickstarter discover listings and scrape every project they link to.

    For each listing URL the total result count is read from the first page,
    every results page is then fetched in turn, and each project found on a
    page is passed to ``scrape_kickstarter_project``.

    Parameters
    ----------
    urls : iterable of str
        Discover-listing URLs. Each must end with a single-character page
        number (``...&page=1``), because that last character is replaced by
        the page being fetched.

    Returns
    -------
    pandas.DataFrame
        One row per scraped project: the fields returned by
        ``scrape_kickstarter_project`` plus a trailing ``url`` column.
        Built once from the accumulated rows with ``dtype=object``.
    """
    head = ['project','backers','pledged','goal','pct_funded','successful','funding_start_dt',
            'funding_end_dt','live','location','category','tag','summary','description','url']
    projects_per_page = 12
    request_headers = {"Accept-Language": "en-US, en;q=0.5"}
    rows = []

    # Loop through each URL
    for url in urls:

        # Stage HTML soup for scraping Projects page
        response = get(url, headers=request_headers)
        html_soup = BeautifulSoup(response.text, 'html.parser')

        # Store results count
        count_tag = html_soup.find('b', class_='count')
        if count_tag is None:
            # Without the count we cannot tell how many pages exist; skip this
            # listing instead of aborting the whole run.
            warn('No result count found on {}; skipping this listing'.format(url))
            continue
        results_count = count_tag.text.strip()
        results_count = re.sub(' projects','',results_count)
        results_count = re.sub(',','',results_count)
        results_count = int(results_count)

        # Number of pages = ceil(results / page size), so the final partial
        # page (and listings with fewer than one full page) are included.
        page_count = -(-results_count // projects_per_page)

        # Loop through results and scrape
        for page in log_progress(range(1, page_count + 1)):

            # Update URL with page count
            url_ = url[:-1] + str(page)

            # Open and close browser to keep jupyter from timing out
            browser = webdriver.Firefox()
            try:
                browser.get(url_)
                sleep(randint(1,2))
            finally:
                # Always release the browser, even if loading the page failed.
                browser.quit()

            # Stage HTML soup for scraping Projects page
            response_ = get(url_, headers=request_headers)
            html_soup_ = BeautifulSoup(response_.text, 'html.parser')

            # Find projects' relevant HTML tags
            container = html_soup_.find_all(lambda tag: tag.has_attr('data-pid'))

            # Loop through each page's results
            for card in container:

                # Extract URL for each result
                s = str(card)
                match = (re.search(r'https://www.kickstarter.com/projects/(.*?)&quot', s)
                         or re.search(r'https://www.kickstarter.com/projects/(.*?)"', s))
                if match is None:
                    warn('No project URL found in a result on {}; skipping it'.format(url_))
                    continue

                url_p = 'https://www.kickstarter.com/projects/' + match.group(1)
                url_p = re.sub('/description','',url_p)
                # Keep only scheme://host/projects/<creator>/<slug>
                if url_p.count('/') > 5:
                    url_p = '/'.join(url_p.split('/')[:6])

                # Scrape each project's page and record it with its URL
                scrape = scrape_kickstarter_project(url_p)
                scrape.append(url_p)
                rows.append(scrape)

    return pd.DataFrame(rows, columns=head, dtype=object)

### Load results of web scraping into DataFrame

In [None]:
# Run the full scrape (one Firefox launch plus one request per results page,
# then one request per project) and write the result straight to disk.
scrape_df = browse_kickstarter_results(urls)
scrape_df.to_csv(
    'kickstarter_tech_db',
    sep='\t',
    index=False,
)

### Save DataFrame to CSV

In [27]:
# Tab-separated dump of the scraped projects, without the row index.
scrape_df.to_csv(
    'kickstarter_tech_db',
    sep='\t',
    index=False,
)

### Load DataFrame from CSV

In [65]:
# Read the tab-separated dump back in and preview the first three rows.
df = pd.read_csv(
    'kickstarter_tech_db',
    sep='\t',
)
df.head(3)

Unnamed: 0,project,backers,pledged,goal,pct_funded,successful,funding_start_dt,funding_end_dt,live,location,category,tag,summary,description,url
0,"Fluent Forever, The App: Learn to *Think* in A...",4434.0,587785.0,250000.0,2.35114,True,2017-09-19 00:00:00,2017-10-19 00:00:00,False,"Chicago, IL",Apps,Project We Love,"Why learn to translate, when you can build flu...","Why learn to translate, when you can learn tot...",https://www.kickstarter.com/projects/gabrielwy...
1,Flag・free photo prints - forever!,5120.0,331949.0,10000.0,33.1949,True,2016-09-14 00:00:00,2016-10-28 00:00:00,False,"Venice, Los Angeles, CA",Apps,,An app that delivers 20 free photo prints a mo...,"Flag is currently available for iOS, you cando...",https://www.kickstarter.com/projects/flag/flag...
2,Devslopes - ANYONE Can Learn to Code,2149.0,192056.0,39500.0,4.862177,True,2016-04-19 00:00:00,2016-05-19 00:00:00,False,"Orem, UT",Apps,,Devslopes is the world's most effective and af...,Devslopes Game Development AcademyLater this y...,https://www.kickstarter.com/projects/912791163...


## 2) Data Cleansing

### Drop null datetimes

In [4]:
# Keep only rows with a funding start date. The explicit copy makes the result
# an independent frame, so the column assignments in the following cells do
# not write to a slice of the loaded data (SettingWithCopyWarning).
df = df[df['funding_start_dt'].notna()].copy()

### Convert date strings to datetimes

In [5]:
# Parse the date part (first 10 characters, YYYY-MM-DD) of both funding columns.
# Going through astype(str) keeps the cell re-runnable: values that are already
# datetimes are rendered back to text and parsed to the same date again.
for date_col in ('funding_start_dt', 'funding_end_dt'):
    df[date_col] = pd.to_datetime(df[date_col].astype(str).str[:10],
                                  format='%Y-%m-%d')

### Fix the 'live' column by recomputing it from the funding end date

In [6]:
# A project counts as live unless its funding ended before the hard-coded
# cutoff of 2018-11-09 (presumably the date of the scrape -- TODO confirm).
df['live'] = ~(df['funding_end_dt'] < datetime(2018, 11, 9))

In [7]:
# Number of projects flagged as live
int(df['live'].eq(True).sum())

4

### Replace NaN values with False in 'successful' column

In [8]:
# Missing 'successful' values become False. notna() detects every kind of
# missing value, whereas an identity test against numpy's nan object only
# matches that one specific object.
df['successful'] = (
    df['successful']
    .where(df['successful'].notna(), False)
    .astype(bool)
)