### Option 1: IndiGoGo GreenTech Success
**Question/need**: What is the liklihood of successfully finishing my Energy/GreenTech funding?  

**Description of my sample data**: Scrape profile URLs from IndiGoGo search/filter site, compile list of target URLs, then fetch data from list of target URLs   

**Characteristics of each entity**: Description word count, number of backers, goal, location, final/current funding, existence/number of videos, existence/number of photos.

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time, os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import preprocessing
from scipy import stats
import statsmodels.api as sm

In [2]:
 # path to the chromedriver executable
chromedriver = r"C:\Users\tyler\Documents\GitHub\chromedriver.exe"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver) #launch the browser

### Get List of URLs to Loop Through 

In [3]:
def open_page_and_click_load_more(indiegogo_query, x):
    '''
    click the "show more" button x number of times; 10 is suggested as that is 
    roughly how many projects there are.
    to be used in conjunction with get_url_list()
    '''
    driver.get(indiegogo_query)
    for i in range(0, x):
        time.sleep(2)
        # see if the 'Load More' button is still available (if size is 1)
        load_more_button = len(driver.find_elements_by_xpath("/html/body/div[2]/div/div/div[3]/explore-detail/div/div/div[3]/section[2]/div[3]/div[2]/div[1]/div/a"))
        # this if statement does not function as intended
        # but code will still run - revisit if possible
        if load_more_button > 0:
            # click it!
            driver.find_element_by_xpath("/html/body/div[2]/div/div/div[3]/explore-detail/div/div/div[3]/section[2]/div[3]/div[2]/div[1]/div/a").click()
    url_list = driver.find_elements_by_tag_name("a")
    return url_list

In [4]:
def get_url_list(url_list):
    '''
    parse list of target urls to be scraped
    '''
    return_list = []
    for url in url_list:
        project_link = url.get_attribute('href')
        if '/projects/' in project_link:
            return_list.append(project_link)
    return return_list

##### To run a shorter loop, change the number for click_load_more

In [5]:
# indiegogo_query='https://www.indiegogo.com/explore/energy-green-tech' # filtered for GreenTech only
#greentech only doesn't have many inprogress
# and only those projet in _prorgress have numbers on desired gaol
# to delineate who is successful, presently, must change the search
indiegogo_query = "https://www.indiegogo.com/explore/all?project_type=campaign&project_timing=all&sort=trending"
click_load_more = 4
url_list = open_page_and_click_load_more(indiegogo_query, click_load_more)
url_list = get_url_list(url_list)

### Make Objects to Parse and Loops Through List of URLs

In [6]:
def parse_url_text(url, soup):
    '''
    input: a beautifulsoup object from an IndiGogo url
    output: a list of data specific to each IndiGogo project
    '''
    
    # project title
    title = soup.find("div", class_="basicsSection-title is-hidden-tablet t-h3--sansSerif").text.strip()
    
    # amount
    amount = soup.find("span", class_="basicsGoalProgress-amountSold t-h5--sansSerif t-weight--bold").text.replace('$', '').replace(',', '').strip()
    
    #backers
    backers = soup.find("span", class_="basicsGoalProgress-claimedOrBackers").text.strip().replace(' backers', '')
    
    # % of goal
    percent_of_goal = soup.find("span", class_="basicsGoalProgress-progressDetails-detailsGoal-goalPercentageOrInitiallyRaised").text.replace('%', '').strip().split(' of ')[0]
    if len(percent_of_goal) > 8:
        return ''
    
    # number of campaigns / serial campaigner
    number_of_campaigns = soup.find("div", class_="basicsCampaignOwner-details-count").text.strip().replace(' ', '').replace('\n', '').replace('|', '').replace('Campaigns', '').replace('Campaign', '')
    
    # location
    location = soup.find("div", class_="basicsCampaignOwner-details-city").text.strip()
    
    # time left
    time_left = soup.find("div", class_="basicsGoalProgress-progressDetails-detailsTimeLeft column t-body--sansSerif t-align--right").text.replace(' ', '').replace(',', '').replace('[', '').replace(']', '').replace('daysleft','').strip()
    
    # give back structured data
    return [url, title, amount, backers, percent_of_goal, number_of_campaigns, location, time_left]

In [7]:
# loop through target URLs, keeping the time.sleep element
def get_url_text(url):
    '''
    input: a string representation of a single URL
    output: the full driver.page_source text of the provided URL
    '''
    driver.get(url) # to test this without querying a site, comment out this line...
    driver.page_source # ...and this line
    soup = BeautifulSoup(driver.page_source, 'lxml') # ... and this line
    time.sleep(2) # ... and this
    return(soup)

        # print(driver.page_source)

In [8]:
def loop_through_url_list(url_list):
    '''
    This function receives a list of URLs, and loops through those. For each URL
    it will call 2 other functions: get_url_text(), which performs the .get() function
    using selenium, and parse_url_text(), which parses a soup object and returns a
    list with parsed data from the same soup object.
    ---
    input: a list of string URLs and an empty dataframe
    output: a dataframe with data from each URL in the list
    '''
    # this declares the df shape
    columns = ['url', 'title', 'amount', 'backers', 'percent_of_goal', 
               'number_of_campaigns', 'location', 'time_left']
    df = pd.DataFrame(columns=columns, dtype='str')
    x=0
    for url in url_list:
        print('getting', x, 'of', len(url_list), url)
        url_text = get_url_text(url) # calls another function
        data = parse_url_text(url, url_text) # calls another function
        if data == '':
            pass
        else:
            df.loc[df.shape[0] + 1] = data
        x+=1
    return df

### Kick Off Data Gathering

In [13]:
df = loop_through_url_list(url_list[1:]) # can run a subset of this list if preferred

getting 0 of 599 https://www.indiegogo.com/projects/princube-the-world-s-smallest-mobile-color-printer/pica
getting 1 of 599 https://www.indiegogo.com/projects/ciga-design-z-series-mechanical-titanium-watch--4/pica
getting 2 of 599 https://www.indiegogo.com/projects/cash-grab-the-graphic-novel-by-cecil/pica
getting 3 of 599 https://www.indiegogo.com/projects/scary-sleepover-the-resurrection/pica
getting 4 of 599 https://www.indiegogo.com/projects/hyperjuice-world-s-first-100w-gan-usb-c-charger/pica
getting 5 of 599 https://www.indiegogo.com/projects/thermosage-7-in-1-circulation-enhancing-massage/pica
getting 6 of 599 https://www.indiegogo.com/projects/mr-charger-2-0-4-in-1-hybrid-charger/pica
getting 7 of 599 https://www.indiegogo.com/projects/shine-ultra-next-gen-portable-powerful-scanner/pica
getting 8 of 599 https://www.indiegogo.com/projects/incharge-6-the-swiss-army-knife-of-cables/pica
getting 9 of 599 https://www.indiegogo.com/projects/being-alive/pica
getting 10 of 599 https:/

AttributeError: 'NoneType' object has no attribute 'text'

**Save it**

In [None]:
# df.to_pickle("C:\\Users\\tyler\\Documents\\GitHub\\sf20_ds17\\curriculum\\project-02\\Project_Precipitation\\indigogo3.pkl")

df = pd.read_pickle("C:\\Users\\tyler\\Documents\\GitHub\\sf20_ds17\\curriculum\\project-02\\Project_Precipitation\\indigogo4.pkl")

**Adjusting Data Types**

In [None]:
df['amount'] = pd.to_numeric(df['amount'])
df['backers'] = pd.to_numeric(df['backers'])
df['number_of_campaigns'] = pd.to_numeric(df['number_of_campaigns'])
df['percent_of_goal'] = pd.to_numeric(df['percent_of_goal'])
df['time_left'] = pd.to_numeric(df['time_left'])

### Feature Generation and Outlier Removal

In [None]:
df['goal'] = df['amount']/(0.0001+df['percent_of_goal']*.01) # add 0.0001 to avoid inifinite and NaN
df['title_length'] = df['title'].str.split().apply(len)

# split out to country only, then get_dummies
df['country'] = df['location'].str.split(',').str[1]
df = pd.get_dummies(df, columns = ['country'], drop_first=False)

This undoes a good bit of the *get_dummies()* but it should help the model

In [None]:
df['country_ Other'] = df['country_ Argentina'] + df['country_ Australia'] + df['country_ Canada'] + df['country_ Dorset']
+ df['country_ Ecuador'] + df['country_ Finland'] + df['country_ Georgia'] + df['country_ Germany']
+ df['country_ Italy'] + df['country_ Netherlands'] + df['country_ Panama'] + df['country_ Portugal']
+ df['country_ Sweden'] + df['country_ Turkey'] + df['country_ United Kingdom']

In [None]:
# df.append(removed_data) # to undo data removal
before = df.shape
df = df[df['amount'] < 60000]
print('before', before, 'after', df.shape)

In [None]:
goal_plot = df['goal']
sns.distplot(goal_plot)

**Drop the now-defunct columns**

In [None]:
df = df.drop(['country_ Argentina', 'country_ Australia', 'country_ Canada', 'country_ Dorset',
'country_ Ecuador', 'country_ Finland', 'country_ Georgia', 'country_ Germany',
'country_ Italy', 'country_ Netherlands', 'country_ Panama', 'country_ Portugal',
'country_ Sweden', 'country_ Turkey', 'country_ United Kingdom'], axis=1)

In [None]:
df = df.drop(['url', 'title', 'location', 'goal', 'title_length'], axis=1)

In [None]:
df.describe()

### Charts

In [None]:
# sns.pairplot(df);

In [None]:
axis_labels = ['Amount', 'Backers', 'Percent\nof Goal', 'Number of\nCampaigns', 'Time Left', 'Denamrk', 'United\nStates', 'Other\nCountry']
plt.figure(figsize=(12,12))
heat = sns.heatmap(df.corr(), square=True, annot=True, linewidths=0, cmap="BuGn_r", fmt='.2g', xticklabels=axis_labels, yticklabels=axis_labels, cbar=False)
heat.axes.set_title("IndieGoGo Feature Correlation",fontsize=40, color='w')
heat.tick_params(axis='both', labelsize=20, colors='white')
plt.xticks(rotation=90)
plt.yticks(rotation=360)
plt.savefig("heatmap.png", transparent=True)

In [None]:
corr = df.corr()
corr = corr.sort_values('amount', ascending=False)
corr

In [None]:
'''
might need to click "continue reading" 
---
Additional Feature Ideas:

Social Media Presence
1. Likes/Reactions
2. Retweets
3. Responses
Length of Story -- are more words better?
Count of Perks -- Is more perks more value?
Values of Perks -- Is there a perfect pricepoint?
Number of Pictures
Separate feature for 100% +++ -- what makes extreme success?
Project Tags -- is there an out performing tag?
'''

## Linear Regression

### With data as-is

In [None]:
X = df.iloc[:, :1]
y = df.iloc[:, 1:]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
model = LinearRegression()
model = model.fit(X_train, y_train)
# model.summary()
model.score(X_test, y_test)


In [None]:
predict = model.predict(X_train)
coef_list = list(zip(model.coef_, list(df.columns[1:])))
for coef in coef_list:
    print(coef)

### With scaled data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 2)
min_max_scaler = preprocessing.MinMaxScaler()
#scale values
min_max_scaler.fit(X_train)
X_train_scaled = min_max_scaler.transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

In [None]:
model_results_list = []

for i in range(52):    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=i)
    min_max_scaler = preprocessing.MinMaxScaler()
    #scale values
    min_max_scaler.fit(X_train)
    X_train_scaled = min_max_scaler.transform(X_train)
    X_test_scaled = min_max_scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    from sklearn.metrics import r2_score

    score = r2_score(y_test, y_pred)
    model_results_list.append(score)
    
import statistics
print(statistics.mean(model_results_list))

In [None]:
predict = model.predict(X_train_scaled)
coef_list = list(zip(model.coef_, list(df.columns[1:])))
for coef in coef_list:
    print(coef)

**Predictions vs Actuals**

In [None]:
plt.figure(figsize=(10, 10))
plt.title("Amount Raised",fontsize=40, color='w')
plt.scatter(y_test, y_pred, s=50, alpha=0.7, c='w')
plt.xlabel('Measured', fontsize=30, color='white')
plt.ylabel('Predicted', fontsize=30, color='white')
plt.tick_params(axis='both', labelsize=20, colors='white')
plt.plot(y_pred, y_pred, 'k--', lw=2)
ax = plt.gca()
ax.set_facecolor('xkcd:grey green')
plt.savefig("linechart.png", transparent=True)

## Lasso

In [None]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
y_pred = lasso.predict(X_test_scaled)
r2_score(y_test, y_pred)

In [None]:
predict = lasso.predict(X_test_scaled)

coef_list = list(zip(lasso.coef_, list(df.columns[1:])))
for coef in coef_list:
    print(coef)

### Ridge

In [None]:
ridge = Ridge(alpha=0.1)
ridge.fit(X_train_scaled, y_train)
y_pred = ridge.predict(X_test_scaled)
r2_score(y_test, y_pred)

### StatsModels

In [None]:
model = sm.OLS(X_train_scaled, y_train)
model = model.fit()
model.summary()