# Kickstarter project

In [0]:
# variables for tokenizing regarding to word embedding
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_WORDS = 20000

In [0]:
import numpy as np
import pandas as pd

## Preparation of data
The first part has functions that we used to clean the data offline

### Cleaning the kaggle dataset

Dataset 1: https://www.kaggle.com/kemical/kickstarter-projects#ks-projects-201801.csv

The dataset contained some rows that couldn't be parsed properly (more or less columns, improper punctuation),
so it had to be cleaned first by dropping the rows that didn't contain the necessary amount of columns.

In [0]:
def cleankaggle():
  data = []
  linecnt = 0
  with open("ks-projects-201801.csv", encoding='latin1') as f:
      for line in f.readlines():
          split_ = line.split(',')
          linecnt += 1
          if linecnt % 10000 == 0:
              print(linecnt)
          if len(split_) == 15:
              data.append(line)

  print("Added {} to output".format(len(data)))
  with open("test_newer.csv", 'w', encoding='latin1') as outf:
      outf.writelines(data)

### Cleaning and joining the webcrawler dataset
Dataset 2: https://webrobots.io/kickstarter-datasets/

The webcrawler dataset was split originally into 51 CSV files, so these had to be merged into one big CSV file.
The webcrawler also had rows that had more or less columns, so those had to be dropped too.

In [0]:
def cleanwebcrawler():
  import os
  from csv import reader, writer

  inputfolder = r"G:\Deep Learning"
  csv_filetemplate = "Kickstarter%03d.csv"
  first = True


  def clean_columns(split2):
      max_len = 37
      keep = [1, 16, 29, 30]
      temp = []
      for i in range(0, max_len):
          if i in keep:
              temp.append(split2[i])

      return temp


  for i in range(0, 51):
      linecnt = 0
      data = []
      fname = csv_filetemplate % i if i != 0 else "Kickstarter.csv"
      fname = inputfolder + os.sep + fname
      print("Opening and processing {}".format(fname))
      with open(fname, encoding='latin1') as f:
          csvreader = reader(f)
          for line in csvreader:
              if first:
                  split_ = line
                  print("First line: {}".format(split_))
                  split_ = clean_columns(split_)
                  print("Cleaned first line: {}".format(split_))
                  data.append(split_)
                  first = False
              else:
                  split_ = line
                  linecnt += 1
                  if linecnt % 1000 == 0:
                      print(linecnt)
                      print(len(split_))
                  if len(split_) == 37:
                      split_ = clean_columns(split_)
                      data.append(split_)
      print("Added {} to output from file {}".format(len(data), fname))
      with open("test_dataset2.csv", 'a', encoding='latin1', newline='') as outf:
          csvwriter = writer(outf)
          for row in data:
              csvwriter.writerow(row)

### Joining the two datasets on ID
We joined the two datasets using the ID columns.

In [0]:
def joinonids():
  from csv import reader, writer

  with open("test_newer.csv", encoding='latin1') as f:
      with open("test_dataset2.csv", encoding='latin1') as second:
          csvreader1 = reader(f)
          csvreader2 = reader(second)
          cache = dict()
          firstrow = None
          for row in csvreader1:
              if firstrow is None:
                  firstrow = row
              else:
                  cache[row[0]] = row

          print("Done caching IDs from first file.")
          output = list()
          for row in csvreader2:
              if firstrow is not None:
                  key = row.pop(1)
                  row.extend(firstrow)
                  output.append(row)
                  firstrow = None
              else:
                  if row[1] in cache.keys():
                      key = row.pop(1)
                      row.extend(cache[key])
                      del cache[key]
                      output.append(row)

  print("IDs found in test.csv = {}".format(len(cache) + len(output)))
  print("Matching IDs output = {}".format(len(output)))
  print("Writing out...")
  with open("test_joined.csv", 'w', encoding='latin1', newline='') as outf:
      csvwriter = writer(outf)
      for row in output:
          csvwriter.writerow(row)

## Github clone the dataset -> separate from normal github
Here we clone the github repository that has the processed CSV files in order to process them a bit further.

In [0]:
!git clone https://github.com/Strongkong/cleaned_kickstarted_dataset

In [0]:
folder = "./cleaned_kickstarted_dataset/"

## Add helper function so that prints dont clutter code if not necessary
This helper function helps reduce code cluttering a bit

In [0]:
def print_if(text,pred):
  if pred:
    print(text)

### Debug predicate, use this if you want debug output

In [0]:
isdebug = True

## Load csv file
This is a CSV file cloned from the github repo.

In [0]:
test_joined_df = pd.read_csv(folder+"test_joined.csv", encoding='utf-8', sep=',')

### Remove whitespaces from header names
Some header names have whitespaces so we remove them.

In [0]:
df = test_joined_df.rename(columns=lambda x: x.strip())


### Peak at head
(This is for debug only)

In [12]:
print_if(df.blurb[3], isdebug)

Für die Finanzierung der Veröffentlichung meiner Masterarbeit, die dann als eigenständiges Buch bei Turia&Kant erscheint.


In [13]:
print_if(df.head(), isdebug)

                                               blurb  spotlight  staff_pick  \
0  If you've ever been disheartened by how some p...       True       False   
1  This book will show you how to attract your id...       True       False   
2  I told my first lie when I was six and never s...       True       False   
3  Für die Finanzierung der Veröffentlichung mein...       True       False   
4  Six teens (age 13-15) are hiking VT's 272-mile...       True       False   

           ID                                               name    category  \
0    62545913  "How We've Changed Jesus" - Neal Samudre's Deb...  Nonfiction   
1  2031730466  Your Client Vision (Book) - How to sell withou...  Nonfiction   
2   614137516                        The Longest Lie I Ever Told  Nonfiction   
3  1164271753  Spurenlesen – Zur Philosophie der Human-Animal...  Nonfiction   
4   384486589  Teens Hike VT's Long Trail and Write How-To an...  Nonfiction   

  main_category currency    deadline    goal

In [14]:
print_if(df.head(), isdebug)

                                               blurb  spotlight  staff_pick  \
0  If you've ever been disheartened by how some p...       True       False   
1  This book will show you how to attract your id...       True       False   
2  I told my first lie when I was six and never s...       True       False   
3  Für die Finanzierung der Veröffentlichung mein...       True       False   
4  Six teens (age 13-15) are hiking VT's 272-mile...       True       False   

           ID                                               name    category  \
0    62545913  "How We've Changed Jesus" - Neal Samudre's Deb...  Nonfiction   
1  2031730466  Your Client Vision (Book) - How to sell withou...  Nonfiction   
2   614137516                        The Longest Lie I Ever Told  Nonfiction   
3  1164271753  Spurenlesen – Zur Philosophie der Human-Animal...  Nonfiction   
4   384486589  Teens Hike VT's Long Trail and Write How-To an...  Nonfiction   

  main_category currency    deadline    goal

## Data description
Here we look at some of the data inside the DataFrame

### Numeric values

In [15]:
df.describe()

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real
count,160317.0,160317.0,160317.0,160317.0,160317.0,160317.0,160317.0
mean,1075937000.0,47755.05,11376.83,132.164356,8970.394,10782.99,44044.14
std,618147100.0,1213108.0,86167.65,922.405757,74868.27,82306.88,1158859.0
min,18520.0,0.01,0.0,0.0,0.0,0.0,0.01
25%,540488100.0,1500.0,75.0,3.0,36.54,74.37,1569.69
50%,1079762000.0,5000.0,1291.0,23.0,870.0,1301.0,5000.0
75%,1609329000.0,14000.0,5731.0,79.0,4735.25,5695.0,12979.26
max,2147476000.0,100000000.0,10266850.0,91585.0,10266850.0,10266850.0,151395900.0


### Categories

In [16]:
categories = df.category.unique()
print_if(categories, isdebug)

['Nonfiction' 'Publishing' 'Mixed Media' 'Web' 'People' 'Comic Books'
 'Comedy' 'Punk' 'Technology' 'Theater' 'Jazz' 'Software' 'Anthologies'
 'Science Fiction' 'Illustration' 'Gadgets' 'Product Design' 'Farms'
 'Workshops' 'Footwear' 'Weaving' 'Sculpture' 'Experimental' 'Pet Fashion'
 'Art' 'Graphic Design' 'Civic Design' 'Tabletop Games' 'Calendars'
 'Design' 'Electronic Music' 'Installations' 'Family' 'Festivals'
 'Conceptual Art' 'Performance Art' 'Glass' 'Painting' 'Plays' 'Jewelry'
 'Letterpress' 'Public Art' 'Accessories' 'DIY' 'Small Batch'
 'Video Games' 'Fantasy' 'Music' 'Comics' 'Digital Art' 'Immersive'
 'Hardware' 'Radio & Podcasts' "Children's Books" 'Flight' 'Fine Art'
 'Musical' 'Rock' 'Apparel' 'Nature' 'Games' 'Crafts' 'Robots'
 'Stationery' 'DIY Electronics' 'Space Exploration' 'Apps'
 'Movie Theaters' 'Woodworking' 'Printing' 'Knitting' 'Crochet'
 'Camera Equipment' 'Wearables' 'Makerspaces' 'Pottery' 'Photography'
 'Fashion' 'Candles' 'Embroidery' 'Photobooks' 'Art

### Main categories

In [17]:
main_categories = df.main_category.unique()
print_if(main_categories, isdebug)

['Publishing' 'Art' 'Journalism' 'Photography' 'Comics' 'Film & Video'
 'Music' 'Technology' 'Theater' 'Design' 'Food' 'Dance' 'Fashion' 'Crafts'
 'Games']


### Currencies

In [18]:
currencies = df.currency.unique()
print_if(currencies, isdebug)

['USD' 'GBP' 'CAD' 'EUR' 'AUD' 'NZD' 'SEK' 'NOK' 'DKK' 'MXN' 'HKD' 'SGD'
 'CHF' 'JPY']


### States

In [19]:
states = df.state.unique()
print_if(states, isdebug)

['successful' 'live' 'failed' 'canceled' 'suspended']


### Country

In [20]:
countries = df.country.unique()
print_if(countries, isdebug)

['US' 'GB' 'CA' 'DE' 'AU' 'BE' 'FR' 'ES' 'NZ' 'SE' 'LU' 'NL' 'NO' 'AT'
 'DK' 'MX' 'IE' 'IT' 'HK' 'SG' 'CH' 'JP']


## Cleaning the dataset
Here we clean the dataset further by removing unnecessary columns, nulls.

### Get rid of unnecessary columns
We need the real values of the pledgings and goals, and staff-pick and spotlight are features that are not input values

In [0]:
# Not necessary: spotlight, staff_pick, goal, usd pledged, pledged
# Only successful projects get spotlight and staff_pick.
# Use usd_goal_real instead of goal, usd_pledged_real instead of pledged.
df = df[['blurb', 'ID', 'name', 'category', 'main_category', 'currency', 'deadline', 'usd_goal_real', 'launched', 'usd_pledged_real', 'state', 'backers', 'country']]

In [22]:
print_if(df.count()['ID'], isdebug)
print_if(df.count()['blurb'], isdebug)

160317
160307


In [23]:
# keep only notnull lines
df = df[df.notnull().all(axis=1)].reset_index(drop=True)
print_if(len(df), isdebug)

160307


### Keep only failed and successful
We keep only failed/successful rows, because the other types are different versions of failed

In [0]:
df = df.loc[(df['state'] == 'failed') | (df['state'] == 'successful')].reset_index(drop=True)

In [25]:
print_if(df.groupby('state').count()[['ID']], isdebug)

               ID
state            
failed      65918
successful  85646


## Scaling
We scale the dollar values down by a 1000.

In [0]:
# To avoid accidentally running this multiple times, I added a guard
was_scaled = False

In [0]:
if not was_scaled:
  df['usd_goal_real'] = df['usd_goal_real'].apply(lambda x: x/1000, 1)
  df['usd_pledged_real'] = df['usd_pledged_real'].apply(lambda x: x/1000, 1)
  was_scaled = True

## Feature Engineering
We have to encode the categorical features using one-hot encoding.

In [28]:
print_if(df.state.unique(), isdebug)

['successful' 'failed']


In [29]:
print_if(df.tail(), isdebug)

                                                    blurb          ID  \
151559  A full-color book of my best robot themed sket...  1219087950   
151560  [Kickstarter Gold] A culinary love story with ...  1161973724   
151561  Creating a tourism promotional calendar of the...   362841351   
151562  Jesus has saved my life so I am creating this ...   672477562   
151563  "Bash" is an action drama web series about a b...   782376245   

                                                     name   category  \
151559                          Mecha Zone Robot Sketches  Art Books   
151560  The Lotus and the Artichoke - INDIA vegan cook...      Vegan   
151561                           Best of the 1000 Islands     Places   
151562                     St. Christopher - God Is Great      Faith   
151563                                               Bash  Webseries   

       main_category currency    deadline  usd_goal_real             launched  \
151559    Publishing      USD  2016-11-03      

### Encode labels with OneHotEncoder

In [0]:
!pip install -U scikit-learn

In [0]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [0]:
categories_le = LabelEncoder()
categories_ohe = OneHotEncoder(sparse=False) # readable for humans

categories_led = categories_le.fit_transform(df.category)
categories_led = categories_led.reshape(-1, 1)

df_category = pd.DataFrame(categories_ohe.fit_transform(categories_led))
df_category = df_category.rename(columns=lambda x: 'category_' + str(x))
df = df.join(df_category)

In [0]:
maincategories_le = LabelEncoder()
maincategories_ohe = OneHotEncoder(sparse=False) # readable for humans

maincategories_led = maincategories_le.fit_transform(df.main_category)
maincategories_led = maincategories_led.reshape(-1, 1)

df_main_category = pd.DataFrame(maincategories_ohe.fit_transform(maincategories_led))
df_main_category = df_main_category.rename(columns=lambda x: 'maincategory_' + str(x))
df = df.join(df_main_category)

In [0]:
state_le = LabelEncoder()
state_ohe = OneHotEncoder(sparse=False) # readable for humans

state_led = state_le.fit_transform(df.state)
state_led = state_led.reshape(-1, 1)

df_state = pd.DataFrame(state_ohe.fit_transform(state_led))
df_state = df_state.rename(columns=lambda x: 'state_' + str(x))
df = df.join(df_state)

In [0]:
country_le = LabelEncoder()
country_ohe = OneHotEncoder(sparse=False) # readable for humans

country_led = country_le.fit_transform(df.country)
country_led = country_led.reshape(-1, 1)

df_country = pd.DataFrame(country_ohe.fit_transform(country_led))
df_country = df_country.rename(columns=lambda x: 'country_' + str(x))
df = df.join(df_country)

In [0]:
currency_le = LabelEncoder()
currency_ohe = OneHotEncoder(sparse=False) # readable for humans

currency_led = currency_le.fit_transform(df.currency)
currency_led = currency_led.reshape(-1, 1)

df_currency = pd.DataFrame(currency_ohe.fit_transform(currency_led))
df_currency = df_currency.rename(columns=lambda x: 'currency_' + str(x))
df = df.join(df_currency, rsuffix='_curr')

### Calculate length of funding
We calculate how many days the funding took.

In [0]:
diff = (pd.to_datetime(df.deadline) - pd.to_datetime(df.launched)).map(lambda x: x.days)


In [38]:
print_if(diff.head(), isdebug)

0     7
1    28
2    29
3    29
4    36
dtype: int64


### Drop date times
We don't need the original dates, we have now a number of days.

In [0]:
df = df.drop(['deadline', 'launched'], axis=1)


In [40]:
print_if(df.shape, isdebug)

(151564, 223)


### Add duration to the dataframe
We add how many days the kickstarter lasted to the dataframe

In [0]:
df['duration'] = diff

In [42]:
print_if(df.shape, isdebug)

(151564, 224)


In [43]:
print_if(df.head(), isdebug)

                                               blurb          ID  \
0  If you've ever been disheartened by how some p...    62545913   
1  This book will show you how to attract your id...  2031730466   
2  I told my first lie when I was six and never s...   614137516   
3  Für die Finanzierung der Veröffentlichung mein...  1164271753   
4  Six teens (age 13-15) are hiking VT's 272-mile...   384486589   

                                                name    category  \
0  "How We've Changed Jesus" - Neal Samudre's Deb...  Nonfiction   
1  Your Client Vision (Book) - How to sell withou...  Nonfiction   
2                        The Longest Lie I Ever Told  Nonfiction   
3  Spurenlesen – Zur Philosophie der Human-Animal...  Nonfiction   
4  Teens Hike VT's Long Trail and Write How-To an...  Nonfiction   

  main_category currency  usd_goal_real  usd_pledged_real       state  \
0    Publishing      USD        3.50000           3.65100  successful   
1    Publishing      GBP        0.21

## Clean and tokenize text

### Remove non-english text -> so we can find the stems, reducing unique words


In [0]:
!pip install langdetect

In [0]:
from langdetect import detect

#### Using langdetect to detect the language of text
Takes 15+ minutes for blurb only

In [0]:
from langdetect import DetectorFactory
DetectorFactory.seed = 123
def detect_or_idk(text):
  try:
    return detect(text)
  except:
    return 'idk'
import datetime

#~15 min
df['blurb_language'] = df.blurb.apply(lambda text: detect_or_idk(text))


#### Recognized languages

In [47]:
df.blurb_language.value_counts()

en     148574
es        704
fr        699
de        611
sv        166
it        144
nl        135
da        105
no         68
af         60
ca         56
ro         39
tl         38
pt         31
vi         20
so         17
idk        15
id         13
cy         13
et          9
sw          8
hr          8
pl          5
sl          5
hu          4
ru          3
sk          3
lt          2
sq          2
tr          2
cs          2
he          1
ja          1
fi          1
Name: blurb_language, dtype: int64

#### Only keeping english entries
We lose around 3000 entries total

In [0]:
df = df[df.blurb_language=='en']
df = df.drop('blurb_language', axis=1)

#### Final size of dataset

In [49]:
df.blurb.count()

148574

#### Export IDs of the english blurbs
We exported them so we don't have to wait for langdetect to finish

In [0]:
english_ids_df = df[['ID']]

In [0]:
english_ids_df.to_csv("english_ids.csv", encoding='utf-8', sep='\t', index=False)

We also save the dataframe before tokenizing the text values

In [0]:
df_onlyen = df.drop(['ID', 'category', 'main_category', 'state', 'country', 'currency'], axis=1)
df_onlyen.to_csv(path_or_buf='train_dataset_with_texts.csv', sep='\t', index=False)

### Attempt 1: Regex approach - not good
Yields suboptimal results

In [0]:
#import re
#regex = re.compile(r"[^\w\d ]", re.UNICODE | re.IGNORECASE)

#name = df.name
#blurb = df.blurb

#name = name.map(lambda x: regex.sub(' ', str(x)).lower())
#blurb = blurb.map(lambda x: regex.sub(' ', str(x)).lower())

#name2 = name.apply(lambda x: x.split())
#blurb2 = blurb.apply(lambda x: x.split())


### Attempt 2: Manual lowercase and removal of punctuations & non-alpha characters - not good
Does not work very well either, better to use a library for it, for example to create stem words from words.


#### Convert to lowercase

In [0]:
#token_name = name.apply(lambda row: [word.lower() for word in row])
#token_blurb = blurb.apply(lambda row: [word.lower() for word in row])

In [0]:
#print_if(token_name[:5], isdebug)
#print_if(token_blurb[:5], isdebug)

#### Remove punctuation

In [0]:
#import string
#table = str.maketrans('','', string.punctuation)
#token_name = token_name.apply(lambda row: [word.translate(table) for word in row])
#token_blurb = token_blurb.apply(lambda row: [word.translate(table) for word in row])

In [0]:
#print_if(token_name[:5], isdebug)
#print_if(token_blurb[:5], isdebug)

#### Remove non-alpha characters

In [0]:
#token_name = token_name.apply(lambda row: [word for word in row if word.isalpha()])
#token_blurb = token_blurb.apply(lambda row: [word for word in row if word.isalpha()])

In [0]:
#print_if(token_name[:5], isdebug)
#print_if(token_blurb[:5], isdebug)

### Attempt 3: Tokenize and clean with Pattern


In [0]:
!apt-get install libmysqlclient-dev

In [0]:
!pip install Pattern

In [0]:
from pattern.en import parse

#### Cleaning Function 
Finds the stem word, cleans punctuation and numbers and removes stop words

In [0]:
def clean_all(text):
  # find lemmata
  words  = [ word[-1] for word in parse(text, lemmata=True).split()[0] ]
  words = [word for word in words if word.isalpha()]
  words = [word for word in words if word not in stop_words]
  return words

#### Import NLTK and download english stopwords

In [0]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [0]:
stop_words = set(stopwords.words('english'))

#### Run the cleaning function on the name and blurb columns
Takes around 4 minutes to run!!

In [0]:
df['cleaned_name'] = df.name.apply(lambda name: clean_all(name))

In [0]:
df['cleaned_blurb'] = df.blurb.apply(lambda blurb: clean_all(blurb))

In [0]:
df_cleaned_out = df[['ID', 'cleaned_name', 'cleaned_blurb']]

In [68]:
df_cleaned_out.head()

Unnamed: 0,ID,cleaned_name,cleaned_blurb
0,62545913,"[change, jesus, neal, samudre, debut, book]","[ever, dishearten, person, might, represent, c..."
1,2031730466,"[client, vision, book, sell, without, selling]","[book, show, attract, ideal, client, get, buy,..."
2,614137516,"[longest, lie, ever, tell]","[tell, first, lie, six, never, stop]"
4,384486589,"[teen, hike, vt, long, trail, write, guide, book]","[six, teen, age, hike, vt, long, trail, writin..."
5,655402823,"[ako, ay, pilipino, coffee, table, book]","[ako, ay, pilipino, noon, ngayon, creative, co..."


In [0]:
df_cleaned_out.to_csv("test_cleaned_text.csv", encoding='utf-8', sep='\t', index=False)

In [0]:
# drop name and blurb, we use cleaned versions in the future
df = df.drop(['name', 'blurb'], axis=1)
df.rename(columns={'cleaned_name': 'name', 'cleaned_blurb': 'blurb'}, inplace=True)

### Create bag of words - turns out that it's not good, too
We use HashingVectorizer to create a bag of words.
Not very optimal and does not provide much value on its own.

#### Count seemingly unique words

In [0]:
from sklearn.feature_extraction.text import HashingVectorizer

In [72]:
name_vectorizer = HashingVectorizer(n_features=2**8, lowercase=False, tokenizer = lambda doc: doc)
blurb_vectorizer = HashingVectorizer(n_features=2**8, lowercase=False, tokenizer = lambda doc: doc)

hashed_name = name_vectorizer.transform(df.name) #name_cleaned
hashed_blurb = blurb_vectorizer.transform(df.blurb) #blurb_cleaned

print_if("names:", isdebug)
print_if(hashed_name.shape, isdebug)
print_if(hashed_name[0], isdebug)
print_if("\nblurbs:", isdebug)
print_if(hashed_blurb.shape, isdebug)
print_if(hashed_blurb[0], isdebug)

names:
(148574, 256)
  (0, 14)	0.4082482904638631
  (0, 96)	-0.4082482904638631
  (0, 99)	0.4082482904638631
  (0, 132)	-0.4082482904638631
  (0, 159)	-0.4082482904638631
  (0, 202)	0.4082482904638631

blurbs:
(148574, 256)
  (0, 4)	0.35355339059327373
  (0, 8)	-0.35355339059327373
  (0, 14)	0.35355339059327373
  (0, 25)	0.35355339059327373
  (0, 29)	0.35355339059327373
  (0, 72)	-0.35355339059327373
  (0, 144)	0.35355339059327373
  (0, 166)	0.35355339059327373


#### Create dataframe from scipy sparse matrix

In [73]:
df_hashed_name = pd.DataFrame(hashed_name.todense())
df_hashed_name = df_hashed_name.rename(columns=lambda x: 'name_' + str(x))
df_hashed_blurb = pd.DataFrame(hashed_blurb.todense())
df_hashed_blurb = df_hashed_blurb.rename(columns=lambda x: "blurb_" + str(x))

print_if("names:", isdebug)
print_if(df_hashed_name.head(), isdebug)
print_if("\nblurbs:", isdebug)
print_if(df_hashed_blurb.head(), isdebug)

names:
   name_0  name_1  name_2  name_3  name_4  name_5  name_6  name_7  name_8  \
0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

   name_9    ...     name_246  name_247  name_248  name_249  name_250  \
0     0.0    ...          0.0       0.0       0.0       0.0       0.0   
1     0.0    ...          0.0       0.0       0.0       0.0       0.0   
2     0.0    ...          0.0       0.0       0.0       0.0       0.0   
3     0.0    ...          0.0       0.0       0.0       0.0       0.0   
4     0.0    ...          0.0       0.0       0.0       0.0       0.0   

   name_251  name_252  name_253  name_254  name_255  
0       0.0       0.0       0.0      

#### Concat original matrix and hashed name matrix

In [0]:
df = df.join(df_hashed_name)
df = df.join(df_hashed_blurb)

In [75]:
print_if(df.shape, isdebug)

(148574, 736)


#### Dropping unused columns, they are onehot encoded or vectorized

In [0]:
drop_columns = ['ID', 'category', 'main_category', 'state', 'country', 'currency', 'name', 'blurb']
df = df.drop(drop_columns, axis=1)

In [77]:
print_if(df.head(), isdebug)

   usd_goal_real  usd_pledged_real  backers  category_0  category_1  \
0        3.50000           3.65100       80         0.0         0.0   
1        0.21486           0.33273       31         0.0         0.0   
2        2.22370           2.66845       74         0.0         0.0   
4        3.70000           4.05120       44         0.0         0.0   
5        6.00000           6.06760       30         0.0         0.0   

   category_2  category_3  category_4  category_5  category_6    ...      \
0         0.0         0.0         0.0         0.0         0.0    ...       
1         0.0         0.0         0.0         0.0         0.0    ...       
2         0.0         0.0         0.0         0.0         0.0    ...       
4         0.0         0.0         0.0         0.0         0.0    ...       
5         0.0         0.0         0.0         0.0         0.0    ...       

   blurb_246  blurb_247  blurb_248  blurb_249  blurb_250  blurb_251  \
0        0.0        0.0        0.0        0.0

In [0]:
df.to_csv(path_or_buf='train_dataset.csv', sep='\t', index=False)

### Attempt 4: Tokenize for word embedding -> we run it in ks_training.ipynb

In [0]:
# zip=folder+'train_dataset_with_texts.csv.zip'
# !unzip $zip

In [0]:
# # It turns out that the hashingvectorized name and blurb data does not improve
# # out model's accuracy, so we use the raw texts on another model later on.
# # We're restoring a previous state of out dataset to try another technique.
# df = pd.read_csv('train_dataset_with_texts.csv', sep='\t')

In [0]:
# df = df[['name', 'blurb']]

In [0]:
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences

In [0]:
# def implode(x):
#   ret = ""
#   for i in x:
#     ret = ret + str(i) + ","
#   ret = ret[:-1]
#   return ret

In [0]:
# # name fields
# name_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# name_tokenizer.fit_on_texts(df.name)
# name_sequences = name_tokenizer.texts_to_sequences(df.name)
# 
# name_word_index = name_tokenizer.word_index
# print('Found %s unique name tokens.' % len(name_word_index))
# 
# # padding the sequences to make their length same
# padded_name_sequence = pad_sequences(name_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [0]:
# # blurb fields
# blurb_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# blurb_tokenizer.fit_on_texts(df.blurb)
# blurb_sequences = blurb_tokenizer.texts_to_sequences(df.blurb)
# 
# blurb_word_index = blurb_tokenizer.word_index
# print('Found %s unique blurb tokens.' % len(blurb_word_index))
# 
# # padding the sequences to make their length same
# padded_blurb_sequence = pad_sequences(blurb_sequences, maxlen=MAX_SEQUENCE_LENGTH)