# Kickstarter project

In [0]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

## Preparation of data
The first part has functions that we used to clean the data offline

### Cleaning the kaggle dataset

Dataset 1: https://www.kaggle.com/kemical/kickstarter-projects#ks-projects-201801.csv

The dataset contained some rows that couldn't be parsed properly (more or less columns, improper punctuation),
so it had to be cleaned first by dropping the rows that didn't contain the necessary amount of columns.

In [0]:
def cleankaggle():
  data = []
  linecnt = 0
  with open("ks-projects-201801.csv", encoding='latin1') as f:
      for line in f.readlines():
          split_ = line.split(',')
          linecnt += 1
          if linecnt % 10000 == 0:
              print(linecnt)
          if len(split_) == 15:
              data.append(line)

  print("Added {} to output".format(len(data)))
  with open("test_newer.csv", 'w', encoding='latin1') as outf:
      outf.writelines(data)

### Cleaning and joining the webcrawler dataset
Dataset 2: https://webrobots.io/kickstarter-datasets/

The webcrawler dataset was split originally into 51 CSV files, so these had to be merged into one big CSV file.
The webcrawler also had rows that had more or less columns, so those had to be dropped too.

In [0]:
def cleanwebcrawler():
  import os
  from csv import reader, writer

  inputfolder = r"G:\Deep Learning"
  csv_filetemplate = "Kickstarter%03d.csv"
  first = True


  def clean_columns(split2):
      max_len = 37
      keep = [1, 16, 29, 30]
      temp = []
      for i in range(0, max_len):
          if i in keep:
              temp.append(split2[i])

      return temp


  for i in range(0, 51):
      linecnt = 0
      data = []
      fname = csv_filetemplate % i if i != 0 else "Kickstarter.csv"
      fname = inputfolder + os.sep + fname
      print("Opening and processing {}".format(fname))
      with open(fname, encoding='latin1') as f:
          csvreader = reader(f)
          for line in csvreader:
              if first:
                  split_ = line
                  print("First line: {}".format(split_))
                  split_ = clean_columns(split_)
                  print("Cleaned first line: {}".format(split_))
                  data.append(split_)
                  first = False
              else:
                  split_ = line
                  linecnt += 1
                  if linecnt % 1000 == 0:
                      print(linecnt)
                      print(len(split_))
                  if len(split_) == 37:
                      split_ = clean_columns(split_)
                      data.append(split_)
      print("Added {} to output from file {}".format(len(data), fname))
      with open("test_dataset2.csv", 'a', encoding='latin1', newline='') as outf:
          csvwriter = writer(outf)
          for row in data:
              csvwriter.writerow(row)

### Joining the two datasets on ID
We joined the two datasets using the ID columns.

In [0]:
def joinonids():
  from csv import reader, writer

  with open("test_newer.csv", encoding='latin1') as f:
      with open("test_dataset2.csv", encoding='latin1') as second:
          csvreader1 = reader(f)
          csvreader2 = reader(second)
          cache = dict()
          firstrow = None
          for row in csvreader1:
              if firstrow is None:
                  firstrow = row
              else:
                  cache[row[0]] = row

          print("Done caching IDs from first file.")
          output = list()
          for row in csvreader2:
              if firstrow is not None:
                  key = row.pop(1)
                  row.extend(firstrow)
                  output.append(row)
                  firstrow = None
              else:
                  if row[1] in cache.keys():
                      key = row.pop(1)
                      row.extend(cache[key])
                      del cache[key]
                      output.append(row)

  print("IDs found in test.csv = {}".format(len(cache) + len(output)))
  print("Matching IDs output = {}".format(len(output)))
  print("Writing out...")
  with open("test_joined.csv", 'w', encoding='latin1', newline='') as outf:
      csvwriter = writer(outf)
      for row in output:
          csvwriter.writerow(row)

## Github clone the dataset -> separate from normal github
Here we clone the github repository that has the processed CSV files in order to process them a bit further.

In [5]:
!git clone https://github.com/Strongkong/cleaned_kickstarted_dataset

Cloning into 'cleaned_kickstarted_dataset'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 20 (delta 0), reused 3 (delta 0), pack-reused 16[K
Unpacking objects: 100% (20/20), done.


In [0]:
folder = "./cleaned_kickstarted_dataset/"

## Add helper function so that prints dont clutter code if not necessary
This helper function helps reduce code cluttering a bit

In [0]:
def print_if(text,pred):
  if pred:
    print(text)

### Debug predicate, use this if you want debug output

In [0]:
isdebug = True

## Load csv files
These are the CSV files cloned from the github repo.

In [0]:
test_joined_df = pd.read_csv(folder+"test_joined.csv", encoding='utf-8', sep=',')

### Remove whitespaces from header names
Some header names have whitespaces so we remove them.

In [0]:
df = test_joined_df.rename(columns=lambda x: x.strip())


## Peak at head
(This is for debug only)

In [13]:
print_if(df.blurb[3], isdebug)

Für die Finanzierung der Veröffentlichung meiner Masterarbeit, die dann als eigenständiges Buch bei Turia&Kant erscheint.


In [14]:
print_if(df.head(), isdebug)

                                               blurb  spotlight  staff_pick  \
0  If you've ever been disheartened by how some p...       True       False   
1  This book will show you how to attract your id...       True       False   
2  I told my first lie when I was six and never s...       True       False   
3  Für die Finanzierung der Veröffentlichung mein...       True       False   
4  Six teens (age 13-15) are hiking VT's 272-mile...       True       False   

           ID                                               name    category  \
0    62545913  "How We've Changed Jesus" - Neal Samudre's Deb...  Nonfiction   
1  2031730466  Your Client Vision (Book) - How to sell withou...  Nonfiction   
2   614137516                        The Longest Lie I Ever Told  Nonfiction   
3  1164271753  Spurenlesen – Zur Philosophie der Human-Animal...  Nonfiction   
4   384486589  Teens Hike VT's Long Trail and Write How-To an...  Nonfiction   

  main_category currency    deadline    goal

In [15]:
print_if(df.head(), isdebug)

                                               blurb  spotlight  staff_pick  \
0  If you've ever been disheartened by how some p...       True       False   
1  This book will show you how to attract your id...       True       False   
2  I told my first lie when I was six and never s...       True       False   
3  Für die Finanzierung der Veröffentlichung mein...       True       False   
4  Six teens (age 13-15) are hiking VT's 272-mile...       True       False   

           ID                                               name    category  \
0    62545913  "How We've Changed Jesus" - Neal Samudre's Deb...  Nonfiction   
1  2031730466  Your Client Vision (Book) - How to sell withou...  Nonfiction   
2   614137516                        The Longest Lie I Ever Told  Nonfiction   
3  1164271753  Spurenlesen – Zur Philosophie der Human-Animal...  Nonfiction   
4   384486589  Teens Hike VT's Long Trail and Write How-To an...  Nonfiction   

  main_category currency    deadline    goal

## Data description
Here we look at some of the data inside the DataFrame

### Numeric values

In [16]:
df.describe()

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real
count,160317.0,160317.0,160317.0,160317.0,160317.0,160317.0,160317.0
mean,1075937000.0,47755.05,11376.83,132.164356,8970.394,10782.99,44044.14
std,618147100.0,1213108.0,86167.65,922.405757,74868.27,82306.88,1158859.0
min,18520.0,0.01,0.0,0.0,0.0,0.0,0.01
25%,540488100.0,1500.0,75.0,3.0,36.54,74.37,1569.69
50%,1079762000.0,5000.0,1291.0,23.0,870.0,1301.0,5000.0
75%,1609329000.0,14000.0,5731.0,79.0,4735.25,5695.0,12979.26
max,2147476000.0,100000000.0,10266850.0,91585.0,10266850.0,10266850.0,151395900.0


### Categories

In [17]:
categories = df.category.unique()
print_if(categories, isdebug)

['Nonfiction' 'Publishing' 'Mixed Media' 'Web' 'People' 'Comic Books'
 'Comedy' 'Punk' 'Technology' 'Theater' 'Jazz' 'Software' 'Anthologies'
 'Science Fiction' 'Illustration' 'Gadgets' 'Product Design' 'Farms'
 'Workshops' 'Footwear' 'Weaving' 'Sculpture' 'Experimental' 'Pet Fashion'
 'Art' 'Graphic Design' 'Civic Design' 'Tabletop Games' 'Calendars'
 'Design' 'Electronic Music' 'Installations' 'Family' 'Festivals'
 'Conceptual Art' 'Performance Art' 'Glass' 'Painting' 'Plays' 'Jewelry'
 'Letterpress' 'Public Art' 'Accessories' 'DIY' 'Small Batch'
 'Video Games' 'Fantasy' 'Music' 'Comics' 'Digital Art' 'Immersive'
 'Hardware' 'Radio & Podcasts' "Children's Books" 'Flight' 'Fine Art'
 'Musical' 'Rock' 'Apparel' 'Nature' 'Games' 'Crafts' 'Robots'
 'Stationery' 'DIY Electronics' 'Space Exploration' 'Apps'
 'Movie Theaters' 'Woodworking' 'Printing' 'Knitting' 'Crochet'
 'Camera Equipment' 'Wearables' 'Makerspaces' 'Pottery' 'Photography'
 'Fashion' 'Candles' 'Embroidery' 'Photobooks' 'Art

### Main categories

In [18]:
main_categories = df.main_category.unique()
print_if(main_categories, isdebug)

['Publishing' 'Art' 'Journalism' 'Photography' 'Comics' 'Film & Video'
 'Music' 'Technology' 'Theater' 'Design' 'Food' 'Dance' 'Fashion' 'Crafts'
 'Games']


### Currencies

In [19]:
currencies = df.currency.unique()
print_if(currencies, isdebug)

['USD' 'GBP' 'CAD' 'EUR' 'AUD' 'NZD' 'SEK' 'NOK' 'DKK' 'MXN' 'HKD' 'SGD'
 'CHF' 'JPY']


### States

In [20]:
states = df.state.unique()
print_if(states, isdebug)

['successful' 'live' 'failed' 'canceled' 'suspended']


### Country

In [21]:
countries = df.country.unique()
print_if(countries, isdebug)

['US' 'GB' 'CA' 'DE' 'AU' 'BE' 'FR' 'ES' 'NZ' 'SE' 'LU' 'NL' 'NO' 'AT'
 'DK' 'MX' 'IE' 'IT' 'HK' 'SG' 'CH' 'JP']


## Cleaning the dataset
Here we clean the dataset further by removing unnecessary columns, nulls.

### Get rid of unnecessary columns
We need the real values of the pledgings and goals, and staff-pick and spotlight are features that are not input values

In [0]:
# not necessary: spotlight, staff_pick, goal, usd pledged, pledged
df = df[['blurb', 'ID', 'name', 'category', 'main_category', 'currency', 'deadline', 'usd_goal_real', 'launched', 'usd_pledged_real', 'state', 'backers', 'country']]

In [23]:
print_if(df.count()['ID'], isdebug)
print_if(df.count()['blurb'], isdebug)

160317
160307


In [24]:
df = df[df.notnull().all(axis=1)].reset_index(drop=True)
print_if(len(df), isdebug)

160307


### Keep only failed and successful
We keep only failed/successful rows, because the other types are different versions of failed

In [0]:
df = df.loc[(df['state'] == 'failed') | (df['state'] == 'successful')].reset_index(drop=True)

In [26]:
print_if(df.groupby('state').count()[['ID']], isdebug)

               ID
state            
failed      65918
successful  85646


## Output scaling
We scale the dollar values down by a 1000.

In [0]:
# To avoid accidentally running this multiple times, I added a guard
was_scaled = False

In [0]:
if not was_scaled:
  df['usd_goal_real'] = df['usd_goal_real'].apply(lambda x: x/1000, 1)
  df['usd_pledged_real'] = df['usd_pledged_real'].apply(lambda x: x/1000, 1)
  was_scaled = True

## Feature Engineering
We have to encode the categorical features using one-hot encoding.

In [29]:
print_if(df.state.unique(), isdebug)

['successful' 'failed']


In [30]:
print_if(df.tail(), isdebug)

                                                    blurb          ID  \
151559  A full-color book of my best robot themed sket...  1219087950   
151560  [Kickstarter Gold] A culinary love story with ...  1161973724   
151561  Creating a tourism promotional calendar of the...   362841351   
151562  Jesus has saved my life so I am creating this ...   672477562   
151563  "Bash" is an action drama web series about a b...   782376245   

                                                     name   category  \
151559                          Mecha Zone Robot Sketches  Art Books   
151560  The Lotus and the Artichoke - INDIA vegan cook...      Vegan   
151561                           Best of the 1000 Islands     Places   
151562                     St. Christopher - God Is Great      Faith   
151563                                               Bash  Webseries   

       main_category currency    deadline  usd_goal_real             launched  \
151559    Publishing      USD  2016-11-03      

### Encode labels with OneHotEncoder

In [31]:
!pip install -U scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/10/26/d04320c3edf2d59b1fcd0720b46753d4d603a76e68d8ad10a9b92ab06db2/scikit_learn-0.20.1-cp36-cp36m-manylinux1_x86_64.whl (5.4MB)
[K    100% |████████████████████████████████| 5.4MB 6.6MB/s 
Installing collected packages: scikit-learn
  Found existing installation: scikit-learn 0.19.2
    Uninstalling scikit-learn-0.19.2:
      Successfully uninstalled scikit-learn-0.19.2
Successfully installed scikit-learn-0.20.1


In [0]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [33]:
categories_le = LabelEncoder()
categories_ohe = OneHotEncoder(sparse=False) # readable for humans

categories_led = categories_le.fit_transform(df.category)
categories_led = categories_led.reshape(-1, 1)

df_category = pd.DataFrame(categories_ohe.fit_transform(categories_led))
df_category = df_category.rename(columns=lambda x: 'category_' + str(x))
df = df.join(df_category)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [34]:
maincategories_le = LabelEncoder()
maincategories_ohe = OneHotEncoder(sparse=False) # readable for humans

maincategories_led = maincategories_le.fit_transform(df.main_category)
maincategories_led = maincategories_led.reshape(-1, 1)

df_main_category = pd.DataFrame(maincategories_ohe.fit_transform(maincategories_led))
df_main_category = df_main_category.rename(columns=lambda x: 'maincategory_' + str(x))
df = df.join(df_main_category)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [35]:
state_le = LabelEncoder()
state_ohe = OneHotEncoder(sparse=False) # readable for humans

state_led = state_le.fit_transform(df.state)
state_led = state_led.reshape(-1, 1)

df_state = pd.DataFrame(state_ohe.fit_transform(state_led))
df_state = df_state.rename(columns=lambda x: 'state_' + str(x))
df = df.join(df_state)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [36]:
country_le = LabelEncoder()
country_ohe = OneHotEncoder(sparse=False) # readable for humans

country_led = country_le.fit_transform(df.country)
country_led = country_led.reshape(-1, 1)

df_country = pd.DataFrame(country_ohe.fit_transform(country_led))
df_country = df_country.rename(columns=lambda x: 'country_' + str(x))
df = df.join(df_country)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [37]:
currency_le = LabelEncoder()
currency_ohe = OneHotEncoder(sparse=False) # readable for humans

currency_led = currency_le.fit_transform(df.currency)
currency_led = currency_led.reshape(-1, 1)

df_currency = pd.DataFrame(currency_ohe.fit_transform(currency_led))
df_currency = df_currency.rename(columns=lambda x: 'currency_' + str(x))
df = df.join(df_currency, rsuffix='_curr')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


### Calculate length of funding
We calculate how many days the funding took.

In [0]:
diff = (pd.to_datetime(df.deadline) - pd.to_datetime(df.launched)).map(lambda x: x.days)


In [39]:
print_if(diff.head(), isdebug)

0     7
1    28
2    29
3    29
4    36
dtype: int64


### Drop date times
We don't need the original dates, we have now a number of days.

In [0]:
df = df.drop(['deadline', 'launched'], axis=1)


In [41]:
print_if(df.shape, isdebug)

(151564, 223)


### Add duration to the dataframe
We add how many days the kickstarter lasted to the dataframe

In [0]:
df['duration'] = diff

In [43]:
print_if(df.shape, isdebug)

(151564, 224)


In [44]:
print_if(df.head(), isdebug)

                                               blurb          ID  \
0  If you've ever been disheartened by how some p...    62545913   
1  This book will show you how to attract your id...  2031730466   
2  I told my first lie when I was six and never s...   614137516   
3  Für die Finanzierung der Veröffentlichung mein...  1164271753   
4  Six teens (age 13-15) are hiking VT's 272-mile...   384486589   

                                                name    category  \
0  "How We've Changed Jesus" - Neal Samudre's Deb...  Nonfiction   
1  Your Client Vision (Book) - How to sell withou...  Nonfiction   
2                        The Longest Lie I Ever Told  Nonfiction   
3  Spurenlesen – Zur Philosophie der Human-Animal...  Nonfiction   
4  Teens Hike VT's Long Trail and Write How-To an...  Nonfiction   

  main_category currency  usd_goal_real  usd_pledged_real       state  \
0    Publishing      USD        3.50000           3.65100  successful   
1    Publishing      GBP        0.21

## Clean and tokenize text

### Regex approach - not good
Yields suboptimal results

In [0]:
#import re
#regex = re.compile(r"[^\w\d ]", re.UNICODE | re.IGNORECASE)

#name = df.name
#blurb = df.blurb

#name = name.map(lambda x: regex.sub(' ', str(x)).lower())
#blurb = blurb.map(lambda x: regex.sub(' ', str(x)).lower())

#name2 = name.apply(lambda x: x.split())
#blurb2 = blurb.apply(lambda x: x.split())


### Remove non-english text -> so we can find the stems, reducing unique words


In [45]:
!pip install langdetect

Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/59/59/4bc44158a767a6d66de18c4136c8aa90491d56cc951c10b74dd1e13213c9/langdetect-1.0.7.zip (998kB)
[K    100% |████████████████████████████████| 1.0MB 23.3MB/s 
Building wheels for collected packages: langdetect
  Running setup.py bdist_wheel for langdetect ... [?25l- \ | done
[?25h  Stored in directory: /root/.cache/pip/wheels/ec/0c/a9/1647275e7ef5014e7b83ff30105180e332867d65e7617ddafe
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.7


In [0]:
from langdetect import detect

#### Using langdetect to detect the language of text
Takes 15+ minutes for blurb only

In [0]:
from langdetect import DetectorFactory
DetectorFactory.seed = 123
def detect_or_idk(text):
  try:
    return detect(text)
  except:
    return 'idk'
import datetime

#~15 min
df['blurb_language'] = df.blurb.apply(lambda text: detect_or_idk(text))


#### Recognized languages

In [48]:
df.blurb_language.value_counts()

en     148574
es        704
fr        699
de        611
sv        166
it        144
nl        135
da        105
no         68
af         60
ca         56
ro         39
tl         38
pt         31
vi         20
so         17
idk        15
id         13
cy         13
et          9
hr          8
sw          8
sl          5
pl          5
hu          4
ru          3
sk          3
lt          2
cs          2
sq          2
tr          2
he          1
ja          1
fi          1
Name: blurb_language, dtype: int64

#### Only keeping english entries
We lose around 3000 entries total

In [0]:
df_onlyen = df[df.blurb_language=='en']

#### Final size of dataset

In [50]:
df_onlyen.blurb.count()

148574

### Export IDs of the english blurbs
We exported them so we don't have to wait for langdetect to finish

In [0]:
df_tosave = df_onlyen.ID

In [0]:
df_tosave.to_csv("english_ids.csv", encoding='utf-8', sep=',', index=False)

### Tokenize and clean with Pattern


In [53]:
!apt-get install libmysqlclient-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libmysqlclient20 mysql-common
The following NEW packages will be installed:
  libmysqlclient-dev libmysqlclient20 mysql-common
0 upgraded, 3 newly installed, 0 to remove and 8 not upgraded.
Need to get 1,986 kB of archives.
After this operation, 11.4 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 mysql-common all 5.8+1.0.4 [7,308 B]
Get:2 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libmysqlclient20 amd64 5.7.24-0ubuntu0.18.04.1 [817 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libmysqlclient-dev amd64 5.7.24-0ubuntu0.18.04.1 [1,162 kB]
Fetched 1,986 kB in 1s (1,425 kB/s)
Selecting previously unselected package mysql-common.
(Reading database ... 26397 files and directories currently installed.)
Preparing to unpack .../mysql-common_5.8+1.0.4_all.deb

In [54]:
!pip install Pattern

Collecting Pattern
[?25l  Downloading https://files.pythonhosted.org/packages/1e/07/b0e61b6c818ed4b6145fe01d1c341223aa6cfbc3928538ad1f2b890924a3/Pattern-3.6.0.tar.gz (22.2MB)
[K    100% |████████████████████████████████| 22.3MB 1.7MB/s 
Collecting backports.csv (from Pattern)
  Downloading https://files.pythonhosted.org/packages/71/f7/5db9136de67021a6dce4eefbe50d46aa043e59ebb11c83d4ecfeb47b686e/backports.csv-1.0.6-py2.py3-none-any.whl
Collecting mysqlclient (from Pattern)
[?25l  Downloading https://files.pythonhosted.org/packages/f7/a2/1230ebbb4b91f42ad6b646e59eb8855559817ad5505d81c1ca2b5a216040/mysqlclient-1.3.14.tar.gz (91kB)
[K    100% |████████████████████████████████| 92kB 14.1MB/s 
Collecting lxml (from Pattern)
[?25l  Downloading https://files.pythonhosted.org/packages/03/a4/9eea8035fc7c7670e5eab97f34ff2ef0ddd78a491bf96df5accedb0e63f5/lxml-4.2.5-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K    100% |████████████████████████████████| 5.8MB 7.0MB/s 
[?25hCollecting feedparser

In [0]:
from pattern.en import parse

#### Cleaning Function 
Finds the stem word, cleans punctuation and numbers and removes stop words

In [0]:
def clean_all(text):
  # find lemmata
  words  = [ word[-1] for word in parse(text, lemmata=True).split()[0] ]
  words = [word for word in words if word.isalpha()]
  words = [word for word in words if word not in stop_words]
  return words

#### Convert to lowercase

In [0]:
token_name = token_name_first.apply(lambda row: [word.lower() for word in row])
token_blurb = token_blurb_first.apply(lambda row: [word.lower() for word in row])

In [0]:
print_if(token_name[:5], isdebug)
print_if(token_blurb[:5], isdebug)

#### Remove punctuation

In [0]:
import string
table = str.maketrans('','', string.punctuation)
token_name = token_name.apply(lambda row: [word.translate(table) for word in row])
token_blurb = token_blurb.apply(lambda row: [word.translate(table) for word in row])

In [0]:
print_if(token_name[:5], isdebug)
print_if(token_blurb[:5], isdebug)

#### Remove non-alpha characters

In [0]:
token_name = token_name.apply(lambda row: [word for word in row if word.isalpha()])
token_blurb = token_blurb.apply(lambda row: [word for word in row if word.isalpha()])

In [0]:
print_if(token_name[:5], isdebug)
print_if(token_blurb[:5], isdebug)

#### Import NLTK and download english stopwords

In [0]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [0]:
stop_words = set(stopwords.words('english'))

#### Run the cleaning function on the name and blurb columns
Takes around 4 minutes to run!!

In [0]:
df['cleaned_name'] = df.name.apply(lambda name: clean_all(name))

In [0]:
df['cleaned_blurb'] = df.blurb.apply(lambda blurb: clean_all(blurb))

In [0]:
df_cleaned_out = df[['ID', 'cleaned_name', 'cleaned_blurb']]

In [0]:
df_cleaned_out.head()

In [0]:
name_cleaned = token_name.apply(lambda wlist: [word for word in wlist if word not in stop_words] )
blurb_cleaned = token_blurb.apply(lambda wlist: [word for word in wlist if word not in stop_words] )


In [0]:
print_if(name_cleaned[:5], isdebug)
print_if(blurb_cleaned[:5], isdebug)

### Create bag of words
We use HashingVectorizer to create a bag of words.
Not very optimal and does not provide much value on its own.

#### Count seemingly unique words

In [0]:
from sklearn.feature_extraction.text import HashingVectorizer

In [0]:
name_vectorizer = HashingVectorizer(n_features=2**8, lowercase=False, tokenizer = lambda doc: doc)
blurb_vectorizer = HashingVectorizer(n_features=2**8, lowercase=False, tokenizer = lambda doc: doc)

hashed_name = name_vectorizer.transform(df.name) #name_cleaned
hashed_blurb = blurb_vectorizer.transform(df.blurb) #blurb_cleaned

print_if("names:", isdebug)
print_if(hashed_name.shape, isdebug)
print_if(hashed_name[0], isdebug)
print_if("\nblurbs:", isdebug)
print_if(hashed_blurb.shape, isdebug)
print_if(hashed_blurb[0], isdebug)

names:
(148574, 256)
  (0, 3)	-0.2982749931359468
  (0, 15)	-0.17896499588156806
  (0, 44)	0.05965499862718936
  (0, 45)	-0.05965499862718936
  (0, 47)	-0.05965499862718936
  (0, 80)	0.05965499862718936
  (0, 84)	-0.11930999725437871
  (0, 92)	0.05965499862718936
  (0, 103)	0.2982749931359468
  (0, 115)	0.23861999450875743
  (0, 126)	-0.7158599835262722
  (0, 152)	0.2982749931359468
  (0, 161)	-0.05965499862718936
  (0, 178)	0.17896499588156806
  (0, 179)	-0.05965499862718936
  (0, 194)	0.05965499862718936
  (0, 199)	-0.05965499862718936
  (0, 224)	0.17896499588156806
  (0, 234)	-0.05965499862718936
  (0, 241)	0.05965499862718936
  (0, 253)	-0.11930999725437871

blurbs:
(148574, 256)
  (0, 3)	-0.28135375487034925
  (0, 15)	-0.16077357421162816
  (0, 45)	-0.16077357421162816
  (0, 47)	-0.04019339355290704
  (0, 80)	0.04019339355290704
  (0, 84)	-0.12058018065872111
  (0, 92)	0.04019339355290704
  (0, 102)	-0.08038678710581408
  (0, 103)	0.36174054197616334
  (0, 115)	0.16077357421162816

#### Create dataframe from scipy sparse matrix

In [0]:
df_hashed_name = pd.DataFrame(hashed_name.todense())
df_hashed_name = df_hashed_name.rename(columns=lambda x: 'name_' + str(x))
df_hashed_blurb = pd.DataFrame(hashed_blurb.todense())
df_hashed_blurb = df_hashed_blurb.rename(columns=lambda x: "blurb_" + str(x))

print_if("names:", isdebug)
print_if(df_hashed_name.head(), isdebug)
print_if("\nblurbs:", isdebug)
print_if(df_hashed_blurb.head(), isdebug)

names:
   name_0  name_1  name_2    name_3  name_4  name_5  name_6  name_7  name_8  \
0     0.0     0.0     0.0 -0.298275     0.0     0.0     0.0     0.0     0.0   
1     0.0     0.0     0.0 -0.285831     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0     0.0 -0.257248     0.0     0.0     0.0     0.0     0.0   
3     0.0     0.0     0.0 -0.328165     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0     0.0 -0.303170     0.0     0.0     0.0     0.0     0.0   

   name_9    ...     name_246  name_247  name_248  name_249  name_250  \
0     0.0    ...          0.0       0.0       0.0       0.0       0.0   
1     0.0    ...          0.0       0.0       0.0       0.0       0.0   
2     0.0    ...          0.0       0.0       0.0       0.0       0.0   
3     0.0    ...          0.0       0.0       0.0       0.0       0.0   
4     0.0    ...          0.0       0.0       0.0       0.0       0.0   

   name_251  name_252  name_253  name_254  name_255  
0       0.0       0.0 -0.

#### Concat original matrix and hashed name matrix

In [0]:
df = df.join(df_hashed_name)
df = df.join(df_hashed_blurb)

In [0]:
print_if(df.shape, isdebug)

(148574, 736)


### Dropping unused columns, they are onehot encoded or vectorized

In [0]:
drop_columns = ['ID', 'category', 'main_category', 'state', 'country', 'currency', 'name', 'blurb']

df = df.drop(drop_columns, axis=1)

In [0]:
print_if(df.head(), isdebug)

   usd_goal_real  usd_pledged_real  backers  category_0  category_1  \
0        3.50000           3.65100       80         0.0         0.0   
1        0.21486           0.33273       31         0.0         0.0   
2        2.22370           2.66845       74         0.0         0.0   
3        3.70000           4.05120       44         0.0         0.0   
4        6.00000           6.06760       30         0.0         0.0   

   category_2  category_3  category_4  category_5  category_6    ...      \
0         0.0         0.0         0.0         0.0         0.0    ...       
1         0.0         0.0         0.0         0.0         0.0    ...       
2         0.0         0.0         0.0         0.0         0.0    ...       
3         0.0         0.0         0.0         0.0         0.0    ...       
4         0.0         0.0         0.0         0.0         0.0    ...       

   blurb_246  blurb_247  blurb_248  blurb_249  blurb_250  blurb_251  \
0        0.0        0.0        0.0        0.0