In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
def read_info(fn):
    """Load a newline-delimited JSON dump into a DataFrame.

    Every non-empty line of ``fn`` is treated as one JSON object. The lines
    are wrapped into a single JSON array and handed to ``pd.read_json``.

    Parameters
    ----------
    fn : str
        Path of the dump file (read as UTF-8 text).

    Returns
    -------
    pandas.DataFrame
        One row per JSON object, with every column name lower-cased.
    """
    import io  # local import: io.open / io.StringIO behave the same on Python 2 and 3

    # Read as text, not bytes: the ','.join below needs str on Python 3.
    with io.open(fn, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f]

    # Skip blank lines (they would produce an invalid ",," in the array) and
    # bare `null` records, which cannot become a row. A `null` entry is the
    # likely source of "'NoneType' object has no attribute 'keys'".
    records = [line for line in lines if line and line != 'null']

    # each remaining element is an individual JSON object;
    # wrap them so the whole dump parses as one JSON array
    data_json_str = "[" + ','.join(records) + "]"

    # now, load it into pandas (file-like input avoids the deprecated
    # "literal JSON string" code path in recent pandas)
    data_df = pd.read_json(io.StringIO(data_json_str))
    data_df.columns = [x.lower() for x in data_df.columns]

    return data_df

In [18]:
# Path of the route dump that read_info() loads (relative to the notebook's working directory).
DATA_DUMP = './data/route_info.txt'

In [19]:
# Read the JSON created by scrape.py into a DataFrame
# (read_info lower-cases the column names).
d = read_info(DATA_DUMP)

AttributeError: 'NoneType' object has no attribute 'keys'

In [6]:
# randomize the order of routes
# not ideal but okay
# a fixed random_state keeps the shuffled order identical on every run
d = d.sample(frac=1, random_state=0)

In [7]:
# print route name column out to file
# route name RNN can train on this
# d['name'].to_csv('./data/route_name.txt', encoding = 'utf8', index = False)

In [8]:
# d['staraverage'].hist() # significant heaping
# d['starptile'] = pd.Series(d['staraverage'].qcut(), index = d.index)
# q = d.quantile(d['staraverage'])

In [9]:
def scale01(df):
    """Min-max scale values to the range [0, 1] without modifying the input.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Numeric values to rescale. A DataFrame is scaled column by column,
        because ``min``/``max`` are taken per column.

    Returns
    -------
    Same type as ``df``
        A new object holding ``(df - min) / (max - min)``. When max equals
        min the division is 0/0, so pandas yields NaN for those values.
    """
    lowest = df.min()
    span = df.max() - lowest
    # Build a new object instead of using -= and /=, which would overwrite
    # the caller's data (e.g. the raw column the Series was taken from).
    return (df - lowest) / span

# Scale a copy so the raw 'staraverage' column is left untouched no matter
# how scale01 treats its argument.
d['starpct'] = scale01(d['staraverage'].copy())
# pd.concat([ d['staraverage'], scale01(d['staraverage']) ], axis = 1) # pct mapping

In [10]:
# distinct values of the 'rateyds' column, in sorted order
rating = np.sort(d['rateyds'].unique(), kind='heapsort')  # few unique values
# print(rating)

In [17]:
# d.sort_values(['rateza'], inplace=True)
# d[['href','rateyds','ratebritish','rateuiaa','rateewbanks','rateza','ratefrench']].head()
# the + - and protection rating are tricky
# TODO parse this out

In [12]:
# d.plot(kind='scatter', x='staraverage', y='rateyds')

In [13]:
# vote_min = 20
# d['starvotes'][d['starvotes'] >= vote_min].hist()
# looks exponential

In [14]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

In [15]:
# from __future__ import print_function
from time import time

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [None]:
# hyperparameters
# presumably the number of topics to fit — not referenced in the cells shown here; TODO confirm
n_topics = 8
# NOTE(review): not referenced in the cells shown here — confirm it is still needed
n_samples = 2000
# vocabulary cap: passed as max_features to the CountVectorizer used for LDA
n_features = 1000
# presumably the number of terms to list per topic with print_top_words — TODO confirm
n_top_words = 12

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable with one weight
        array per topic (each array must support ``argsort``).
    feature_names : sequence of str
        Term for each column index of the weight arrays.
    n_top_words : int
        How many terms to list per topic.

    Prints a "Topic #<idx>:" header and a space-separated term line per
    topic, followed by one blank line after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it to get the largest weights first
        best_first = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in best_first]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_terms))
    print()

In [26]:
print("Loading dataset...")
t0 = time()
d = read_info(DATA_DUMP)
TXT_COLUMN_TO_ANALYZE = 'description'
txt = d[TXT_COLUMN_TO_ANALYZE]
# routes with no text in the chosen column are left out of the corpus
data_samples = txt.dropna().tolist()

elapsed = time() - t0
print("done in %0.3fs." % elapsed)

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    # max_features=n_features,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95,
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=1,
    max_df=0.95,
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Loading dataset...
done in 0.026s.
Extracting tf-idf features for NMF...
done in 0.147s.
Extracting tf features for LDA...
done in 0.036s.


In [28]:
# check what kind of object the tf-idf vectorizer's fit_transform returned
type(tfidf)

scipy.sparse.csr.csr_matrix

In [27]:
tfidf.shape
# rows: 404 data samples
# columns: 13447 vocabulary terms (unigrams + bigrams, ngram_range = (1,2))

(404, 13447)

In [58]:
# Import the subpackage explicitly: older SciPy releases do not load
# scipy.sparse on a bare `import scipy`.
import scipy.sparse
# max_features=n_features, ???? How many features are there??
# unigram-only tf-idf features
tfidf_vectorizer_1x1 = TfidfVectorizer(max_df=0.5, min_df=1,
                                       stop_words='english', ngram_range = (1,1))
tfidf_1x1 = tfidf_vectorizer_1x1.fit_transform(data_samples)

# bigram-only tf-idf features
tfidf_vectorizer_2x2 = TfidfVectorizer(max_df=0.5, min_df=1,
                                       stop_words='english', ngram_range = (2,2))
tfidf_2x2 = tfidf_vectorizer_2x2.fit_transform(data_samples)

tfidf = tfidf_2x2 # / 10.
print(tfidf.shape)
# Mean, over vocabulary terms, of each term's total weight across all documents.
# .sum(axis=0) does the column sums in one vectorized call instead of adding
# the sparse rows one at a time with the builtin sum().
tfidf.sum(axis=0).mean()

(404, 11173)


0.20737429015989606

In [60]:
# unigram-only raw term counts
count_vectorizer_1x1 = CountVectorizer(max_df=0.5, min_df=1, stop_words='english', ngram_range = (1,1))
count_1x1 = count_vectorizer_1x1.fit_transform(data_samples)

# bigram-only raw term counts
count_vectorizer_2x2 = CountVectorizer(max_df=0.5, min_df=1, stop_words='english', ngram_range = (2,2))
count_2x2 = count_vectorizer_2x2.fit_transform(data_samples)

count = count_2x2
print(count.shape)
# Mean, over vocabulary terms, of each term's total count across all documents.
# .sum(axis=0) does the column sums in one vectorized call instead of adding
# the sparse rows one at a time with the builtin sum(); it also removes this
# cell's reliance on `scipy` having been imported by an earlier cell.
count.sum(axis=0).mean()

(404, 11173)


1.4224469703750111

In [65]:
# pick a single route by position (row 3 of the frame)
route = d.iloc[3]
# show its area hierarchy, leaving out the first entry
route.area_hierarchy[1:]

[u'/v/california/105708959',
 u'/v/tahquitz--suicide-rocks/105788020',
 u'/v/the-others/109014094',
 u'/v/cloud-ripper-towers/109909589',
 u'/v/the-real-cloud-ripper-tower/110540219']