In [6]:
import re
import nltk
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords


In [7]:
df = pd.read_csv('data/monthly-events/april.csv', index_col=0)
df.head()

Unnamed: 0,Cost,Date-range,Time,Title,Venue,event_text
0,Cost: Free Entrance,14 February 2018 to 14 April 2018,Time: Mon to Fri 9am to 6pm | Sat 10am to 2pm,Extra Ordinary: Southern Guild,"Guild, Shop 5B, Silo 5, V&A Waterfront, Cape Town",Southern Guild presents a multidisciplinary gr...
1,,29 March 2018 to 4 April 2018,,KKNK Afrikaans Arts Festival 2018,"Oudtshoorn, Western Cape",\nAnnual Afrikaans arts festival which takes p...
2,R89 – R175,31 October 2017 to 27 April 2018,Time: 6pm | Sat 5pm | Sun 4pm,The Galileo Open Air Cinema,Various,The Galileo Open Air Cinema is back again with...
3,,2 to 28 April 2018,,"Aunty Merle, The Musical",,The Baxter Theatre presents local comedian Mar...
4,R155 – R285 (bookings via phone),11 December 2017 to 30 April 2018,12pm onwards,Garden Picnics at The Cellars-Hohenort,"The Cellars-Hohenort Hotel, 93 Brommersvlei Rd...",The Cellars-Hohenort 5-star hotel offers a sel...


In [None]:
df = df[['Title','event_text']]
df.head()

In [6]:
def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.
    :param text: String to tokenize
    :return: List with stemmed tokens
    """
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        stem = stemmer.stem(token)
        if stem != "":
            stems.append(stem)
    return stems



# TODO: Optimize the ngram_range, min_df and max_df
vectorizer = CountVectorizer(ngram_range=(1,3), 
                             tokenizer=tokenize,
                             min_df=2, 
                             max_df=0.3, 
                             binary=True, 
                             stop_words='english')

Vectorize the text content and the titles

In [5]:
X_title = vectorizer.fit_transform(df['Title'])
X_title = X_title.toarray()

X_text = vectorizer.fit_transform(df['event_text'])
X_text = X_text.toarray()

Combine vectorized text and title into one feature set

In [18]:
X = np.concatenate([X_title, X_text], axis=1)
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ..., 
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Reduce dimensionality of data

In [21]:
from sklearn.decomposition import PCA, TruncatedSVD
reducer = PCA(n_components=2)
reducer.fit(X)
X = reducer.transform(X)
X

array([[-0.87930322,  1.93469391],
       [-0.70793441, -0.62167304],
       [-0.62483443, -2.20616772],
       [-0.90429563,  1.96130541],
       [-0.66186396, -0.74631296],
       [-0.88482898, -1.35512164],
       [-0.81266272,  2.457141  ],
       [-0.77426264, -0.47531757],
       [-1.08658636, -0.70312185],
       [-1.06707232,  2.7360914 ],
       [-0.92754057, -0.19270365],
       [-0.7729643 ,  1.0123762 ],
       [-0.85498028, -0.07598827],
       [-0.9406189 , -2.04858457],
       [-0.80144021, -0.33460709],
       [-0.902709  , -0.791194  ],
       [-0.84043056, -0.69684992],
       [-0.76096219, -0.28728765],
       [-0.55207033,  1.23265602],
       [ 8.68235163,  0.05115636],
       [ 8.63293605,  0.01795533],
       [-0.79760058,  0.1645149 ],
       [-0.76032609, -1.03296058]])

In [22]:
df_vec = pd.DataFrame(None)
df_vec['Title'] = df['Title']
df_vec['numbers'] = range(0, len(df_vec))
df_vec['coords'] = df_vec['numbers'].apply(lambda index: X[index, :])
del df_vec['numbers']

for i in range(1, 5):
    df_vec[f'recommended_{i}'] = ""
    
df_vec.head()

Unnamed: 0,Title,coords,recommended_1,recommended_2,recommended_3,recommended_4
0,Extra Ordinary: Southern Guild,"[-0.879303223184, 1.93469390587]",,,,
1,KKNK Afrikaans Arts Festival 2018,"[-0.707934410274, -0.621673040255]",,,,
2,The Galileo Open Air Cinema,"[-0.624834428277, -2.2061677248]",,,,
3,"Aunty Merle, The Musical","[-0.904295630424, 1.96130540895]",,,,
4,Garden Picnics at The Cellars-Hohenort,"[-0.661863962409, -0.746312963898]",,,,


In [26]:
# TODO: Check out this shit
def cosine_similarity(x, y):
    def square_rooted(v):
        return round(np.sqrt(sum([a * a for a in v])), 3)
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)
    return round(numerator/float(denominator), 3)


similarity_dict = {}
for idx, row in df_vec.iterrows():
    scores = {}
    for idx2, row2 in df_vec.iterrows():
        if idx != idx2:
            scores[idx2] = cosine_similarity(row['coords'], row2['coords'])
    similarity_dict[idx] = scores

In [28]:
for idx, row in df_vec.iterrows():
    sim_scores = similarity_dict[idx]
    
    for i in range(1, 5):
        
        most_similar_idx = max(sim_scores, key=sim_scores.get)
        most_similar_score = sim_scores[most_similar_idx]
        
        del sim_scores[most_similar_idx]
        
        title = df_vec.loc[most_similar_idx]['Title']
        
        title_plus_score = f'{title} {most_similar_score}'
        
        df_vec.set_value(idx, f'recommended_{i}', title_plus_score)
        
df_vec
        

Unnamed: 0,Title,coords,recommended_1,recommended_2,recommended_3,recommended_4
0,Extra Ordinary: Southern Guild,"[-0.879303223184, 1.93469390587]",Suidoosterfees: Ikhaya Lam 1.0,The Road to Mecca 0.999,El Anatsui at SA National Gallery 0.994,Puppet Guy – Conrad Koch 0.975
1,KKNK Afrikaans Arts Festival 2018,"[-0.707934410274, -0.621673040255]",La Motte Concert Series: James Grace Journey t...,WWE LIVE Superstars 1.0,Garden Picnics at The Cellars-Hohenort 0.992,West Side Story 0.99
2,The Galileo Open Air Cinema,"[-0.624834428277, -2.2061677248]",GoodLuck Live at Vergenoegd 0.988,Outdoor Movie Nights at Spier 0.955,Tales of Little Grey Rabbit 0.936,Garden Picnics at The Cellars-Hohenort 0.9
3,"Aunty Merle, The Musical","[-0.904295630424, 1.96130540895]",Extra Ordinary: Southern Guild 1.0,Suidoosterfees: Ikhaya Lam 1.0,The Road to Mecca 0.998,El Anatsui at SA National Gallery 0.994
4,Garden Picnics at The Cellars-Hohenort,"[-0.661863962409, -0.746312963898]",Tales of Little Grey Rabbit 0.995,KKNK Afrikaans Arts Festival 2018 0.992,La Motte Concert Series: James Grace Journey t...,Outdoor Movie Nights at Spier 0.989
5,Outdoor Movie Nights at Spier,"[-0.884828977373, -1.35512164191]",Tales of Little Grey Rabbit 0.998,Garden Picnics at The Cellars-Hohenort 0.989,GoodLuck Live at Vergenoegd 0.989,KKNK Afrikaans Arts Festival 2018 0.964
6,El Anatsui at SA National Gallery,"[-0.812662718126, 2.45714099748]",The Road to Mecca 0.999,Suidoosterfees: Ikhaya Lam 0.995,Extra Ordinary: Southern Guild 0.994,"Aunty Merle, The Musical 0.994"
7,Trio Esperanza: CT Concert Series,"[-0.774262637132, -0.475317571192]",West Side Story 0.999,WWE LIVE Superstars 0.989,Carlos Santana Divination Tour 0.988,La Motte Concert Series: James Grace Journey t...
8,West Side Story,"[-1.08658635717, -0.703121847702]",Trio Esperanza: CT Concert Series 0.999,WWE LIVE Superstars 0.993,KKNK Afrikaans Arts Festival 2018 0.99,La Motte Concert Series: James Grace Journey t...
9,The Road to Mecca,"[-1.06707232257, 2.73609139594]",Extra Ordinary: Southern Guild 0.999,El Anatsui at SA National Gallery 0.999,"Aunty Merle, The Musical 0.998",Suidoosterfees: Ikhaya Lam 0.998


In [23]:
def euclidean_distance(x, y):
    return np.linalg.norm(x-y)