In [195]:
import gather_data as gd
import api_call as ap
import clean_data as cd
import pandas as pd
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pyspark
from pyspark.sql.types import *
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_similarity_score
import sklearn.metrics.pairwise as smp
from time import time

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
df = pd.read_csv('ratings_matrix.csv')

In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10520 entries, 0 to 10519
Data columns (total 4 columns):
Unnamed: 0    10520 non-null int64
artist_id     10520 non-null int64
venue_id      10520 non-null int64
event_id      10520 non-null int64
dtypes: int64(4)
memory usage: 328.8 KB


In [128]:
df = df.drop(labels = 'Unnamed: 0', axis=1)

In [129]:
df.head(15)

Unnamed: 0,artist_id,venue_id,event_id
0,7,2016,1
1,7,3918,1
2,12,1543,1
3,12,2765,1
4,12,5263,1
5,14,1744,1
6,14,2016,1
7,14,2898,1
8,14,3225,1
9,14,4399,1


In [130]:
len(np.unique(df['artist_id']))

3228

In [131]:
np.max(df['venue_id'])

5727

In [133]:
def convert_data(df):
    index_list = list(np.unique(df['artist_id']))
    ids = []
    for value in df['artist_id']:
        #print(value)
        ids.append(index_list.index(value))
    df['artist_id'] = ids

In [134]:
data = convert_data(df)

In [29]:
index_list = list(np.unique(df['artist_id']))
ids = []
for value in df['artist_id']:
    #print(value)
    ids.append(index_list.index(value))
df['artist_id'] = ids

In [31]:
event_mat = make_event_matt(df)

In [37]:
event_mat

<3228x5728 sparse matrix of type '<class 'numpy.float64'>'
	with 10520 stored elements in LInked List format>

In [38]:
simileratyMat = cosine_similarity(event_mat.T)

In [39]:
simileratyMat.shape

(5728, 5728)

In [42]:
event_mat.shape

(3228, 5728)

In [44]:
np.argsort(simileratyMat, axis = 1)

array([[2863, 3822, 3821, ..., 4860, 2586,    0],
       [   0, 3823, 3822, ..., 1912, 5727,    1],
       [   0, 3823, 3822, ..., 1602, 1609, 1603],
       ...,
       [   0, 3817, 3816, ..., 5724, 5725, 5717],
       [   0, 3823, 3822, ..., 5648, 3318, 5726],
       [   0, 3822, 3821, ..., 4470, 1286, 5727]])

In [119]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time


class VenueVenueRecommender(object):

    def __init__(self, neighborhood_size):
        self.neighborhood_size = neighborhood_size

    def fit(self, data):
        self.data = data
        self.ratings_mat = self.get_ratings_mat()
        self.n_artist = self.ratings_mat.shape[0]
        self.n_venues = self.ratings_mat.shape[1]
        self.item_sim_mat = cosine_similarity(self.ratings_mat.T)
        self._set_neighborhoods()

    def _set_neighborhoods(self):
        least_to_most_sim_indexes = np.argsort(self.item_sim_mat, 1)
        self.neighborhoods = least_to_most_sim_indexes[:, -self.neighborhood_size:]

    def pred_one_user(self, artist_id, report_run_time=False):
        start_time = time()
        venues_played_by_artist = self.ratings_mat[artist_id].nonzero()[1]
        # Just initializing so we have somewhere to put rating preds
        result = np.zeros(self.n_venues)
        print(artist_id)
        for venue_to_rate in range(self.n_venues):
            relevant_venues = np.intersect1d(self.neighborhoods[venue_to_rate],
                                            venues_played_by_artist,
                                            assume_unique=True)  # assume_unique speeds up intersection op

            '''result[venue_to_rate] = self.ratings_mat[artist_id, relevant_venues] * \
                self.item_sim_mat[venue_to_rate, relevant_venues] / \
                self.item_sim_mat[venue_to_rate, relevant_venues].sum()'''
            #BECEAUSE THE RATINGS MAT IS NOT RATING ITS JUST WETHER OR NOT 
            #A ARTIST HAS PLAYED AT THE VENUE THE ABOVE CODE WILL NOT WORK
            # INSTED 

            
            
            if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        cleaned_out = np.nan_to_num(result)
        return cleaned_out

    def pred_all_users(self, report_run_time=False):
        start_time = time()
        all_ratings = [
            self.pred_one_user(artist_id) for artist_id in range(self.n_artist)]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return np.array(all_ratings)

    def top_n_recs(self, user_id, n):
        pred_ratings = self.pred_one_user(user_id)
        item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                        if item not in items_rated_by_this_user]
        return unrated_items_by_pred_rating[-n:]


    def get_ratings_mat(self):

        highest_artist_id = np.max(self.data['artist_id'])
        highest_venue_id = np.max(self.data['venue_id'])
        ratings_as_mat = sparse.lil_matrix((highest_artist_id+1, highest_venue_id+1))
        for _, row in self.data.iterrows():
                # subtract 1 from id's due to match 0 indexing
            ratings_as_mat[row.artist_id, row.venue_id] = row.event_id
        return ratings_as_mat


'''if __name__ == "__main__":
    user_id = 1 
    num_top_recs = 20
    ns = 75 # neighborhood size
    ratings_data_contents, ratings_mat = get_ratings_mat()
    my_rec_engine = ItemItemRecommender(neighborhood_size=ns)
    my_rec_engine.fit(ratings_mat)
    user_preds = my_rec_engine.pred_one_user(user_id=user_id, report_run_time=True)
    # Show predicted ratings for user #1 on first 100 items
    print("\nThe first 100 ratings for user {0}:".format(user_id))
    print(user_preds[:100].round(1))
    print("\nThe top {0} recommended movies are:".format(num_top_recs))
    print(my_rec_engine.top_n_recs(user_id, num_top_recs))
'''

'if __name__ == "__main__":\n    user_id = 1 \n    num_top_recs = 20\n    ns = 75 # neighborhood size\n    ratings_data_contents, ratings_mat = get_ratings_mat()\n    my_rec_engine = ItemItemRecommender(neighborhood_size=ns)\n    my_rec_engine.fit(ratings_mat)\n    user_preds = my_rec_engine.pred_one_user(user_id=user_id, report_run_time=True)\n    # Show predicted ratings for user #1 on first 100 items\n    print("\nThe first 100 ratings for user {0}:".format(user_id))\n    print(user_preds[:100].round(1))\n    print("\nThe top {0} recommended movies are:".format(num_top_recs))\n    print(my_rec_engine.top_n_recs(user_id, num_top_recs))\n'

In [120]:
recomender = VenueVenueRecommender(10)

In [121]:
recomender.fit(df)

In [205]:
def get_ratings_mat(data):

        highest_artist_id = np.max(data['artist_id'])
        highest_venue_id = np.max(data['venue_id'])
        ratings_as_mat = np.zeros((highest_artist_id+1, highest_venue_id+1))
        for i, row in data.iterrows():
                # subtract 1 from id's due to match 0 indexing
            
            ratings_as_mat[row.artist_id, row.venue_id] = row.event_id
        return ratings_as_mat


In [204]:
 highest_artist_id = np.max(data['artist_id'])
highest_venue_id = np.max(data['venue_id'])
ratings_as_mat = np.NaN((highest_artist_id+1, highest_venue_id+1))

5727

In [199]:
a = get_ratings_mat(df)

TypeError: 'float' object is not callable

In [178]:
np.argsort(a, 1)

array([[   0, 3822, 3821, ..., 5727, 3918, 2016],
       [   0, 3822, 3821, ..., 5263, 2765, 1543],
       [   0, 3823, 3822, ..., 4399, 2016, 1744],
       ...,
       [   0, 3822, 3821, ..., 1912, 5727, 4480],
       [   0, 3823, 3822, ...,  230,  229,  228],
       [   0, 3822, 3821, ..., 2104, 5149, 2657]])

In [212]:
a = np.array([[None, 1 , None],[None,1,1]])
b = np.array([[0,1,0],[0,0,1]])

In [214]:
printcosine_similarity(b)

NameError: name 'printcosine_similarity' is not defined

In [189]:
np.unique(sim_mat[1])

array([0., 1.])

In [175]:
sim_mat.shape

(5728, 5728)

In [197]:
a

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])