In [1]:
import gather_data as gd
import api_call as ap
import clean_data as cd
import pandas as pd
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import pyspark
from pyspark.sql.types import *
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [1]:
def make_ratings_matrix(data):
    ratings_matrix = data[['artist_id', 'venue_id', 'event_id']]
    groupby = ratings_matrix.groupby(by = ['artist_id', 'venue_id'], as_index=False)
    ratings_matrix = pd.DataFrame(groupby.count())
    ratings_matrix.rename(columns = {'event_id':'num_shows'}, inplace = True)
    return ratings_matrix

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

(<pyspark.sql.session.SparkSession at 0x115a5b0b8>,
 <SparkContext master=local[*] appName=pyspark-shell>)

In [3]:
df = pd.read_csv('ratings_matrix.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10520 entries, 0 to 10519
Data columns (total 4 columns):
Unnamed: 0    10520 non-null int64
artist_id     10520 non-null int64
venue_id      10520 non-null int64
event_id      10520 non-null int64
dtypes: int64(4)
memory usage: 328.8 KB


In [5]:
df = df.drop(labels = 'Unnamed: 0', axis=1)

In [18]:
df.event_id = 1

In [19]:
(np.unique(df.artist_id))

array([       7,       12,       14, ..., 15342483, 15344735, 15344889])

In [20]:
sp_df = spark.createDataFrame(df)

In [21]:
sp_df

DataFrame[artist_id: bigint, venue_id: bigint, event_id: bigint]

In [22]:
sp_df.show()

+---------+--------+--------+
|artist_id|venue_id|event_id|
+---------+--------+--------+
|        7|    2016|       1|
|        7|    3918|       1|
|       12|    1543|       1|
|       12|    2765|       1|
|       12|    5263|       1|
|       14|    1744|       1|
|       14|    2016|       1|
|       14|    2898|       1|
|       14|    3225|       1|
|       14|    4399|       1|
|       18|     999|       1|
|       18|    1537|       1|
|       18|    2799|       1|
|       18|    4080|       1|
|       21|    1632|       1|
|       28|     999|       1|
|       28|    4786|       1|
|       35|    1096|       1|
|       35|    2918|       1|
|       35|    4190|       1|
+---------+--------+--------+
only showing top 20 rows



In [83]:
train, test = sp_df.randomSplit([0.8, 0.2], seed=42)

In [24]:
train.count(), test.count()

(8470, 2050)

In [95]:
als_model = ALS(userCol='venue_id',
                itemCol='artist_id',
                ratingCol='event_id',
                nonnegative=True,
                regParam=0.1,
                rank=5,
                implicitPrefs=True,
                alpha = 40)

In [96]:
recomender = als_model.fit(train)

In [97]:
data = [(876, 14)]
columns = ('venue_id', 'artist_id')
one_row_spark_df = spark.createDataFrame(data, columns)

In [98]:
one_row_spark_df.show()

+--------+---------+
|venue_id|artist_id|
+--------+---------+
|     876|       14|
+--------+---------+



In [102]:
recomender.transform(one_row_spark_df).show()

+--------+---------+----------+
|venue_id|artist_id|prediction|
+--------+---------+----------+
|     876|       14|       0.0|
+--------+---------+----------+



In [103]:
train_pridictions = recomender.transform(train)

In [104]:
train_pridictions.show()


+---------+--------+--------+-------------+
|artist_id|venue_id|event_id|   prediction|
+---------+--------+--------+-------------+
|     2142|    3695|       1|   0.42386827|
|    28024|     835|       1|    0.4738261|
|    35361|     969|       1|     0.661041|
|    35361|    5657|       1|   0.21813901|
|    35361|    4690|       1|   0.36423022|
|    35361|    4687|       1|    0.6343369|
|    35361|    1919|       1|   0.16317807|
|    35361|    2010|       1|    1.0188153|
|   102798|    3884|       1|  0.025709275|
|   102798|    2114|       1|   0.02037028|
|   237568|    1166|       1|  0.028739912|
|   237568|    2459|       1|  0.121663496|
|   289337|    5276|       1|1.5916259E-14|
|   299253|     471|       1|    0.2259088|
|   299253|    1875|       1|    0.9732203|
|   299253|    4315|       1|   0.13213658|
|   299253|     326|       1|   0.61634475|
|   299253|    5542|       1|   0.13213658|
|   299253|    4943|       1|    0.8409917|
|   299253|    1965|       1|   

In [105]:
test_predictions = recomender.transform(test)

In [114]:
test_predictions.dropna().sort('prediction',ascending=False).show()

+---------+--------+--------+----------+
|artist_id|venue_id|event_id|prediction|
+---------+--------+--------+----------+
|  8036896|    2887|       1|  1.116618|
|  1501925|    4535|       1| 1.1126313|
|   193192|    2887|       1| 1.0886555|
|   299253|    1252|       1| 1.0808958|
|  1466945|    1252|       1| 1.0141679|
|  1937813|    2010|       1|   0.92394|
|    42188|    2887|       1| 0.9029753|
| 14942933|    4462|       1| 0.9026153|
|  1016236|    5625|       1|0.89317393|
|      710|    1252|       1|0.88779026|
|    57187|     835|       1| 0.8877058|
|    34503|     835|       1|0.88255304|
|   150550|    2963|       1|0.88184524|
|  7642703|    4276|       1| 0.8805937|
| 11844691|    5625|       1|0.87772936|
|  4888271|    1252|       1| 0.8665171|
|  1938355|    4399|       1|0.86164474|
| 11910894|    1875|       1| 0.8582125|
|   206590|    1875|       1| 0.8551549|
|    32251|    3887|       1| 0.8504138|
+---------+--------+--------+----------+
only showing top

In [40]:
predictions_df = predictions.toPandas()
train_df = train.toPandas()

In [45]:
predictions_df

Unnamed: 0,artist_id,venue_id,event_id,prediction
0,35361,969,1,0.935312
1,35361,5657,1,0.899114
2,35361,4687,1,0.882556
3,102798,2114,1,0.439794
4,299253,471,1,0.879487
5,299253,4315,1,1.163282
6,299253,4943,1,0.877575
7,299253,835,1,0.871067
8,10723218,4864,1,1.163282
9,10723218,477,1,1.163282


In [42]:
predictions_df = predictions.toPandas().fillna(train_df['event_id'].mean())

In [43]:
predictions_df['squared_error'] = (predictions_df['rating'] - predictions_df['prediction'])**2

KeyError: 'rating'