# Overhead

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the data files
# read further down (under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import random

# Raw input tables, read from the mounted Drive folder.
ratings_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ratings.csv")
users_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/users.csv")
movies_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/movies.tsv", sep="\t")

# Movie x user rating matrix: one row per movieID, one column per userID.
# (movie, user) pairs without a rating are filled with 0.
user_ratings_df = (
    ratings_df
    .pivot(index='movieID', columns='userID', values='rating')
    .fillna(0)
)

# Helper funcs for simulation

In [None]:
def get_movieID_from_index(index):
  """Return the 'movieID' value(s) found at row position(s) `index` of movies_df.

  `index` is positional (it is passed to `.iloc`), so it refers to the row
  order of movies_df itself.
  NOTE(review): positions produced by a model fitted on another frame only
  line up with movies_df if both share the same row order - confirm at the caller.
  """
  selected = movies_df.iloc[index]
  return selected['movieID']

In [None]:
def lookup_movie(movieID):
  """Return the 'name' of the first movies_df row whose 'movieID' equals `movieID`.

  Raises IndexError when no row matches.
  """
  is_match = movies_df['movieID'] == movieID
  matching_names = list(movies_df[is_match]['name'])
  return matching_names[0]

# KNN Model Building

In [None]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search using cosine distance; each query
# returns 10 neighbours by default. The model is fitted on the rows of
# user_ratings_df (one row per movie).
knn = NearestNeighbors(
    n_neighbors=10,
    metric='cosine',
    algorithm='brute',
)
knn.fit(user_ratings_df)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=10)

In [None]:
def knn_calculator(movieIDs):
  """Return the KNN neighbours of the given movies as one recommendation table.

  Args:
    movieIDs: iterable of movieIDs to use as queries. Every ID must be a row
      label of user_ratings_df (the frame the `knn` model was fitted on).

  Returns:
    DataFrame with columns ('movieID', 'distance'): one row per
    (query movie, neighbour) pair, excluding each query movie itself,
    sorted by ascending distance (nearest neighbour first). A movie that is
    a neighbour of several query movies appears once per query.

  Raises:
    KeyError: if a movieID is not a row label of user_ratings_df.
  """
  movieIDs = list(movieIDs)
  if not movieIDs:
    # nothing to query: hand back the empty table with the expected columns
    return pd.DataFrame(columns=('movieID', 'distance'))

  # one batched query instead of one kneighbors call per movie
  queries = user_ratings_df.loc[movieIDs].to_numpy()
  distances, indices = knn.kneighbors(queries)

  # `indices` are row positions in the matrix the model was fitted on
  # (user_ratings_df), so they must be translated through that frame's own
  # index; movies_df is a different table with its own row order
  fitted_movieIDs = user_ratings_df.index.to_numpy()
  neighbour_movieIDs = fitted_movieIDs[indices]

  # remove each query movie from its own neighbour list (by ID, not by position)
  query_movieIDs = np.asarray(movieIDs).reshape(-1, 1)
  keep = neighbour_movieIDs != query_movieIDs

  recommendations_df = pd.DataFrame({
      'movieID': neighbour_movieIDs[keep],
      'distance': distances[keep],
  })

  # callers take the first usable row as the best recommendation, so order
  # the combined table nearest-first (stable sort keeps ties in query order)
  return recommendations_df.sort_values('distance', kind='mergesort').reset_index(drop=True)

# Viewer Object
This object represents a movie viewer, and contains information about the movies they have watched. The object has a method to simulate watching and rating movies (this uses the recommendation engine), and stores the simulated movie views and ratings.

In [None]:
class Viewer():
  """A single movie viewer whose future views are simulated with the KNN recommender.

  Attributes:
    userID: ID of the simulated user.
    user_ratings_df: rows of the global ratings_df that belong to this user.
    sim_recommended_df: every recommendation produced so far, one row per
      recommended movie per iteration.
    ratings_df: the simulated rating of the one movie 'watched' in each iteration.
    current_iter: number of recommend() calls made so far.
  """

  _RECOMMENDED_COLUMNS = ('iterID', 'movie_inputs', 'movieID', 'distance')
  _RATING_COLUMNS = ('iterID', 'movieID', 'sim_rating')

  def __init__(self, userID):
    self.userID = userID
    #the user's real ratings; simulated views are stored separately in ratings_df
    self.user_ratings_df = ratings_df[ratings_df.userID == userID]
    #this stores the recs from the KNN algorithm. one row per movie rec.
    self.sim_recommended_df = pd.DataFrame(columns=self._RECOMMENDED_COLUMNS)
    #this one holds the generated rating, only for the 1 movie watched after the sim.
    self.ratings_df = pd.DataFrame(columns=self._RATING_COLUMNS)
    self.current_iter = 0

  @staticmethod
  def _with_rows(frame, rows):
    """Return `frame` extended by `rows` (a list of dicts keyed by frame's columns).

    Replaces DataFrame.append, which no longer exists in pandas 2.x. An empty
    frame is never passed to concat, so integer columns keep an integer dtype.
    """
    if not rows:
      return frame
    new_rows = pd.DataFrame(rows, columns=list(frame.columns))
    if frame.empty:
      return new_rows
    return pd.concat([frame, new_rows], ignore_index=True)

  def get_seen_movies(self, min_rating=0):
    """Return the movieIDs (real and simulated) rated at least `min_rating`.

    If nothing qualifies and `min_rating` is positive, the threshold is
    lowered by 1 and the lookup is retried. Recomputed on every call because
    the simulated ratings grow over time.
    """
    real = self.user_ratings_df
    movieIDs = real[real['rating'] >= min_rating]['movieID'].tolist()
    #append movies seen via simulation
    simulated = self.ratings_df
    movieIDs.extend(
        int(movieID)
        for movieID in simulated[simulated['sim_rating'] >= min_rating]['movieID']
    )
    if not movieIDs and min_rating > 0:
      #if no movies are rated high enough, check a lower rating
      movieIDs = self.get_seen_movies(min_rating - 1)
    return movieIDs

  def recommendation_cleaner(self, recommendations_df):
    """Return `recommendations_df` without the movies the viewer has already seen.

    Rows are matched on the 'movieID' column; the frame's index is a plain
    row counter and carries no movie information.
    """
    seen_movies = self.get_seen_movies()
    already_seen = recommendations_df['movieID'].isin(seen_movies)
    return recommendations_df[~already_seen]

  def recommend(self): #add n_recs arg
    """Run one simulation step: recommend, 'watch' the best movie, and rate it.

    Returns:
      True if a movie was recommended and rated, False if the recommender
      produced no unseen movie (nothing is rated in that case).
    """
    self.current_iter += 1
    #only movies rated >= 4 are used as KNN inputs
    liked_movies = self.get_seen_movies(min_rating=4)
    seen_movies = self.get_seen_movies(min_rating=0) #NOTE: this is reset for every sim
    #NOTE(review): 'movie_inputs' records all seen movies, although only
    #liked_movies are passed to the KNN - confirm which one is intended
    movie_inputs = str(seen_movies)

    recommendations_df = self.recommendation_cleaner(knn_calculator(liked_movies))
    #nearest neighbour first, so the first row is the best recommendation
    recommendations_df = recommendations_df.sort_values('distance', kind='mergesort')

    #itertuples keeps each column's dtype (iterrows would turn movieID into a float)
    new_recs = [
        {'iterID': self.current_iter,
         'movie_inputs': movie_inputs,
         'movieID': int(rec.movieID),
         'distance': float(rec.distance)}
        for rec in recommendations_df.itertuples(index=False)
    ]
    self.sim_recommended_df = self._with_rows(self.sim_recommended_df, new_recs)

    #SPECIAL CASE: rec engine couldn't recommend an unseen movie
    if not new_recs:
      print('Unable to recommend a movie. userID: {}; iterID: {}'.format(self.userID, self.current_iter))
      return False #don't eval the next simulation entry

    best_movieID = new_recs[0]['movieID']
    best_distance = new_recs[0]['distance']

    #rating starts at 5.5, minus some Exp value
    #NOTE(review): a larger distance gives a smaller Exp scale and therefore a
    #higher rating - confirm this is the intended direction
    #the scale is floored at 0 so a distance marginally above 1 cannot raise
    scale = max(0.0, 4 * (1 - best_distance))
    sim_rating = 5.5 - np.random.exponential(scale)
    sim_rating = float(max(1, min(5, sim_rating))) #force it between 1-5
    new_rating = {'iterID': self.current_iter, 'movieID': best_movieID, 'sim_rating': sim_rating}
    self.ratings_df = self._with_rows(self.ratings_df, [new_rating])
    #print('     ', lookup_movie(best_movieID), '(', round(sim_rating,1), ')')
    return True

  def rate_and_watch(self, iter):
    """Call recommend() up to `iter` times, stopping early once it returns False."""
    for _ in range(iter):
      sim_result = self.recommend()
      if not sim_result:
        print('skipping the rest of this simulation')
        break

# Viewer Simulation Obj
This object runs simulations for a single specified user, generating multiple alternative sequences of movie views.

In [None]:
class ViewerSim():
  """Runs several independent viewing simulations for one user.

  Attributes:
    userID: ID of the simulated user.
    sim_ratings_df: simulated ratings of all runs so far, with columns
      ('userID', 'simID', 'iterID', 'movieID', 'sim_rating').
  """

  _COLUMNS = ('userID', 'simID', 'iterID', 'movieID', 'sim_rating')

  def __init__(self, userID):
    self.userID = userID
    self.sim_ratings_df = pd.DataFrame(columns=self._COLUMNS)

  def simulate_watch(self, n_sim, n_movies):
    """Simulate `n_sim` alternative viewing sequences of up to `n_movies` movies each.

    Results are added to self.sim_ratings_df; simIDs run from 1 to n_sim.
    """
    #empty frames are left out of the concat so the column dtypes stay intact
    frames = [] if self.sim_ratings_df.empty else [self.sim_ratings_df]
    for simID in range(1, n_sim + 1):
      viewer = Viewer(self.userID) # you have to re-gen this obj from scratch for a 'fresh' instance, with no previous sim data
      viewer.rate_and_watch(n_movies)
      if viewer.ratings_df.empty:
        continue
      #assign() returns a labelled copy and leaves the viewer's own frame untouched
      frames.append(viewer.ratings_df.assign(userID=self.userID, simID=simID))

    if frames:
      #DataFrame.append is gone in pandas 2.x; concatenate once instead
      combined = pd.concat(frames, ignore_index=True)
      self.sim_ratings_df = combined[list(self._COLUMNS)]

In [None]:
# Smoke test on a single user: 10 independent simulations for userID 5,
# each asking for 5 recommended movies, then preview the collected ratings.
viewerSim = ViewerSim(5)
viewerSim.simulate_watch(n_sim=10, n_movies=5)
viewerSim.sim_ratings_df.head()

Unnamed: 0,userID,simID,iterID,movieID,sim_rating
0,5,1,1.0,2879.0,4.794131
1,5,1,2.0,2859.0,4.583705
2,5,1,3.0,2690.0,4.850738
3,5,1,4.0,1594.0,4.677547
4,5,1,5.0,2313.0,5.0


# Full Simulation
This object randomly selects a set of viewers and simulates movie views for each of them.

In [None]:
class NetflixSimulation():
  """Simulates movie views for a random sample of users.

  Attributes:
    n_viewers: number of users simulated so far (across all simulate() calls).
    n_sim: simulations per user, passed to ViewerSim.simulate_watch.
    n_movies: movies per simulation, passed to ViewerSim.simulate_watch.
    unused_userIDs: shuffled userIDs that have not been simulated yet.
    all_sim_ratings_df: simulated ratings of every user simulated so far.
  """

  def __init__(self, n_sim, n_movies, seed=None):
    """Set up the simulation and shuffle the pool of userIDs from users_df.

    Args:
      n_sim: number of simulations to run per user.
      n_movies: number of movies to recommend per simulation.
      seed: optional seed for the order in which users are drawn. It only
        controls this shuffle. With the default (None) the global `random`
        state is used, as before.
    """
    self.n_viewers = 0
    self.n_sim = n_sim
    self.n_movies = n_movies
    unused_userIDs = list(users_df['userID'])
    #randomize order for sampling
    if seed is None:
      random.shuffle(unused_userIDs)
    else:
      random.Random(seed).shuffle(unused_userIDs)
    self.unused_userIDs = unused_userIDs
    self.all_sim_ratings_df = pd.DataFrame(columns=('userID', 'simID', 'iterID', 'movieID', 'sim_rating'))


  def simulate(self, n_viewers):
    """Simulate the next `n_viewers` unused users and collect their ratings.

    If fewer than `n_viewers` users are left, only the remaining ones are
    simulated and a message is printed. Uses the `time` module, which must
    have been imported before this method is called.
    """
    available = len(self.unused_userIDs)
    if n_viewers > available:
      #previously this surfaced as an IndexError from pop() part-way through the run
      print('Only {} unused userIDs left; simulating {} instead of {}'.format(available, available, n_viewers))
      n_viewers = available
    self.n_viewers += n_viewers
    print(' ----------- BEGIN SIMULATION ------------ ')
    for _ in range(n_viewers):
      start_time = time.time()
      userID = self.unused_userIDs.pop(0)
      print('    - userID:', userID)
      viewerSim = ViewerSim(userID)
      viewerSim.simulate_watch(self.n_sim, self.n_movies)
      #merged after every user so an interrupted run keeps the results so far
      new_ratings = viewerSim.sim_ratings_df
      if not new_ratings.empty:
        if self.all_sim_ratings_df.empty:
          self.all_sim_ratings_df = new_ratings.reset_index(drop=True)
        else:
          #DataFrame.append is gone in pandas 2.x; ignore_index keeps row labels unique
          self.all_sim_ratings_df = pd.concat([self.all_sim_ratings_df, new_ratings], ignore_index=True)
      print('       sim time:', round(time.time()-start_time,1))
    print(' ---------- SIMULATION COMPLETE ---------- ')

  def save_data(self):
    """Placeholder - not implemented yet."""
    pass

# NOTE(review): `netSim` is assigned again in the next cell, which replaces
# this instance - this line looks redundant.
netSim = NetflixSimulation(n_sim=10, n_movies=10)

In [None]:
import time
# Run configuration: number of users sampled, simulations per user, and
# movies recommended per simulation.
n_viewers = 10
n_sim = 10
n_movies = 10

# Build a fresh simulation and time the whole run (wall-clock seconds).
netSim = NetflixSimulation(n_sim, n_movies)
start_time = time.time()
netSim.simulate(n_viewers)
# Not the last expression of the cell, so this preview is not displayed.
netSim.all_sim_ratings_df.head()

runtime = time.time() - start_time
print('Runtime:', runtime)
# Average seconds per simulated movie, assuming every simulation produced
# all n_movies recommendations.
avg_per_movie = runtime / n_viewers / n_sim / n_movies
print('Avg Per Movie:', round(avg_per_movie,1))
# Projected number of simulated movies per hour at that rate.
iters_per_hr = 60 * 60 / avg_per_movie
print('Avg Iters Per Hr:', round(iters_per_hr,1))

 ----------- BEGIN SIMULATION ------------ 
    - userID: 1536
       sim time: 31.3
    - userID: 2351
       sim time: 23.9
    - userID: 4794
       sim time: 42.1
    - userID: 3444
       sim time: 21.9
    - userID: 5676
       sim time: 28.4
    - userID: 193
       sim time: 27.6
    - userID: 447
       sim time: 100.2
    - userID: 5785
       sim time: 26.5
    - userID: 2849
       sim time: 28.6
    - userID: 5208
       sim time: 21.7
 ---------- SIMULATION COMPLETE ---------- 
Runtime: 352.2341055870056
Avg Per Movie: 0.35223410558700563
Avg Iters Per Hr: 10220.475368222857


In [None]:
import os
# List the shared project folder. The path is relative, so it is resolved
# against the current working directory.
os.listdir('drive/Shareddrives/Stat 535 Project/')

['Simulation.ipynb',
 'Moodle Project Files',
 'Contact Info.gsheet',
 'Recommendation systems.pdf',
 'Notes on Recommendation Systems.pdf',
 'Recommendation Feedback Project Proposal.docx',
 'Project Proposal - Final.docx',
 'data_mining_Recommender Systems.pdf',
 '.ipynb_checkpoints',
 'movie_data',
 'Notes Stat 535.gdoc',
 'Model Performance Evaluation Notes.gdoc',
 'Project Management Plan.docx',
 'recEngineKNN.ipynb',
 'Final Presentation.gslides']