# Ant Colony Optimization

### Ant class: the “agents” that will be traversing the graph.


### Ant colony: a colony of ants. It is responsible for moving ants to their starting node as well as prompting the ants to move to the next node on their “journey”.


### Ant graph: the graph our agents will be traversing over


### Task: the task the ants will complete ("watch" movies and rate them)

In [1]:
import ast 
import random 
import numpy as np
import pandas as pd
import copy
import gc
from tqdm import tqdm

In [2]:
#lets think about our cost function for a bit
# we want to use the users ratings to 

In [3]:
class Ant: 
    trail = []
    visited_jobs = []
    review_list = []
    current_film = 0;
    
    def __init__(self, review_list):
        self.review_list = sorted(self.review_list, key=lambda x: x[1])
    
    def trail_len(self):
        return len(self.trail)
    
    def get_trail(self):
        return self.trail
    
    def get_current(self):
        return self.current_film
    
    def has_visited (self,i):
        self.visited_jobs[i]
        
    def watch_movie (self,movieID):
        self.current_film = movieID
        self.trail.append(movieID)
        
    def clear (self):
        self.trail = []
        self.visited_jobs = []
        self.current_film = 0;
        
        
    #get user review for specific movie returns a neutral rating of 2.5 if it does not exist
    def get_task_cost(self,movieID):
        cost = [y for x, y in self.review_list if x == movieID ]
        return  2.5 if cost == [] else cost[0]    
    def get_best_film(self):
        return self.review_list.pop(0) 

In [4]:
class AntColony:
    
    ants = []
    alpha = 0
    beta = 0 
    
    def __init__(self,antID_list,filename,alpha =0.5,beta = 0.4 ):
        user_reviews =  pd.read_csv(filename)
        self.ants = []
        self.alpha = alpha
        self.beta = beta
        
        for ID in antID_list:
            rev = ast.literal_eval(user_reviews.loc[user_reviews['userId'] == ID]["ratings_list"].tolist()[0])
            new_ant = Ant(rev)
#             new_ant.watch_movie(new_ant.get_best_film()[0])
            self.ants.append(new_ant)
    
        
        
    def prob_fx (self,rating, current_weight,sum_weights):
        prob = ((current_weight ** self.alpha) * (rating ** self.beta)) / sum_weights
        return prob
            
        
    def move_all_ants (self,edge_weights):
#         print("moving all ants")
#         print(f"num ants: {len(self.ants)}")
        for ant in self.ants:
            movie_id = self.determine_next_movie(ant,edge_weights[ant.get_current ()])
            ant.watch_movie(movie_id)

    def prep_for_test (self): 
        self.alpha = 0.9 
        self.beta = 0.1
        
    def return_all_tours(self):
        t = []
        #print(f"Num ants : {len(self.ants)}")
        for ant in self.ants:
            #print(len(ant.get_trail()))
            if (len(ant.get_trail()) > 20):
                print(ant.get_trail())
            
            t.append(ant.get_trail())
        
        ## will be used for global pheremone updating 
        
        #print(len(t))
        return t
    
    #here we use the index of the ant rather than its ID 
    def get_ant_rating (self,antIndex,movieID):
        return self.ants[antIndex].get_task_cost(movieID)
    
    def clear (self):
        for ant in self.ants:
            ant.clear()
        
            
    def determine_next_movie(self,ant,edge_weights):
        current_movie = ant.get_current ()
        trail = ant.get_trail()
        temp_weights = {}
        probs = {}
        sum_cost = 0 
        
        #removes all nodes we have already visited
        for ew in edge_weights.keys():
            if (ew not in trail):
                temp_weights [ew] = edge_weights[ew]
                
        # getting sum cost
        for vertex in temp_weights.keys():
            if (vertex not in ant.get_trail()):
                rating = ant.get_task_cost(vertex)
                weight = temp_weights[vertex]
                sum_cost += ((weight ** self.alpha) * (rating ** self.beta))
            
        for vertex in temp_weights.keys():
            if (vertex not in ant.get_trail()):
                rating = ant.get_task_cost(vertex)
                weight = temp_weights[vertex]
                probs[vertex] = self.prob_fx(rating,weight,sum_cost)
            
            
#         print(f"Probs : {probs.keys()}")
        #Chooses the next film based on probabilities 
        #print (random.choices(list(probs.keys()), weights= list(probs.values()), k=1)[0])
        try:
            choice = random.choices(list(probs.keys()), weights= list(probs.values()), k=1)
        except:
            choice = []
        return 0 if choice == [] else choice[0]
    

            
                
        
                

In [5]:
class Task:
    title = ""
    genres = ""
    mID = None
    
    def __init__(self, movie,movie_details):
        self.mId = movie
        self.title = movie_details["title"].to_string(index= False)
        self.genres = movie_details["genres"].to_string().split(" ")[-1]
    
    def print_Task(self):
        print("------------------------------------------------------------------------")
        print(f"Title: {self.title}")
        print(f"genres: {self.genres}")
    

In [6]:
class Ant_Graph:
    
    graph_edge_weights = {}
    verticies   = None
    trail_max_len = 0
    num_ants = 0 
    decay = 0 
    ants = None
    best_recommendations = []
    
    def __init__(self, movie_list,task_list, num_ants,num_movies_wanted,antID_list, filename = "./generation_saves/ratings_gen_50.csv", alpha =0.6, beta= 0.4, decay = 0.05):
        self.num_ants = num_ants
        self.trail_max_len = num_movies_wanted
        self.vertices = task_list
        self.decay = decay
        self.graph_edge_weights = {}
        
        for movieID in movie_list:
            if (movieID not in self.graph_edge_weights.keys()):
                self.graph_edge_weights[movieID] = {}
            for m in movie_list:
                if (m != movieID):
                    self.graph_edge_weights[movieID][m] = 0.5
        
        #creating a start node that does not belong to a movie, all agents will start there
        self.graph_edge_weights[0] = {}
        for movieID in movie_list:
            self.graph_edge_weights[0][movieID] = 0.5
            
        self.ants = AntColony(antID_list,filename,alpha,beta)
            
    def clear(self): 
        self.graph_edge_weights = {}
        self.verticies   = None
        self.trail_max_len = 0
        self.num_ants = 0 
        self.decay = 0 
        self.ants.clear()
        self.ants = None
        self.best_recommendations = []
                    
                    
    def update_edge_weights (self):
        
        # we will chqnge the pheremones based on how much the user enjoyed that set ("trail") of movies
        # it will be calculated as   cumulative_trail_Raitings/ total_possible ratings, bound between 0 and 1 
        cumulative_ratings =  {}
        tours = self.ants.return_all_tours()
        for i,tour in enumerate(tours):
            prev = 0 
            for movie in tour:
#                 print(len(tour))
                curr = movie
                rating = self.ants.get_ant_rating(i,movie)
                if (prev not in list(cumulative_ratings.keys()) ):
                     cumulative_ratings[prev] = {}
                if (curr not in list(cumulative_ratings [prev].keys())):
                    cumulative_ratings [prev][curr] = rating
                else:
                    cumulative_ratings [prev][curr] += rating
                
                prev = curr
            
        for startnode in cumulative_ratings.keys():
            for nextnode in cumulative_ratings[startnode].keys():
                self.graph_edge_weights[startnode][nextnode] *= (1- self.decay)
                self.graph_edge_weights[startnode][nextnode] += (cumulative_ratings[startnode][nextnode] / (5* self.trail_max_len))
                
        del cumulative_ratings
    

    def compute_best_rec (self,exclude_list):
        temp_list = []
        self.ants.prep_for_test()
        while (len(temp_list)< self.trail_max_len):
            test_ant = Ant([])
            nextid = self.ants.determine_next_movie(test_ant,self.graph_edge_weights [test_ant.get_current()])
            test_ant.watch_movie(nextid)
            if (nextid not in exclude_list):
                temp_list.append(nextid)
        
        return temp_list
    
    def best_recomendations (self):
        return self.best_recommendations
    
    def one_iter (self): 
#         print("running one iteration: \n -------------------------------")
        #print(f" trail max len : {self.trail_max_len}")
        self.ants.clear()
        for step in range(0,self.trail_max_len):
            #print("test")
            self.ants.move_all_ants(self.graph_edge_weights)
            
        self.update_edge_weights()
        self.ants.clear()
        
            
                
        
        

In [7]:
from pickle import load
file = open("genresList.pkl",'rb')
genres = load(file)
file.close()

def create_genre_tracker ():
    gt = {}
    for g in genres:
        gt[g] = 0 
    return gt


sim_df =  pd.read_csv("./simillarity_matrix.csv")
all_vals = []

for index, row in sim_df.iterrows():
    sim_list = ast.literal_eval(row["simillarity_vector"])
    all_vals += sim_list

max_sim = max(all_vals)
min_Sim = min(all_vals)

def z_normalization (val):
    return (val-min_Sim)/(max_sim - min_Sim)



In [8]:
def compute_simillarity_ratings (l1,l2,movies):
    l1_dict = dict(l1)
    l2_dict = dict(l2)
    simillarity = 0 
    
    genre_tracker1 = create_genre_tracker ()
    genre_tracker2 = create_genre_tracker ()
    
    l2_keys = list(l2_dict.keys())
    for key in l1_dict.keys():
        m1_genres = movies.loc[movies['movieId'] == key  ,'genres'].values[0].split("|")
        for mg in m1_genres:
            genre_tracker1 [mg] +=1
        if key in l2_keys:
            r1 = l1_dict [key]
            r2 = l2_dict [key]
            if (r1 < r2):
                simillarity += 100*(r1/r2)
            else:
                simillarity+= 100*(r2/r1) 
                
    for key in l2_keys:
        try: 
            m2_genres = movies.loc[movies['movieId'] == key  ,'genres'].values[0].split("|")
            for mg in m2_genres:
                genre_tracker2 [mg] +=1
        except:
            continue
            
    genre_simillarity =  0 
    for g in genres:
        tempg1 = genre_tracker1 [g]
        tempg2 = genre_tracker2 [g]
        if (tempg1 > 0 or tempg2 >0):
            genre_simillarity += (1000* (min(tempg1,tempg2)/max(tempg1,tempg2))* min (tempg1,tempg2))
        
    return z_normalization(genre_simillarity + simillarity)

In [9]:
def compute_simillarity_vector (ratings_df, userlist,movies,exclude_users):
    simillarity_dict = {}
    
    for index, row in ratings_df.iterrows():
        user2list = ast.literal_eval (row["ratings_list"])
        simillarity_dict [row["userId"]] = compute_simillarity_ratings (userlist,user2list,movies)
        
    for u in exclude_users:
        if (u in simillarity_dict.keys()):
            del simillarity_dict[u]
        
    return [(k,v) for k ,v in simillarity_dict.items()]

In [10]:
def select_similar_users(num_agents,user_ratings,exclude_users, userlist,movies, randomize = False):
    if ( not randomize):
        sim_vec = compute_simillarity_vector (user_ratings, userlist,movies,exclude_users) 
    
            
        sim_vec.sort(key=lambda pair: pair[1])
        return [a[0] for a in sim_vec[-num_agents:]]
    else: 
        userids = list (range(1,len(user_ratings.index)))
        return random.sample(userids, num_agents);

In [11]:
def main_printrecs (num_agents, num_iter, num_movies,exclude_users =[], userlist = [],rating_fn = "./generation_saves/ratings_gen_50.csv", simillarility_fn = "./simillarity_matrix_normalized.csv", alpha= 0.7,beta =0.3,decay=0.05):
    # import datasets
    print ("importing datasets....")
    sim_df  = pd.read_csv(simillarility_fn)
    movies  = pd.read_csv(f"./ml-latest-small/movies.csv")
    user_ratings = pd.read_csv(rating_fn)
    
    ## if userlist is empty pick 50 random users as agents
    rand = (len(userlist) == 0)
        
    ## if not empty userlist will be the most simillar users 
    print(f"selecting {num_agents} simillar users...")
    users = select_similar_users(num_agents, user_ratings,exclude_users, userlist,movies, randomize = rand)
    
    ## determining all the movies that these users watched
    print("configuring movie formatting...")
    movie_list = []
    tasks = []
    for uID in users:
        film_list = ast.literal_eval(user_ratings.loc[user_ratings['userId'] == uID]["ratings_list"].tolist()[0])
        for film,rating in film_list:
            if (film not in movie_list):
                movie_list.append(film)
                tasks.append(Task(film,movies.loc[movies['movieId'] == film]))
    
    ## define the graph
    print("creating the Graph...")
    graph = Ant_Graph(movie_list,tasks, num_agents ,num_movies, users, filename = rating_fn , alpha=alpha , beta= beta, decay = decay)
    print(f"Staring training fo {num_iter} iterations...")
    for i in tqdm(range (0,num_iter)):
        graph.one_iter()
        
    movies_user_already_watched = [mid for mid,rate in userlist ]
    recs = graph.compute_best_rec (movies_user_already_watched)
    
    print ("based on your movie list [" + ", ".join([movies.loc[movies['movieId'] == film]["title"].to_string(index= False) for film in movie_list[-10:]  ]) + "] \n You might like the following films")
    for task in tasks:
        if (task.mId in recs):
            task.print_Task()
    
    
    ## Adding some cleanup code to clear all vars because for some reason every second call to main was failing 
    graph.clear()
    del graph, movie_list, tasks, rand, users, user_ratings,sim_df,movies
    gc.collect()
    
    return recs

## Training and testing 

### training 

Training is rather trivial, all weneed to do is make an appropriate call to main()

### testing

Testing is a bit more difficult. We do not have a list of people who have volunteered to test the system by watching the recomendations and rating them so we have to get creative. We can set a few users in our existing dataset as "test users" then we will take the first k ratings of the user as input/"user history" and the n-k as "test ratings" . We will determine how good the recomendations are using the percentage of the reccomendations that appear in the "test ratings list and by comparing the ratings for movies that are in both. 

In [12]:
# evaluation metric
def evaluate_reccomendations (recs, test_set):
    missing_recs =0 
    ratings = []
    for movieID in recs: 
        rating = next((y for x, y in test_set if x == movieID ), None)
        if (rating == None):
            missing_recs+=1
        else: 
            ratings.append(rating)
            
    percent_overlap = (len(recs) - missing_recs) / len(recs)
    
    rating_accuracy = 0.5
    if (len(ratings) > 0):
        rating_accuracy = sum(ratings) / (5*len(ratings))
    
    return percent_overlap, rating_accuracy

In [13]:
### get 50 test users 
user_ratings = pd.read_csv("./ratings_organized.csv")
chosen_users = random.sample (list(range (0, len(user_ratings.index))),50)  ## we will pass this to main to make sure our train data does not include it 

train_revs, test_revs = [], []
for i in chosen_users:
    user_rev  = ast.literal_eval(user_ratings.iloc[[i]] ["ratings_list"].tolist()[0])
    split_index = int (len(user_rev) * 0.5)
    train_revs.append(user_rev[:split_index])
    test_revs.append(user_rev[split_index:])
    


# Testing output now 


In [14]:
recs = main_printrecs (num_agents =30 , num_iter = 50, num_movies = 5 ,exclude_users = chosen_users , userlist = train_revs[0] ,rating_fn = "./generation_saves/ratings_gen_50.csv", simillarility_fn = "./simillarity_matrix_normalized.csv", alpha= 0.7,beta =0.3,decay=0.05)

importing datasets....
selecting 30 simillar users...
configuring movie formatting...
creating the Graph...
Staring training fo 50 iterations...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:14<00:00,  3.34it/s]

based on your movie list [Guess Who (2005), Prefontaine (1997), Once Upon a Time in the West (C'era una volta i..., Night at the Museum (2006), Eragon (2006), De platte jungle (1978), Space Jam (1996), Survivor (2015), Time Bandits (1981), Series([], )] 
 You might like the following films
------------------------------------------------------------------------
Title: Kill Bill: Vol. 2 (2004)
genres: Action|Drama|Thriller
------------------------------------------------------------------------
Title: Paycheck (2003)
genres: Action|Sci-Fi|Thriller
------------------------------------------------------------------------
Title: Terminator 3: Rise of the Machines (2003)
genres: Action|Adventure|Sci-Fi
------------------------------------------------------------------------
Title: My Big Fat Greek Wedding (2002)
genres: Comedy|Romance
------------------------------------------------------------------------
Title: Gremlins 2: The New Batch (1990)
genres: Comedy|Horror





In [19]:
for i,rev_list in enumerate(train_revs):
    print(f"running training and testing for user {i}\n --------------------------------------------")
    recs = main_printrecs (num_agents =30 , num_iter = 30, num_movies = 5 ,exclude_users = chosen_users , userlist = rev_list ,rating_fn = "./generation_saves/ratings_gen_50.csv", simillarility_fn = "./simillarity_matrix_normalized.csv", alpha= 0.7,beta =0.3,decay=0.05)
    precent_overlap, ra = evaluate_reccomendations(recs,test_revs[i])
    if (precent_overlap != 0 ):
        ra_list.append(ra)

running training and testing for user 0
 --------------------------------------------
importing datasets....
selecting 30 simillar users...
configuring movie formatting...
creating the Graph...
Staring training fo 30 iterations...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:09<00:00,  3.23it/s]


based on your movie list [Treasure Planet (2002), All the King's Men (1949), Maid in Manhattan (2002), Saving Grace (2000), Dr. Phibes Rises Again (1972), Secret Window (2004), Pokémon: The First Movie (1998), Stealth (2005), 99 francs (2007), Hellboy (2004)] 
 You might like the following films
------------------------------------------------------------------------
Title: Blockers (2018)
genres: Comedy
------------------------------------------------------------------------
Title: Down Argentine Way (1940)
genres: Comedy|Drama|Romance
------------------------------------------------------------------------
Title: Sommersby (1993)
genres: Drama|Mystery|Romance
------------------------------------------------------------------------
Title: Taps (1981)
genres: Drama
------------------------------------------------------------------------
Title: Harry Potter and the Prisoner of Azkaban (2004)
genres: Adventure|Fantasy|IMAX
running training and testing for user 1
 ------------------------

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:09<00:00,  3.17it/s]


based on your movie list [Total Eclipse (1995), Baby Mama (2008), Vertical Limit (2000), Bronco Billy (1980), Eddie Murphy Delirious (1983), Disclosure (1994), Mission: Impossible II (2000), It's Complicated (2009), Tales from the Crypt Presents: Demon Knight (1995), Scream (1996)] 
 You might like the following films
------------------------------------------------------------------------
Title: Session 9 (2001)
genres: Horror|Thriller
------------------------------------------------------------------------
Title: Batman: Mask of the Phantasm (1993)
genres: Animation|Children
------------------------------------------------------------------------
Title: Diner (1982)
genres: Comedy|Drama
------------------------------------------------------------------------
Title: School of Rock (2003)
genres: Comedy|Musical
------------------------------------------------------------------------
Title: All Dogs Go to Heaven (1989)
genres: Animation|Children|Comedy|Drama|Fantasy
running training and

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:09<00:00,  3.23it/s]


based on your movie list [Crumb (1994), Time Bandits (1981), Man Who Shot Liberty Valance, The (1962), Man Up (2015), American Movie (1999), Borrowers, The (1997), Twins (1988), Edward Scissorhands (1990), Naked Gun: From the Files of Police Squad!, The..., Billy Madison (1995)] 
 You might like the following films
------------------------------------------------------------------------
Title: Snowpiercer (2013)
genres: Action|Drama|Sci-Fi
------------------------------------------------------------------------
Title: Scott Pilgrim vs. the World (2010)
genres: Action|Comedy|Fantasy|Musical|Romance
------------------------------------------------------------------------
Title: Dogma (1999)
genres: Adventure|Comedy|Fantasy
------------------------------------------------------------------------
Title: Hangover, The (2009)
genres: Comedy|Crime
------------------------------------------------------------------------
Title: Religulous (2008)
genres: Comedy|Documentary
running training and t

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:09<00:00,  3.23it/s]


based on your movie list [Mister Roberts (1955), Son of Rambow (2007), Fast Five (Fast and the Furious 5, The) (2011), Karate Kid, Part II, The (1986), Dante's Peak (1997), Space Jam (1996), Survivor (2015), Time Bandits (1981), Series([], ), Man Up (2015)] 
 You might like the following films
------------------------------------------------------------------------
Title: Live Free or Die Hard (2007)
genres: Action|Adventure|Crime|Thriller
------------------------------------------------------------------------
Title: Silence of the Lambs, The (1991)
genres: Crime|Horror|Thriller
------------------------------------------------------------------------
Title: Murder in the First (1995)
genres: Drama|Thriller
------------------------------------------------------------------------
Title: Blue Lagoon, The (1980)
genres: Adventure|Drama|Romance
------------------------------------------------------------------------
Title: Daylight (1996)
genres: Action|Adventure|Drama|Thriller
running tra

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:09<00:00,  3.10it/s]

based on your movie list [Angels and Insects (1995), Point Break (1991), Circle of Friends (1995), Serving Sara (2002), Divergent (2014), Being There (1979), Lawnmower Man, The (1992), Sicario (2015), Secondhand Lions (2003), Let the Right One In (Låt den rätte komma in) (...] 
 You might like the following films
------------------------------------------------------------------------
Title: Stand by Me (1986)
genres: Adventure|Drama
------------------------------------------------------------------------
Title: Rocky IV (1985)
genres: Action|Drama
------------------------------------------------------------------------
Title: Guardians of the Galaxy 2 (2017)
genres: Action|Adventure|Sci-Fi
------------------------------------------------------------------------
Title: Gangs of New York (2002)
genres: Crime|Drama
------------------------------------------------------------------------
Title: Super Size Me (2004)
genres: Comedy|Documentary|Drama





NameError: name 'ra_list' is not defined