# Ant Colony Optimization

### Ant class: the “agents” that will be traversing the graph.


### Ant colony: a colony of ants. It is responsible for moving ants to their starting node as well as prompting the ants to move to the next node on their “journey”.


### Ant graph: the graph our agents will be traversing over


### Task: the task the ants will complete ("watch" movies and rate them)

In [1]:
import ast 
import random 
import numpy as np
import pandas as pd
import copy
import gc
from tqdm import tqdm

In [2]:
#lets think about our cost function for a bit
# we want to use the users ratings to 

In [3]:
class Ant: 
    
    def __init__(self, review_list):
        self.trail = []
        self.visited_jobs = []
        self.review_list = []
        self.current_film = 0;
        self.review_list = sorted(self.review_list, key=lambda x: x[1])
    
    def trail_len(self):
        return len(self.trail)
    
    def get_trail(self):
        return self.trail
    
    def get_current(self):
        return self.current_film
    
    def has_visited (self,i):
        self.visited_jobs[i]
        
    def watch_movie (self,movieID):
        self.current_film = movieID
        self.trail.append(movieID)
        
    def clear (self):
        del self.trail[:]
        self.visited_jobs = []
        self.current_film = 0;
        
        
    #get user review for specific movie returns a neutral rating of 2.5 if it does not exist
    def get_task_cost(self,movieID):
        cost = [y for x, y in self.review_list if x == movieID ]
        return  2.5 if cost == [] else cost[0]    

In [4]:
class AntColony:
    
    
    def __init__(self,antID_list,filename,alpha =0.5,beta = 0.4 ):
        user_reviews =  pd.read_csv(filename)
        self.ants = []
        self.alpha = alpha
        self.beta = beta
        
        for ID in antID_list:
            rev = ast.literal_eval(user_reviews.loc[user_reviews['userId'] == ID]["ratings_list"].tolist()[0])
            new_ant = Ant(rev)
            self.ants.append(new_ant)
    
        
        
    def prob_fx (self,rating, current_weight,sum_weights):
        prob = ((current_weight ** self.alpha) * (rating ** self.beta)) / sum_weights
        return prob
            
        
    def move_all_ants (self,edge_weights):
#         print("moving all ants")
#         print(f"num ants: {len(self.ants)}")
        for ant in self.ants:
            movie_id = self.determine_next_movie(ant,edge_weights[ant.get_current ()])
            ant.watch_movie(movie_id)

    def prep_for_test (self): 
        self.alpha = 0.9 
        self.beta = 0.1
        
    def return_all_tours(self):
        t = []
        #print(f"Num ants : {len(self.ants)}")
        for ant in self.ants:
            #print(len(ant.get_trail()))
            if (len(ant.get_trail()) > 20):
                print(ant.get_trail())
            
            t.append(ant.get_trail())
        
        ## will be used for global pheremone updating 
        
        #print(len(t))
        return t
    
    #here we use the index of the ant rather than its ID 
    def get_ant_rating (self,antIndex,movieID):
        return self.ants[antIndex].get_task_cost(movieID)
    
    def clear (self):
        for ant in self.ants:
            ant.clear()
        
            
    def determine_next_movie(self,ant,edge_weights):
        current_movie = ant.get_current ()
        trail = ant.trail
        temp_weights = {}
        probs = {}
        sum_cost = 0 
        
        #removes all nodes we have already visited
        for ew in edge_weights.keys():
            if (ew not in ant.trail):
                temp_weights [ew] = edge_weights[ew]

        # getting sum cost
        for vertex in temp_weights.keys():
            if (vertex not in ant.get_trail()):
                rating = ant.get_task_cost(vertex)
                weight = temp_weights[vertex]
                sum_cost += ((weight ** self.alpha) * (rating ** self.beta))
            
        for vertex in temp_weights.keys():
            if (vertex not in ant.get_trail() and vertex != 0 ):
                rating = ant.get_task_cost(vertex)
                weight = temp_weights[vertex]
                probs[vertex] = self.prob_fx(rating,weight,sum_cost)
            
            
#         print(f"Probs : {probs.keys()}")
        #Chooses the next film based on probabilities 
        #print (random.choices(list(probs.keys()), weights= list(probs.values()), k=1)[0])
        try:
            choice = random.choices(list(probs.keys()), weights= list(probs.values()), k=1)
        except:
            temp = [k for k in edge_weights.keys()] 
            choice = random.choice(temp)
        return 0 if choice == [] else choice[0]
    

            
                
        
                

In [5]:
class Task:
    title = ""
    genres = ""
    mID = None
    
    def __init__(self, movie,movie_details):
        self.mId = movie
        self.title = movie_details["title"].to_string(index= False)
        self.genres = movie_details["genres"].to_string(index= False)
    
    

In [6]:
class Ant_Graph:
    
    best_recommendations = []
    
    def __init__(self, movie_list,task_list, num_ants,num_movies_wanted,antID_list, filename = "./generation_saves/ratings_gen_50.csv", alpha =0.6, beta= 0.4, decay = 0.05):
        self.num_ants = num_ants
        self.trail_max_len = num_movies_wanted
        self.vertices = task_list
        self.decay = decay
        self.graph_edge_weights = {}
        
        for movieID in movie_list:
            if (movieID not in self.graph_edge_weights.keys()):
                self.graph_edge_weights[movieID] = {}
            for m in movie_list:
                if (m != movieID):
                    self.graph_edge_weights[movieID][m] = 0.5
        
        #creating a start node that does not belong to a movie, all agents will start there
        self.graph_edge_weights[0] = {}
        for movieID in movie_list:
            if (movieID != 0):
                self.graph_edge_weights[0][movieID] = 0.5
            
        self.ants = AntColony(antID_list,filename,alpha,beta)
            
    def clear(self): 
        self.graph_edge_weights = {}
        self.verticies   = None
        self.trail_max_len = 0
        self.num_ants = 0 
        self.decay = 0 
        self.ants.clear()
        self.ants = None
        self.best_recommendations = []
                    
                    
    def update_edge_weights (self):
        
        # we will chqnge the pheremones based on how much the user enjoyed that set ("trail") of movies
        # it will be calculated as   cumulative_trail_Raitings/ total_possible ratings, bound between 0 and 1 
        cumulative_ratings =  {}
        tours = self.ants.return_all_tours()
        for i,tour in enumerate(tours):
            prev = 0 
            for movie in tour:
#                 print(len(tour))
                curr = movie
                rating = self.ants.get_ant_rating(i,movie)
                if (prev not in list(cumulative_ratings.keys()) ):
                     cumulative_ratings[prev] = {}
                if (curr not in list(cumulative_ratings [prev].keys())):
                    cumulative_ratings [prev][curr] = rating
                else:
                    cumulative_ratings [prev][curr] += rating
                
                prev = curr
            
        for startnode in cumulative_ratings.keys():
            for nextnode in cumulative_ratings[startnode].keys():
                self.graph_edge_weights[startnode][nextnode] *= (1- self.decay)
                self.graph_edge_weights[startnode][nextnode] += (cumulative_ratings[startnode][nextnode] / (5* self.trail_max_len))
                
        del cumulative_ratings
    

    def compute_best_rec (self,exclude_list):
        temp_list = []
        self.ants.prep_for_test()
        while (len(temp_list)< self.trail_max_len):
            test_ant = Ant([])
            nextid = self.ants.determine_next_movie(test_ant,self.graph_edge_weights [test_ant.get_current()])
            test_ant.watch_movie(nextid)
            if (nextid not in exclude_list):
                temp_list.append(nextid)
        test_ant.clear()
        print(len(test_ant.trail))
        del test_ant
        
        return temp_list
    
    def best_recomendations (self):
        return self.best_recommendations
    
    def one_iter (self): 
#         print("running one iteration: \n -------------------------------")
        #print(f" trail max len : {self.trail_max_len}")
        self.ants.clear()
        for step in range(0,self.trail_max_len):
            #print("test")
            self.ants.move_all_ants(self.graph_edge_weights)
            
        self.update_edge_weights()
        self.ants.clear()
        
            
                
        
        

In [7]:
from pickle import load
file = open("genresList.pkl",'rb')
genres = load(file)
file.close()

def create_genre_tracker ():
    gt = {}
    for g in genres:
        gt[g] = 0 
    return gt


sim_df =  pd.read_csv("./simillarity_matrix.csv")
all_vals = []

for index, row in sim_df.iterrows():
    sim_list = ast.literal_eval(row["simillarity_vector"])
    all_vals += sim_list

max_sim = max(all_vals)
min_Sim = min(all_vals)

def z_normalization (val):
    return (val-min_Sim)/(max_sim - min_Sim)



In [8]:
def compute_simillarity_ratings (l1,l2,movies):
    l1_dict = dict(l1)
    l2_dict = dict(l2)
    simillarity = 0 
    
    genre_tracker1 = create_genre_tracker ()
    genre_tracker2 = create_genre_tracker ()
    
    l2_keys = list(l2_dict.keys())
    for key in l1_dict.keys():
        m1_genres = movies.loc[movies['movieId'] == key  ,'genres'].values[0].split("|")
        for mg in m1_genres:
            genre_tracker1 [mg] +=1
        if key in l2_keys:
            r1 = l1_dict [key]
            r2 = l2_dict [key]
            if (r1 < r2):
                simillarity += 100*(r1/r2)
            else:
                simillarity+= 100*(r2/r1) 
                
    for key in l2_keys:
        try: 
            m2_genres = movies.loc[movies['movieId'] == key  ,'genres'].values[0].split("|")
            for mg in m2_genres:
                genre_tracker2 [mg] +=1
        except:
            continue
            
    genre_simillarity =  0 
    for g in genres:
        tempg1 = genre_tracker1 [g]
        tempg2 = genre_tracker2 [g]
        if (tempg1 > 0 or tempg2 >0):
            genre_simillarity += (1000* (min(tempg1,tempg2)/max(tempg1,tempg2))* min (tempg1,tempg2))
        
    return z_normalization(genre_simillarity + simillarity)

In [9]:
def compute_simillarity_vector (ratings_df, userlist,movies,exclude_users):
    simillarity_dict = {}
    
    for index, row in ratings_df.iterrows():
        user2list = ast.literal_eval (row["ratings_list"])
        simillarity_dict [row["userId"]] = compute_simillarity_ratings (userlist,user2list,movies)
        
    for u in exclude_users:
        if (u in simillarity_dict.keys()):
            del simillarity_dict[u]
        
    return [(k,v) for k ,v in simillarity_dict.items()]

In [10]:
def select_similar_users(num_agents,user_ratings,exclude_users, userlist,movies, randomize = False):
    if ( not randomize):
        sim_vec = compute_simillarity_vector (user_ratings, userlist,movies,exclude_users) 
    
            
        sim_vec.sort(key=lambda pair: pair[1])
        return [a[0] for a in sim_vec[-num_agents:]]
    else: 
        userids = list (range(1,len(user_ratings.index)))
        return random.sample(userids, num_agents);

In [11]:
def main (num_agents, num_iter, num_movies, genre=[],genre_exclude=[],exclude_users =[], userlist = [],rating_fn = "./generation_saves/ratings_gen_50.csv", simillarility_fn = "./simillarity_matrix_normalized.csv", alpha= 0.7,beta =0.3,decay=0.05):
    # import datasets
    print ("importing datasets....")
    sim_df  = pd.read_csv(simillarility_fn)
    movies  = pd.read_csv(f"./ml-latest-small/movies.csv")
    user_ratings = pd.read_csv(rating_fn)
    
    ## if userlist is empty pick 50 random users as agents
    rand = (len(userlist) == 0)
        
    ## if not empty userlist will be the most simillar users 
    print(f"selecting {num_agents} simillar users...")
    users = select_similar_users(num_agents, user_ratings,exclude_users, userlist,movies, randomize = rand)
    
    ## determining all the movies that these users watched
    print("configuring movie formatting...")
    movie_list = []
    tasks = []
    for uID in users:
        film_list = ast.literal_eval(user_ratings.loc[user_ratings['userId'] == uID]["ratings_list"].tolist()[0])
        for film,rating in film_list:
            if (film not in movie_list):
                t = Task(film,movies.loc[movies['movieId'] == film])
                movgens = t.genres.split("|")
                
                skip = True
                for g in genre:
                    if (g in movgens ):
                        skip = False
                for g in genre_exclude:
                    if (g in movgens ):
                        skip = True
                if (not skip):
                    movie_list.append(film)
                tasks.append(t)
    
    ## define the graph
    print("creating the Graph...")
    graph = Ant_Graph(movie_list,tasks, num_agents ,num_movies, users, filename = rating_fn , alpha=alpha , beta= beta, decay = decay)
    print(f"Staring training fo {num_iter} iterations...")
    for i in tqdm(range (0,num_iter)):
        graph.one_iter()
        
    movies_user_already_watched = [mid for mid,rate in userlist ]
    recs = graph.compute_best_rec (movies_user_already_watched)
    print(f"Recommended ID's: {recs}")
    
    
    ## Adding some cleanup code to clear all vars because for some reason every second call to main was failing 
    graph.clear()
    del graph, movie_list, tasks, rand, users, user_ratings,sim_df,movies
    gc.collect()
    
    return recs

## Training and testing 

### training 

Training is rather trivial, all weneed to do is make an appropriate call to main()

### testing

Testing is a bit more difficult. We do not have a list of people who have volunteered to test the system by watching the recomendations and rating them so we have to get creative. We can set a few users in our existing dataset as "test users" then we will take the first k ratings of the user as input/"user history" and the n-k as "test ratings" . We will determine how good the recomendations are using the percentage of the reccomendations that appear in the "test ratings list and by comparing the ratings for movies that are in both. 

In [12]:
# evaluation metric
def evaluate_reccomendations (recs, test_set):
    missing_recs =0 
    ratings = []
    for movieID in recs: 
        rating = next((y for x, y in test_set if x == movieID ), None)
        if (rating == None):
            missing_recs+=1
        else: 
            ratings.append(rating)
            
    percent_overlap = (len(recs) - missing_recs) / len(recs)
    
    rating_accuracy = 0.5
    if (len(ratings) > 0):
        rating_accuracy = sum(ratings) / (5*len(ratings))
    
    return percent_overlap, rating_accuracy

In [13]:
### get 50 test users 
user_ratings = pd.read_csv("./ratings_organized.csv")
chosen_users = random.sample (list(range (0, len(user_ratings.index))),50)  ## we will pass this to main to make sure our train data does not include it 

train_revs, test_revs = [], []
for i in chosen_users:
    user_rev  = ast.literal_eval(user_ratings.iloc[[i]] ["ratings_list"].tolist()[0])
    split_index = int (len(user_rev) * 0.5)
    train_revs.append(user_rev[:split_index])
    test_revs.append(user_rev[split_index:])
    


# Testing now with different Configs 

## test preds on different genres

In [14]:
from pickle import load
file = open("genresList.pkl",'rb')
genres = load(file)
file.close()

In [None]:
acc_scores = {}
for genre in genres:
    ra_list = []
    for i,rev_list in enumerate(train_revs):
        print(f"running training and testing for user {i}\n --------------------------------------------")
        recs = main (num_agents =30 , num_iter = 30, num_movies = 5 , genre=[genre],genre_exclude=[], exclude_users = chosen_users , userlist = rev_list ,rating_fn = "./generation_saves/ratings_gen_1000.csv", simillarility_fn = "./simillarity_matrix_normalized.csv", alpha= 0.2,beta =0.8,decay=0.05)
        precent_overlap, ra = evaluate_reccomendations(recs,test_revs[i])
        if (precent_overlap != 0 ):
            ra_list.append(ra)

    print(f"genre: {genre}")
    if (len(ra_list) > 0):
        print (f"overall accuracy is {sum(ra_list)/len(ra_list)}")
        acc_scores [genre] = sum(ra_list)/len(ra_list)
    
print(acc_scores)

running training and testing for user 0
 --------------------------------------------
importing datasets....
selecting 30 simillar users...
configuring movie formatting...
creating the Graph...
Staring training fo 30 iterations...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 123.79it/s]


0
Recommended ID's: [1356.0, 1196.0, 780.0, 78499.0, 139385.0]
running training and testing for user 1
 --------------------------------------------
importing datasets....
selecting 30 simillar users...
configuring movie formatting...
creating the Graph...
Staring training fo 30 iterations...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 120.24it/s]


0
Recommended ID's: [2950.0, 2950.0, 1030.0, 5040.0, 485.0]
running training and testing for user 2
 --------------------------------------------
importing datasets....
selecting 30 simillar users...


In [None]:
acc_scores_ex = {}
for genre in genres:
    ra_list = []
    for i,rev_list in enumerate(train_revs):
        print(f"running training and testing for user {i}\n --------------------------------------------")
        recs = main (num_agents =30 , num_iter = 30, num_movies = 5 , genre=[],genre_exclude=[genre], exclude_users = chosen_users , userlist = rev_list ,rating_fn = "./generation_saves/ratings_gen_1000.csv", simillarility_fn = "./simillarity_matrix_normalized.csv", alpha= 0.2,beta =0.8,decay=0.05)
        precent_overlap, ra = evaluate_reccomendations(recs,test_revs[i])
        if (precent_overlap != 0 ):
            ra_list.append(ra)
    if (len(ra_list) > 0):
        print(f"genre: {genre}")
        print (f"overall accuracy is {sum(ra_list)/len(ra_list)}")
        acc_scores_ex [genre] = sum(ra_list)/len(ra_list)

In [None]:
print(acc_scores_ex)

In [None]:
from itertools import combinations
conjoined_genres  = combinations(genres, 2)
acc_scores_2 = {}
for genre in genres:
    ra_list = []
    for i,rev_list in enumerate(train_revs):
        print(f"running training and testing for user {i}\n --------------------------------------------")
        recs = main (num_agents =30 , num_iter = 30, num_movies = 5 , genre=[],genre_exclude=[genre], exclude_users = chosen_users , userlist = rev_list ,rating_fn = "./generation_saves/ratings_gen_1000.csv", simillarility_fn = "./simillarity_matrix_normalized.csv", alpha= 0.2,beta =0.8,decay=0.05)
        precent_overlap, ra = evaluate_reccomendations(recs,test_revs[i])
        if (precent_overlap != 0 ):
            ra_list.append(ra)

    if (len(ra_list) > 0):
        print(f"genre: {genre}")
        print (f"overall accuracy is {sum(ra_list)/len(ra_list)}")
        acc_scores_2 ["|".join(genre)] = sum(ra_list)/len(ra_list)

print(acc_scores_2)

In [None]:
from itertools import combinations
conjoined_genres  = combinations(genres, 3)
acc_scores_3 = {}
for genre in genres:
    ra_list = []
    for i,rev_list in enumerate(train_revs):
        print(f"running training and testing for user {i}\n --------------------------------------------")
        recs = main (num_agents =30 , num_iter = 30, num_movies = 5 , genre=[],genre_exclude=[genre], exclude_users = chosen_users , userlist = rev_list ,rating_fn = "./generation_saves/ratings_gen_1000.csv", simillarility_fn = "./simillarity_matrix_normalized.csv", alpha= 0.2,beta =0.8,decay=0.05)
        precent_overlap, ra = evaluate_reccomendations(recs,test_revs[i])
        if (precent_overlap != 0 ):
            ra_list.append(ra)

    if (len(ra_list) > 0):
        print(f"genre: {genre}")
        print (f"overall accuracy is {sum(ra_list)/len(ra_list)}")
        acc_scores_3 ["|".join(genre)] = sum(ra_list)/len(ra_list)
    
print(acc_scores_3)