# Basic GA structure

### 4 core functions
1. Fitness function: determines the evolutionary fitness (i.e., the likelihood that the agent passes on its "genes"/movie ratings)
2. Selection: selects the most fit and the most similar "users" and pairs them off for mating/passing on their genes
3. Crossover: selects random components from both agents when creating offspring
4. Mutation: accounts for random variance by adding traits randomly

### 1 main function
- Uses all of the core functions above to simulate the evolution of "user reviews" and create new, more varied users

In [379]:
import ast 
import random 
import numpy as np
import pandas as pd
import copy

In [380]:
## compute the mean of every user's similarity vector
def average_simillarity(df):
    """Map each ``userId`` in *df* to the mean of its similarity vector.

    Every row's ``simillarity_vector`` cell is parsed with
    ``ast.literal_eval`` and averaged with ``sum / len``.

    Args:
        df: DataFrame with ``userId`` and ``simillarity_vector`` columns.

    Returns:
        dict of ``{userId: mean similarity}``, in row order.

    Raises:
        ZeroDivisionError: if a row's parsed vector is empty.
    """
    def _mean_of(encoded_vector):
        values = ast.literal_eval(encoded_vector)
        return sum(values) / len(values)

    return {
        record["userId"]: _mean_of(record["simillarity_vector"])
        for _, record in df.iterrows()
    }
        
        

In [381]:
# Load the similarity matrix CSV into the module-level `sim_df`;
# Selection() and main() read this global directly.
sim_df = pd.read_csv(f"./simillarity_matrix_normalized.csv")
# Average similarity per user, keyed by userId.
average_user_simillarities  = average_simillarity (sim_df)
# Sanity check: show which user ids were loaded.
print(average_user_simillarities.keys())



dict_keys([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 129.0, 130.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 138.0, 139.0, 140.0, 141.0, 142.0, 143.0, 144.0, 145.0, 146.0, 147.0, 148.0, 149.0, 150.0, 151.0, 152.0, 153.0, 154.0, 155.0, 156.0, 157.0

In [382]:
def above_percentile_thresh (thresh,sim,average_user_simillarities):
    """Tell whether *sim* reaches the *thresh*-th percentile of all averages.

    Args:
        thresh: percentile (0-100) passed to ``np.percentile``.
        sim: the value being tested.
        average_user_simillarities: dict whose values form the population
            the percentile is computed over.

    Returns:
        ``False`` when *sim* is strictly below the percentile cutoff,
        ``True`` otherwise (a value equal to the cutoff passes).
    """
    cutoff = np.percentile(list(average_user_simillarities.values()), thresh)
    # "not below" rather than ">=" so the result is a plain bool and the
    # comparison is the same one the cutoff test has always used.
    return not (sim < cutoff)
    

In [383]:
def fitness (userid,average_user_simillarities):
    """Return the evolutionary fitness of *userid*.

    A user whose average similarity falls below the 25th percentile of the
    population scores 0; every other user scores their own average
    similarity.

    Raises:
        KeyError: if *userid* is not a key of *average_user_simillarities*.
    """
    score = average_user_simillarities[userid]
    if not above_percentile_thresh(25, score, average_user_simillarities):
        return 0
    return score

In [384]:
def Selection (average_user_simillarities):
    """Pair fit users off for mating and return the list of pairings.

    Users whose ``fitness`` is 0 are excluded first. The rows of the
    module-level ``sim_df`` are then walked in order. For every user that is
    still unpaired, the similarity values of the users that remain are sorted
    ascending and a partner is drawn from the lowest third (70% of the time),
    the middle third (20%) or the highest third (10%).

    Args:
        average_user_simillarities: dict mapping user id -> average
            similarity. Its keys define the candidate population.

    Returns:
        A list of 2-tuples of user ids. The first element is the ``userId``
        value read from ``sim_df``; the second is the partner's 1-based
        position in that user's similarity vector. The closing pair built
        when exactly two candidates remain holds two 1-based positions.
    """
    user_pairings = []

    # Collect the unfit users in a separate pass. Removing from a list while
    # iterating it skips the element after each removal, which used to let
    # some unfit users through.
    removed_ids = {
        int(candidate)
        for candidate in average_user_simillarities
        if fitness(candidate, average_user_simillarities) == 0
    }

    for _, row in sim_df.iterrows():
        user = row["userId"]
        # skip users outside the population, unfit users and users already paired
        if user not in average_user_simillarities or int(user) in removed_ids:
            continue

        # the trailing element of the stored vector is dropped before use
        sim_OG = ast.literal_eval(row["simillarity_vector"])[:-1]

        # Keep (value, position) pairs so a chosen value maps back to exactly
        # one remaining user. Looking the value up with list.index() returned
        # the first match, which could be an already-removed user when two
        # similarity values tie; duplicate or out-of-range removed ids also
        # deleted the wrong entries. Filtering by position avoids both.
        candidates = sorted(
            (value, position)
            for position, value in enumerate(sim_OG)
            if position + 1 not in removed_ids
        )
        # NOTE(review): the current user's own position is not excluded from
        # the candidates, so a user can be drawn as their own partner —
        # confirm whether that is intended.
        length = len(candidates)
        first_third, second_third, last_third = length // 3, (2 * length) // 3, length
        rand = random.random()

        # stop once too few users remain to keep pairing (odd population)
        if length < 2:
            break
        if length == 2:
            # both ids are 1-based (the second one used to miss its +1)
            user_pairings.append((candidates[0][1] + 1, candidates[1][1] + 1))
            break

        # NOTE(review): the sort is ascending, so the first third holds the
        # smallest values; an earlier comment called the last third the
        # "least correlated users" — confirm which direction is intended.
        if rand > 0.3:
            lower, upper = 0, first_third - 1
        elif rand > 0.1:
            lower, upper = first_third, second_third - 1
        else:
            lower, upper = second_third, last_third - 1
        paired_userindex = candidates[random.randint(lower, upper)][1]

        # diagnostic output for ids near the end of the population
        if paired_userindex > 607:
            print(paired_userindex)
        if user > 607:
            # f-string: concatenating a str with a float raised TypeError
            print(f"user: {user}")

        # user 609 is never paired
        if int(paired_userindex + 1) == 609 or int(user) == 609:
            continue

        if paired_userindex < 1:
            print(paired_userindex)

        user_pairings.append((user, paired_userindex + 1))

        # preventing a user pairing with more than one person
        removed_ids.add(int(user))
        removed_ids.add(int(paired_userindex + 1))

    return user_pairings
            

In [385]:
def crossover (userratings1, userratings2):
    """Build one child from a random half of each parent's ratings.

    Both rating lists are converted to NumPy arrays and an independently
    shuffled index order is drawn for each. The child is the first half of
    parent 1's shuffled order followed by the second half of parent 2's.

    Args:
        userratings1: ratings of the first parent.
        userratings2: ratings of the second parent.

    Returns:
        A new list holding the selected entries from parent 1 followed by
        those from parent 2, in the form ``ndarray.tolist()`` produces.
    """
    first_parent = np.array(userratings1)
    second_parent = np.array(userratings2)

    # shuffle order matters for reproducibility: parent 1 first, then parent 2
    first_order = list(range(len(first_parent)))
    random.shuffle(first_order)
    second_order = list(range(len(second_parent)))
    random.shuffle(second_order)

    first_cut = len(first_parent) // 2
    second_cut = len(second_parent) // 2

    from_first = first_parent[first_order[:first_cut]]
    from_second = second_parent[second_order[second_cut:]]
    return from_first.tolist() + from_second.tolist()

In [386]:
def _perturb_random_rating(userlist, low, high):
    """Shift one randomly chosen rating in *userlist* by a random amount in [low, high]."""
    slot = random.randint(0, len(userlist) - 1)
    mid, rating = userlist[slot][0], userlist[slot][1]
    if random.random() > 0.5:
        rating += random.uniform(low, high)
    else:
        rating -= random.uniform(low, high)
    # Wrap back into the 0-5 range. With a positive modulus Python's %
    # never returns a negative number, so no abs() is needed afterwards.
    userlist[slot] = (mid, rating % 5)


def mutation (movie_list,userlist):
    """Randomly mutate *userlist* in place.

    One random draw decides which mutations fire; the thresholds are
    cumulative, so a high draw triggers several of them:

    * draw > 0.2  – mild: one random rating moves by 0.5-2.0
    * draw > 0.6  – moderate: one random rating moves by 2.0-4.0
    * draw > 0.95 – a movie drawn from *movie_list* is appended with a
      random score between 0.0 and 5.0

    Args:
        movie_list: list of movie ids a new movie may be drawn from.
        userlist: list of ``(movie id, rating)`` entries; modified in
            place. A changed entry is stored back as a tuple.

    Returns:
        None.
    """
    rand = random.random()

    # An empty ratings list has nothing to perturb; random.randint(0, -1)
    # would raise ValueError.
    if userlist:
        ## sometimes the mutation will be mild and will only change a rating
        if rand > 0.2:
            _perturb_random_rating(userlist, 0.5, 2.0)
        ## sometimes the mutation will be moderate and will greatly change the rating
        if rand > 0.6:
            _perturb_random_rating(userlist, 2.0, 4.0)

    # sometimes the mutation will add a new movie
    if rand > 0.95 and movie_list:
        # Append an actual movie id from the list; the position inside
        # movie_list is not itself a movie id.
        userlist.append((random.choice(movie_list), random.uniform(0.0, 5.0)))

In [387]:
# def get_movie_fromid (userlist,movieID): 
#     movie_index = [i for i, v in enumerate(lis) if v[0] == movieID]

In [399]:
def _ratings_for_user(user_revs, user_id):
    """Return the ratings list stored for *user_id*, parsing it if it is still a string."""
    raw = user_revs.loc[user_revs['userId'] == user_id]["ratings_list"].tolist()[0]
    # Ratings read from CSV are string-encoded lists; ratings produced by a
    # previous generation are already Python lists.
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


def run_one_generation (gen_num,user_revs,movie_list,average_user_simillarities):
    """Breed one new generation of users and save it to disk.

    ``Selection`` pairs users off; every pair produces two children through
    ``crossover`` followed by ``mutation``. Randomly chosen pairs then
    produce extra children until the population reaches the module-level
    ``correct_size``. New users get ids 1, 2, 3, ... and inherit the product
    of their parents' average similarities as their own.

    Args:
        gen_num: generation number; used in the output file name.
        user_revs: DataFrame with ``userId`` and ``ratings_list`` columns
            for the current generation.
        movie_list: list of movie ids, forwarded to ``mutation``.
        average_user_simillarities: dict of user id -> average similarity
            for the current generation.

    Returns:
        ``(new_users_df, new_average_user_simillarities)`` where the
        DataFrame has ``userId`` and ``ratings_list`` columns.

    Raises:
        ValueError: if the population must be topped up but selection
            produced no pairings.
        IndexError: if a paired user id has no row in *user_revs*.

    Side effects:
        Writes ``./generation_saves/ratings_gen_{gen_num}.csv``.
    """
    import os  # local import: only needed to make sure the save directory exists

    pairings = Selection(average_user_simillarities)
    users = {}
    new_average_user_simillarities = {}

    def _register(child, inherited_score):
        # ids are handed out consecutively, starting at 1
        child_id = len(users) + 1
        users[child_id] = child
        new_average_user_simillarities[child_id] = inherited_score

    for first_id, second_id in pairings:
        parent1 = _ratings_for_user(user_revs, first_id)
        parent2 = _ratings_for_user(user_revs, second_id)
        inherited = average_user_simillarities[first_id] * average_user_simillarities[second_id]

        child1 = crossover(parent1, parent2)
        child2 = crossover(parent1, parent2)
        mutation(movie_list, child1)
        mutation(movie_list, child2)

        _register(child1, inherited)
        _register(child2, inherited)

    # random fit "couples" will have more children to compensate for the unfit
    # ensures our population does not decrease every generation
    shortfall = correct_size - len(users)
    if shortfall > 0 and not pairings:
        raise ValueError(
            f"generation {gen_num}: selection produced no pairings to breed from"
        )
    for _ in range(shortfall):
        first_id, second_id = random.choice(pairings)
        parent1 = _ratings_for_user(user_revs, first_id)
        parent2 = _ratings_for_user(user_revs, second_id)

        extra_child = crossover(parent1, parent2)
        mutation(movie_list, extra_child)

        _register(
            extra_child,
            average_user_simillarities[first_id] * average_user_simillarities[second_id],
        )

    tempDF = pd.DataFrame(
        {"userId": list(users.keys()), "ratings_list": list(users.values())}
    )

    # create the save directory up front so a missing folder does not abort
    # the run after the whole generation has been computed
    os.makedirs("./generation_saves", exist_ok=True)
    tempDF.to_csv(f"./generation_saves/ratings_gen_{gen_num}.csv")

    return tempDF,new_average_user_simillarities
        

In [402]:
correct_size = 608  # population size each generation is topped up to


def main (generations=2999):
    """Run the genetic simulation for *generations* consecutive generations.

    Loads the movie ids and the starting ratings from CSV, computes the
    starting average similarities from the module-level ``sim_df``, then
    calls ``run_one_generation`` once per generation, feeding each
    generation's output into the next.

    Args:
        generations: number of generations to simulate. Generations are
            numbered from 1; the default of 2999 matches the previously
            hard-coded ``range(1, 3000)``.
    """
    movie_list = pd.read_csv("./ml-latest-small/movies.csv")['movieId'].tolist()
    user_revs = pd.read_csv("./ratings_organized.csv")
    average_user_simillarities = average_simillarity(sim_df)
    for i in range(1, generations + 1):
        print(f"running simulation of generation {i}")
        user_revs, average_user_simillarities = run_one_generation(
            i, user_revs, movie_list, average_user_simillarities
        )

In [403]:
# Run the full simulation; prints one progress line per generation.
main()

running simulation of generation 1
running simulation of generation 2
running simulation of generation 3
running simulation of generation 4
running simulation of generation 5
running simulation of generation 6
running simulation of generation 7
running simulation of generation 8
running simulation of generation 9
running simulation of generation 10
running simulation of generation 11
running simulation of generation 12
running simulation of generation 13
running simulation of generation 14
running simulation of generation 15
running simulation of generation 16
running simulation of generation 17
running simulation of generation 18
running simulation of generation 19
running simulation of generation 20
running simulation of generation 21
running simulation of generation 22
running simulation of generation 23
running simulation of generation 24
running simulation of generation 25
running simulation of generation 26
running simulation of generation 27
running simulation of generation 28
r

running simulation of generation 226
running simulation of generation 227
running simulation of generation 228
running simulation of generation 229
running simulation of generation 230
running simulation of generation 231
running simulation of generation 232
running simulation of generation 233
running simulation of generation 234
running simulation of generation 235
running simulation of generation 236
running simulation of generation 237
running simulation of generation 238
running simulation of generation 239
running simulation of generation 240
running simulation of generation 241
running simulation of generation 242
running simulation of generation 243
running simulation of generation 244
running simulation of generation 245
running simulation of generation 246
running simulation of generation 247
running simulation of generation 248
running simulation of generation 249
running simulation of generation 250
running simulation of generation 251
running simulation of generation 252
r

running simulation of generation 448
running simulation of generation 449
running simulation of generation 450
running simulation of generation 451
running simulation of generation 452
running simulation of generation 453
running simulation of generation 454
running simulation of generation 455
running simulation of generation 456
running simulation of generation 457
running simulation of generation 458
running simulation of generation 459
running simulation of generation 460
running simulation of generation 461
running simulation of generation 462
running simulation of generation 463
running simulation of generation 464
running simulation of generation 465
running simulation of generation 466
running simulation of generation 467
running simulation of generation 468
running simulation of generation 469
running simulation of generation 470
running simulation of generation 471
running simulation of generation 472
running simulation of generation 473
running simulation of generation 474
r

running simulation of generation 670
running simulation of generation 671
running simulation of generation 672
running simulation of generation 673
running simulation of generation 674
running simulation of generation 675
running simulation of generation 676
running simulation of generation 677
running simulation of generation 678
running simulation of generation 679
running simulation of generation 680
running simulation of generation 681
running simulation of generation 682
running simulation of generation 683
running simulation of generation 684
running simulation of generation 685
running simulation of generation 686
running simulation of generation 687
running simulation of generation 688
running simulation of generation 689
running simulation of generation 690
running simulation of generation 691
running simulation of generation 692
running simulation of generation 693
running simulation of generation 694
running simulation of generation 695
running simulation of generation 696
r

running simulation of generation 892
running simulation of generation 893
running simulation of generation 894
running simulation of generation 895
running simulation of generation 896
running simulation of generation 897
running simulation of generation 898
running simulation of generation 899
running simulation of generation 900
running simulation of generation 901
running simulation of generation 902
running simulation of generation 903
running simulation of generation 904
running simulation of generation 905
running simulation of generation 906
running simulation of generation 907
running simulation of generation 908
running simulation of generation 909
running simulation of generation 910
running simulation of generation 911
running simulation of generation 912
running simulation of generation 913
running simulation of generation 914
running simulation of generation 915
running simulation of generation 916
running simulation of generation 917
running simulation of generation 918
r

running simulation of generation 1111
running simulation of generation 1112
running simulation of generation 1113
running simulation of generation 1114
running simulation of generation 1115
running simulation of generation 1116
running simulation of generation 1117
running simulation of generation 1118
running simulation of generation 1119
running simulation of generation 1120
running simulation of generation 1121
running simulation of generation 1122
running simulation of generation 1123
running simulation of generation 1124
running simulation of generation 1125
running simulation of generation 1126
running simulation of generation 1127
running simulation of generation 1128
running simulation of generation 1129
running simulation of generation 1130
running simulation of generation 1131
running simulation of generation 1132
running simulation of generation 1133
running simulation of generation 1134
running simulation of generation 1135
running simulation of generation 1136
running simu

running simulation of generation 1327
running simulation of generation 1328
running simulation of generation 1329
running simulation of generation 1330
running simulation of generation 1331
running simulation of generation 1332
running simulation of generation 1333
running simulation of generation 1334
running simulation of generation 1335
running simulation of generation 1336
running simulation of generation 1337
running simulation of generation 1338
running simulation of generation 1339
running simulation of generation 1340
running simulation of generation 1341
running simulation of generation 1342
running simulation of generation 1343
running simulation of generation 1344
running simulation of generation 1345
running simulation of generation 1346
running simulation of generation 1347
running simulation of generation 1348
running simulation of generation 1349
running simulation of generation 1350
running simulation of generation 1351
running simulation of generation 1352
running simu

running simulation of generation 1543
running simulation of generation 1544
running simulation of generation 1545
running simulation of generation 1546
running simulation of generation 1547
running simulation of generation 1548
running simulation of generation 1549
running simulation of generation 1550
running simulation of generation 1551
running simulation of generation 1552
running simulation of generation 1553
running simulation of generation 1554
running simulation of generation 1555
running simulation of generation 1556
running simulation of generation 1557
running simulation of generation 1558
running simulation of generation 1559
running simulation of generation 1560
running simulation of generation 1561
running simulation of generation 1562
running simulation of generation 1563
running simulation of generation 1564
running simulation of generation 1565
running simulation of generation 1566
running simulation of generation 1567
running simulation of generation 1568
running simu

running simulation of generation 1759
running simulation of generation 1760
running simulation of generation 1761
running simulation of generation 1762
running simulation of generation 1763
running simulation of generation 1764
running simulation of generation 1765
running simulation of generation 1766
running simulation of generation 1767
running simulation of generation 1768
running simulation of generation 1769
running simulation of generation 1770
running simulation of generation 1771
running simulation of generation 1772
running simulation of generation 1773
running simulation of generation 1774
running simulation of generation 1775
running simulation of generation 1776
running simulation of generation 1777
running simulation of generation 1778
running simulation of generation 1779
running simulation of generation 1780
running simulation of generation 1781
running simulation of generation 1782
running simulation of generation 1783
running simulation of generation 1784
running simu

running simulation of generation 1975
running simulation of generation 1976
running simulation of generation 1977
running simulation of generation 1978
running simulation of generation 1979
running simulation of generation 1980
running simulation of generation 1981
running simulation of generation 1982
running simulation of generation 1983
running simulation of generation 1984
running simulation of generation 1985
running simulation of generation 1986
running simulation of generation 1987
running simulation of generation 1988
running simulation of generation 1989
running simulation of generation 1990
running simulation of generation 1991
running simulation of generation 1992
running simulation of generation 1993
running simulation of generation 1994
running simulation of generation 1995
running simulation of generation 1996
running simulation of generation 1997
running simulation of generation 1998
running simulation of generation 1999
running simulation of generation 2000
running simu

running simulation of generation 2191
running simulation of generation 2192
running simulation of generation 2193
running simulation of generation 2194
running simulation of generation 2195
running simulation of generation 2196
running simulation of generation 2197
running simulation of generation 2198
running simulation of generation 2199
running simulation of generation 2200
running simulation of generation 2201
running simulation of generation 2202
running simulation of generation 2203
running simulation of generation 2204
running simulation of generation 2205
running simulation of generation 2206
running simulation of generation 2207
running simulation of generation 2208
running simulation of generation 2209
running simulation of generation 2210
running simulation of generation 2211
running simulation of generation 2212
running simulation of generation 2213
running simulation of generation 2214
running simulation of generation 2215
running simulation of generation 2216
running simu

running simulation of generation 2407
running simulation of generation 2408
running simulation of generation 2409
running simulation of generation 2410
running simulation of generation 2411
running simulation of generation 2412
running simulation of generation 2413
running simulation of generation 2414
running simulation of generation 2415
running simulation of generation 2416
running simulation of generation 2417
running simulation of generation 2418
running simulation of generation 2419
running simulation of generation 2420
running simulation of generation 2421
running simulation of generation 2422
running simulation of generation 2423
running simulation of generation 2424
running simulation of generation 2425
running simulation of generation 2426
running simulation of generation 2427
running simulation of generation 2428
running simulation of generation 2429
running simulation of generation 2430
running simulation of generation 2431
running simulation of generation 2432
running simu

running simulation of generation 2623
running simulation of generation 2624
running simulation of generation 2625
running simulation of generation 2626
running simulation of generation 2627
running simulation of generation 2628
running simulation of generation 2629
running simulation of generation 2630
running simulation of generation 2631
running simulation of generation 2632
running simulation of generation 2633
running simulation of generation 2634
running simulation of generation 2635
running simulation of generation 2636
running simulation of generation 2637
running simulation of generation 2638
running simulation of generation 2639
running simulation of generation 2640
running simulation of generation 2641
running simulation of generation 2642
running simulation of generation 2643
running simulation of generation 2644
running simulation of generation 2645
running simulation of generation 2646
running simulation of generation 2647
running simulation of generation 2648
running simu

running simulation of generation 2839
running simulation of generation 2840
running simulation of generation 2841
running simulation of generation 2842
running simulation of generation 2843
running simulation of generation 2844
running simulation of generation 2845
running simulation of generation 2846
running simulation of generation 2847
running simulation of generation 2848
running simulation of generation 2849
running simulation of generation 2850
running simulation of generation 2851
running simulation of generation 2852
running simulation of generation 2853
running simulation of generation 2854
running simulation of generation 2855
running simulation of generation 2856
running simulation of generation 2857
running simulation of generation 2858
running simulation of generation 2859
running simulation of generation 2860
running simulation of generation 2861
running simulation of generation 2862
running simulation of generation 2863
running simulation of generation 2864
running simu