# Basic GA structure

### 4 core functions
1. Fitness function: determines the evolutionary fitness (i.e. the likelihood that the agent passes on its "genes"/movie ratings)
2. Selection: selects the most fit and the most similar "users" and pairs them off for mating/passing on their genes
3. Crossover: selects random components from both agents when creating offspring
4. Mutation: accounts for random variance by adding traits randomly

### 1 main function
- uses all the core functions above to simulate the evolution of "user reviews" and create new and more varied users

In [379]:
import ast 
import random 
import numpy as np
import pandas as pd
import copy

In [380]:
def average_simillarity(df):
    """Map each user id to the mean of that user's similarity vector.

    Every row of ``df`` supplies a ``userId`` and a ``simillarity_vector``
    holding the text of a Python list literal.  The text is parsed with
    ``ast.literal_eval`` and averaged with a plain sum / len.

    Returns:
        dict of ``{userId: mean similarity}``.
    """
    mean_by_user = {}

    for user, raw_vector in zip(df["userId"], df["simillarity_vector"]):
        vector = ast.literal_eval(raw_vector)
        mean_by_user[user] = sum(vector) / len(vector)

    return mean_by_user
        
        

In [381]:
# Load the normalized similarity matrix. `sim_df` stays at module level and is
# read directly by Selection and main further down in this notebook.
sim_df = pd.read_csv(f"./simillarity_matrix_normalized.csv")
# Average similarity per user id, computed from each row's similarity vector.
average_user_simillarities  = average_simillarity (sim_df)
# Show which user ids were loaded.
print(average_user_simillarities.keys())



dict_keys([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 129.0, 130.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 138.0, 139.0, 140.0, 141.0, 142.0, 143.0, 144.0, 145.0, 146.0, 147.0, 148.0, 149.0, 150.0, 151.0, 152.0, 153.0, 154.0, 155.0, 156.0, 157.0

In [382]:
def above_percentile_thresh (thresh,sim,average_user_simillarities):
    """Report whether ``sim`` reaches the ``thresh``-th percentile of all averages.

    The percentile is computed over the values of ``average_user_simillarities``.
    The result is False only when ``sim`` is strictly below that cutoff, so a
    value sitting exactly on the cutoff counts as passing.
    """
    cutoff = np.percentile(list(average_user_simillarities.values()), thresh)
    return not (sim < cutoff)
    

In [383]:
def fitness (userid,average_user_simillarities):
    """Return the user's average similarity, or 0 when the user is unfit.

    A user counts as unfit when their own average falls below the 25th
    percentile of every user's average (see ``above_percentile_thresh``).
    """
    own_average = average_user_simillarities[userid]
    if not above_percentile_thresh(25, own_average, average_user_simillarities):
        return 0
    return own_average

In [384]:
def Selection (average_user_simillarities):
    """Pair fit users off for mating and return the list of pairings.

    Users whose ``fitness`` is 0 are excluded up front.  The module-level
    ``sim_df`` is then walked row by row: for every fit user that has not been
    paired yet, the still-available entries of that user's similarity vector
    are sorted ascending and a partner is drawn from the first third of that
    ordering (70% of the time), the second third (20%) or the last third
    (10%).  Position ``i`` of a similarity vector is treated as user id
    ``i + 1``.

    Args:
        average_user_simillarities: dict mapping user id to average similarity.

    Returns:
        A list of ``(user_id, partner_id)`` tuples.
    """
    user_pairings = []

    # Split users into fit and unfit in a single pass.  Removing entries from
    # a list while looping over it skips the element after every removal,
    # which would let unfit users slip through.
    fit_users = set()
    removed_ids = set()  # int ids that may no longer be chosen as a partner
    for candidate in average_user_simillarities:
        if fitness(candidate, average_user_simillarities) == 0:
            removed_ids.add(int(candidate))
        else:
            fit_users.add(candidate)

    for _, row in sim_df.iterrows():
        user = row["userId"]
        if user not in fit_users or int(user) in removed_ids:
            continue

        # The final entry of the vector is dropped before pairing.
        simillarities = ast.literal_eval(row["simillarity_vector"])[:-1]

        # Keep every value together with its original position.  Looking the
        # position up again by value would resolve ties to the first matching
        # entry, which may belong to a user that was already removed.
        candidates = sorted(
            (value, position)
            for position, value in enumerate(simillarities)
            if position + 1 not in removed_ids
        )
        length = len(candidates)
        first_third, second_third = length // 3, 2 * length // 3
        rand = random.random()

        # Stop once too few candidates are left to draw from.
        if length < 2:
            break
        if length == 2:
            # Both positions are converted to 1-based user ids.
            user_pairings.append((candidates[0][1] + 1, candidates[1][1] + 1))
            break

        # NOTE(review): the ordering is ascending, so the "first third" holds
        # the smallest similarity values.  If larger values mean "more
        # similar", the 70% / 10% weighting is inverted - confirm the intent.
        # NOTE(review): the user's own entry is not excluded from the
        # candidates, so a self-pairing looks possible - confirm.
        if rand > 0.3:
            pick = random.randint(0, first_third - 1)
        elif rand > 0.1:
            pick = random.randint(first_third, second_third - 1)
        else:
            pick = random.randint(second_third, length - 1)
        paired_userindex = candidates[pick][1]

        # Diagnostics for ids at the upper edge of the population.
        if paired_userindex > 607:
            print(paired_userindex)
        if user > 607:
            # The id is numeric, so it is formatted rather than concatenated.
            print(f"user: {user}")

        # User id 609 is never paired.
        # NOTE(review): presumably keeps the population at 608 - confirm.
        if paired_userindex + 1 == 609 or int(user) == 609:
            continue

        if paired_userindex < 1:
            print(paired_userindex)

        user_pairings.append((user, paired_userindex + 1))

        # Both partners are now taken and cannot be paired again.
        removed_ids.add(int(user))
        removed_ids.add(paired_userindex + 1)

    return user_pairings
            

In [385]:
def crossover (userratings1, userratings2):
    """Build a child rating list from random halves of two parents.

    A random selection of ``len // 2`` entries is taken from the first parent
    and the remaining ``len - len // 2`` entries of an independent shuffle
    are taken from the second parent.  Both parents pass through
    ``np.array(...)`` / ``.tolist()``, so the child entries come back as plain
    lists.
    """
    first_parent = np.array(userratings1)
    second_parent = np.array(userratings2)

    first_order = list(range(len(first_parent)))
    random.shuffle(first_order)

    second_order = list(range(len(second_parent)))
    random.shuffle(second_order)

    first_cut = len(first_parent) // 2
    second_cut = len(second_parent) // 2

    genes_from_first = first_parent[first_order[:first_cut]].tolist()
    genes_from_second = second_parent[second_order[second_cut:]].tolist()

    return genes_from_first + genes_from_second

In [386]:
def _shift_random_rating(userlist, low, high):
    """Move one randomly chosen rating up or down by ``uniform(low, high)``.

    The entry is replaced in place by a ``(movie, rating)`` tuple.  The new
    rating is wrapped with ``% 5``, so it always lands in ``[0, 5)``; the
    modulo result is never negative, which is why no ``abs`` is needed.
    """
    position = random.randint(0, len(userlist) - 1)
    mid, rating = userlist[position][0], userlist[position][1]
    direction = 1 if random.random() > 0.5 else -1
    rating = (rating + direction * random.uniform(low, high)) % 5
    userlist[position] = (mid, rating)


def mutation (movie_list,userlist):
    """Randomly mutate ``userlist`` in place; nothing is returned.

    A single random draw decides which mutations apply.  The tiers are
    cumulative, not exclusive:

    * draw > 0.2  - one random rating shifts by 0.5 to 2.0,
    * draw > 0.6  - additionally, one random rating shifts by 2.0 to 4.0,
    * draw > 0.95 - additionally, a random movie from ``movie_list`` is
      appended with a random rating between 0.0 and 5.0.

    Args:
        movie_list: sequence of movie ids that a new rating may be drawn from.
        userlist: list of ``(movie, rating)`` entries; modified in place.
    """
    rand = random.random()

    # An empty rating list has nothing to shift; skip instead of crashing.
    if len(userlist) > 0:
        # Sometimes the mutation is mild and only nudges a rating.
        if rand > 0.2:
            _shift_random_rating(userlist, 0.5, 2.0)
        # Sometimes the mutation is moderate and changes a rating greatly.
        if rand > 0.6:
            _shift_random_rating(userlist, 2.0, 4.0)

    # Sometimes the mutation adds a new movie.
    if rand > 0.95 and len(movie_list) > 0:
        # Store the movie id found at the random position, not the position
        # itself, so the new entry matches the (movie, rating) layout.
        new_movie = movie_list[random.randint(0, len(movie_list) - 1)]
        random_score = random.uniform(0.0, 5.0)
        userlist.append((new_movie, random_score))

In [387]:
# def get_movie_fromid (userlist,movieID): 
#     movie_index = [i for i, v in enumerate(lis) if v[0] == movieID]

In [395]:
def _parent_ratings(user_revs, user_id):
    """Return the ratings list stored for ``user_id`` in ``user_revs``.

    Ratings that are still text (as read from CSV) are parsed with
    ``ast.literal_eval``; ratings that are already Python objects are
    returned unchanged.
    """
    stored = user_revs.loc[user_revs['userId'] == user_id]["ratings_list"].tolist()[0]
    if isinstance(stored, str):
        return ast.literal_eval(stored)
    return stored


def run_one_generation (gen_num,user_revs,movie_list,average_user_simillarities):
    """Breed one new generation of users and save it to disk.

    Every pairing returned by ``Selection`` produces two children via
    ``crossover`` followed by ``mutation``.  Randomly chosen pairings then
    produce one extra child each until the population reaches the
    module-level ``correct_size``.  Each child's average similarity is the
    product of its parents' average similarities.  Children are numbered
    from 1 in creation order.

    Args:
        gen_num: generation number; only used in the output file name.
        user_revs: DataFrame with ``userId`` and ``ratings_list`` columns.
        movie_list: movie ids handed to ``mutation``.
        average_user_simillarities: dict of user id -> average similarity for
            the parent generation.

    Returns:
        ``(DataFrame of the new users, dict of new user id -> similarity)``.

    Raises:
        ValueError: if the population must be topped up but ``Selection``
            produced no pairings to breed from.

    Side effects:
        Writes ``./generation_saves/ratings_gen_{gen_num}.csv``.
    """
    pairings = Selection(average_user_simillarities)
    new_userid = 1
    users = {}
    new_average_user_simillarities = {}

    for pair in pairings:
        parent1 = _parent_ratings(user_revs, pair[0])
        parent2 = _parent_ratings(user_revs, pair[1])

        child1 = crossover(parent1, parent2)
        child2 = crossover(parent1, parent2)

        mutation(movie_list, child1)
        mutation(movie_list, child2)

        inherited = average_user_simillarities[pair[0]] * average_user_simillarities[pair[1]]
        for child in (child1, child2):
            users[new_userid] = child
            new_average_user_simillarities[new_userid] = inherited
            new_userid += 1

    # Random fit "couples" have more children to compensate for the unfit;
    # this keeps the population from shrinking every generation.
    shortfall = correct_size - len(users)
    if shortfall > 0 and not pairings:
        raise ValueError("Selection produced no pairings; cannot refill the population")

    for _ in range(shortfall):
        pair = pairings[random.randint(0, len(pairings) - 1)]
        parent1 = _parent_ratings(user_revs, pair[0])
        parent2 = _parent_ratings(user_revs, pair[1])

        child1 = crossover(parent1, parent2)
        mutation(movie_list, child1)

        users[new_userid] = child1
        new_average_user_simillarities[new_userid] = average_user_simillarities[pair[0]] * average_user_simillarities[pair[1]]
        new_userid += 1

    tempdict = {"userId": list(users.keys()), "ratings_list": list(users.values())}
    tempDF = pd.DataFrame.from_dict(tempdict)

    tempDF.to_csv(f"./generation_saves/ratings_gen_{gen_num}.csv")

    return tempDF, new_average_user_simillarities
        

In [396]:
# Population size that run_one_generation tops every generation up to.
correct_size = 608
def main ():
    """Run the genetic simulation for generations 1 through 99.

    Loads the movie ids and the organized user ratings from disk, derives the
    starting average similarities from the module-level ``sim_df``, then
    feeds each generation's output into the next call of
    ``run_one_generation``.
    """
    movie_list = pd.read_csv("./ml-latest-small/movies.csv")['movieId'].tolist()
    user_revs = pd.read_csv("./ratings_organized.csv")
    average_user_simillarities = average_simillarity(sim_df)

    for generation in range(1, 100):
        print(f"running simulation of generation {generation}")
        user_revs, average_user_simillarities = run_one_generation(
            generation, user_revs, movie_list, average_user_simillarities
        )

In [None]:
# Start the simulation; main() runs every generation in sequence.
main()

running simulation of generation 1
running simulation of generation 2
pair value: 554
user_rev len:608 
pair value: 330
user_rev len:608 
pair value: 191
user_rev len:608 
pair value: 185
user_rev len:608 
pair value: 581
user_rev len:608 
pair value: 598
user_rev len:608 
pair value: 246
user_rev len:608 
pair value: 520
user_rev len:608 
pair value: 130
user_rev len:608 
pair value: 415
user_rev len:608 
pair value: 474
user_rev len:608 
pair value: 221
user_rev len:608 
pair value: 577
user_rev len:608 
pair value: 133
user_rev len:608 
pair value: 598
user_rev len:608 
pair value: 532
user_rev len:608 
pair value: 218
user_rev len:608 
pair value: 147
user_rev len:608 
pair value: 559
user_rev len:608 
pair value: 378
user_rev len:608 
pair value: 573
user_rev len:608 
pair value: 472
user_rev len:608 
pair value: 226
user_rev len:608 
pair value: 511
user_rev len:608 
pair value: 425
user_rev len:608 
pair value: 449
user_rev len:608 
pair value: 74
user_rev len:608 
pair value: 4

pair value: 461
user_rev len:608 
pair value: 407
user_rev len:608 
pair value: 185
user_rev len:608 
pair value: 547
user_rev len:608 
pair value: 509
user_rev len:608 
pair value: 127
user_rev len:608 
pair value: 546
user_rev len:608 
pair value: 271
user_rev len:608 
pair value: 439
user_rev len:608 
pair value: 356
user_rev len:608 
pair value: 512
user_rev len:608 
pair value: 454
user_rev len:608 
pair value: 296
user_rev len:608 
pair value: 500
user_rev len:608 
pair value: 419
user_rev len:608 
pair value: 521
user_rev len:608 
pair value: 564
user_rev len:608 
pair value: 451
user_rev len:608 
pair value: 218
user_rev len:608 
pair value: 599
user_rev len:608 
pair value: 114
user_rev len:608 
pair value: 249
user_rev len:608 
pair value: 244
user_rev len:608 
pair value: 414
user_rev len:608 
pair value: 514
user_rev len:608 
pair value: 356
user_rev len:608 
pair value: 185
user_rev len:608 
pair value: 536
user_rev len:608 
pair value: 526
user_rev len:608 
pair value: 60

pair value: 191
user_rev len:608 
pair value: 428
user_rev len:608 
pair value: 444
user_rev len:608 
pair value: 371
user_rev len:608 
pair value: 504
user_rev len:608 
pair value: 576
user_rev len:608 
pair value: 517
user_rev len:608 
pair value: 504
user_rev len:608 
pair value: 493
user_rev len:608 
pair value: 570
user_rev len:608 
pair value: 91
user_rev len:608 
pair value: 467
user_rev len:608 
pair value: 428
user_rev len:608 
pair value: 176
user_rev len:608 
pair value: 502
user_rev len:608 
pair value: 474
user_rev len:608 
pair value: 226
user_rev len:608 
pair value: 356
user_rev len:608 
pair value: 552
user_rev len:608 
pair value: 191
user_rev len:608 
pair value: 584
user_rev len:608 
pair value: 123
user_rev len:608 
pair value: 563
user_rev len:608 
pair value: 431
user_rev len:608 
pair value: 253
user_rev len:608 
pair value: 423
user_rev len:608 
pair value: 338
user_rev len:608 
pair value: 91
user_rev len:608 
pair value: 586
user_rev len:608 
pair value: 554


pair value: 120
user_rev len:608 
pair value: 480
user_rev len:608 
pair value: 484
user_rev len:608 
pair value: 232
user_rev len:608 
pair value: 554
user_rev len:608 
pair value: 352
user_rev len:608 
pair value: 432
user_rev len:608 
pair value: 232
user_rev len:608 
pair value: 556
user_rev len:608 
pair value: 246
user_rev len:608 
pair value: 588
user_rev len:608 
pair value: 516
user_rev len:608 
pair value: 110
user_rev len:608 
pair value: 526
user_rev len:608 
pair value: 352
user_rev len:608 
pair value: 436
user_rev len:608 
pair value: 578
user_rev len:608 
pair value: 490
user_rev len:608 
pair value: 504
user_rev len:608 
pair value: 492
user_rev len:608 
pair value: 396
user_rev len:608 
pair value: 604
user_rev len:608 
pair value: 474
user_rev len:608 
pair value: 554
user_rev len:608 
pair value: 560
user_rev len:608 
pair value: 360
user_rev len:608 
pair value: 388
user_rev len:608 
pair value: 498
user_rev len:608 
pair value: 444
user_rev len:608 
pair value: 59

running simulation of generation 11
pair value: 344
user_rev len:608 
pair value: 526
user_rev len:608 
pair value: 364
user_rev len:608 
pair value: 274
user_rev len:608 
pair value: 400
user_rev len:608 
pair value: 484
user_rev len:608 
pair value: 406
user_rev len:608 
pair value: 142
user_rev len:608 
pair value: 432
user_rev len:608 
pair value: 512
user_rev len:608 
pair value: 556
user_rev len:608 
pair value: 550
user_rev len:608 
pair value: 192
user_rev len:608 
pair value: 442
user_rev len:608 
pair value: 396
user_rev len:608 
pair value: 310
user_rev len:608 
pair value: 238
user_rev len:608 
pair value: 598
user_rev len:608 
pair value: 536
user_rev len:608 
pair value: 352
user_rev len:608 
pair value: 324
user_rev len:608 
pair value: 302
user_rev len:608 
pair value: 250
user_rev len:608 
pair value: 498
user_rev len:608 
pair value: 294
user_rev len:608 
pair value: 354
user_rev len:608 
pair value: 160
user_rev len:608 
pair value: 440
user_rev len:608 
pair value: 

running simulation of generation 12
pair value: 312
user_rev len:608 
pair value: 208
user_rev len:608 
pair value: 456
user_rev len:608 
pair value: 92
user_rev len:608 
pair value: 454
user_rev len:608 
pair value: 130
user_rev len:608 
pair value: 290
user_rev len:608 
pair value: 460
user_rev len:608 
pair value: 338
user_rev len:608 
pair value: 578
user_rev len:608 
pair value: 474
user_rev len:608 
pair value: 342
user_rev len:608 
pair value: 378
user_rev len:608 
pair value: 552
user_rev len:608 
pair value: 378
user_rev len:608 
pair value: 220
user_rev len:608 
pair value: 478
user_rev len:608 
pair value: 584
user_rev len:608 
pair value: 580
user_rev len:608 
pair value: 476
user_rev len:608 
pair value: 474
user_rev len:608 
pair value: 564
user_rev len:608 
pair value: 206
user_rev len:608 
pair value: 606
user_rev len:608 
pair value: 374
user_rev len:608 
pair value: 366
user_rev len:608 
pair value: 272
user_rev len:608 
pair value: 272
user_rev len:608 
pair value: 2

pair value: 228
user_rev len:608 
pair value: 68
user_rev len:608 
pair value: 542
user_rev len:608 
pair value: 116
user_rev len:608 
pair value: 226
user_rev len:608 
pair value: 586
user_rev len:608 
pair value: 498
user_rev len:608 
pair value: 364
user_rev len:608 
pair value: 526
user_rev len:608 
pair value: 380
user_rev len:608 
pair value: 516
user_rev len:608 
pair value: 578
user_rev len:608 
pair value: 176
user_rev len:608 
pair value: 398
user_rev len:608 
pair value: 332
user_rev len:608 
pair value: 594
user_rev len:608 
pair value: 486
user_rev len:608 
pair value: 540
user_rev len:608 
pair value: 494
user_rev len:608 
pair value: 272
user_rev len:608 
pair value: 346
user_rev len:608 
pair value: 406
user_rev len:608 
pair value: 546
user_rev len:608 
pair value: 332
user_rev len:608 
pair value: 562
user_rev len:608 
pair value: 364
user_rev len:608 
pair value: 222
user_rev len:608 
pair value: 366
user_rev len:608 
pair value: 390
user_rev len:608 
pair value: 130

pair value: 386
user_rev len:608 
pair value: 594
user_rev len:608 
pair value: 494
user_rev len:608 
pair value: 444
user_rev len:608 
pair value: 204
user_rev len:608 
pair value: 274
user_rev len:608 
pair value: 396
user_rev len:608 
pair value: 442
user_rev len:608 
pair value: 432
user_rev len:608 
pair value: 440
user_rev len:608 
pair value: 556
user_rev len:608 
pair value: 298
user_rev len:608 
pair value: 308
user_rev len:608 
pair value: 276
user_rev len:608 
pair value: 416
user_rev len:608 
pair value: 478
user_rev len:608 
pair value: 440
user_rev len:608 
pair value: 114
user_rev len:608 
pair value: 302
user_rev len:608 
pair value: 524
user_rev len:608 
pair value: 564
user_rev len:608 
pair value: 342
user_rev len:608 
pair value: 558
user_rev len:608 
pair value: 348
user_rev len:608 
pair value: 360
user_rev len:608 
pair value: 382
user_rev len:608 
pair value: 552
user_rev len:608 
pair value: 532
user_rev len:608 
pair value: 360
user_rev len:608 
pair value: 49

pair value: 504
user_rev len:608 
pair value: 402
user_rev len:608 
pair value: 382
user_rev len:608 
pair value: 276
user_rev len:608 
pair value: 212
user_rev len:608 
pair value: 518
user_rev len:608 
pair value: 316
user_rev len:608 
pair value: 298
user_rev len:608 
pair value: 444
user_rev len:608 
pair value: 262
user_rev len:608 
pair value: 386
user_rev len:608 
pair value: 398
user_rev len:608 
pair value: 546
user_rev len:608 
pair value: 358
user_rev len:608 
pair value: 414
user_rev len:608 
pair value: 604
user_rev len:608 
pair value: 340
user_rev len:608 
pair value: 490
user_rev len:608 
pair value: 528
user_rev len:608 
pair value: 338
user_rev len:608 
pair value: 316
user_rev len:608 
pair value: 160
user_rev len:608 
pair value: 360
user_rev len:608 
pair value: 508
user_rev len:608 
pair value: 340
user_rev len:608 
pair value: 412
user_rev len:608 
pair value: 316
user_rev len:608 
pair value: 400
user_rev len:608 
pair value: 276
user_rev len:608 
pair value: 50

running simulation of generation 16
pair value: 248
user_rev len:608 
pair value: 382
user_rev len:608 
pair value: 541
user_rev len:608 
pair value: 500
user_rev len:608 
pair value: 554
user_rev len:608 
pair value: 130
user_rev len:608 
pair value: 608
user_rev len:608 
pair value: 390
user_rev len:608 
pair value: 156
user_rev len:608 
pair value: 476
user_rev len:608 
pair value: 228
user_rev len:608 
pair value: 352
user_rev len:608 
pair value: 322
user_rev len:608 
pair value: 358
user_rev len:608 
pair value: 512
user_rev len:608 
pair value: 440
user_rev len:608 
pair value: 340
user_rev len:608 
pair value: 566
user_rev len:608 
pair value: 272
user_rev len:608 
pair value: 324
user_rev len:608 
pair value: 498
user_rev len:608 
pair value: 546
user_rev len:608 
pair value: 606
user_rev len:608 
pair value: 456
user_rev len:608 
pair value: 434
user_rev len:608 
pair value: 238
user_rev len:608 
pair value: 486
user_rev len:608 
pair value: 356
user_rev len:608 
pair value: 

running simulation of generation 17
pair value: 414
user_rev len:608 
pair value: 226
user_rev len:608 
pair value: 504
user_rev len:608 
pair value: 250
user_rev len:608 
pair value: 588
user_rev len:608 
pair value: 582
user_rev len:608 
pair value: 338
user_rev len:608 
pair value: 224
user_rev len:608 
pair value: 450
user_rev len:608 
pair value: 150
user_rev len:608 
pair value: 512
user_rev len:608 
pair value: 404
user_rev len:608 
pair value: 594
user_rev len:608 
pair value: 336
user_rev len:608 
pair value: 132
user_rev len:608 
pair value: 64
user_rev len:608 
pair value: 322
user_rev len:608 
pair value: 504
user_rev len:608 
pair value: 374
user_rev len:608 
pair value: 370
user_rev len:608 
pair value: 596
user_rev len:608 
pair value: 546
user_rev len:608 
pair value: 548
user_rev len:608 
pair value: 320
user_rev len:608 
pair value: 250
user_rev len:608 
pair value: 500
user_rev len:608 
pair value: 448
user_rev len:608 
pair value: 308
user_rev len:608 
pair value: 1

pair value: 494
user_rev len:608 
pair value: 322
user_rev len:608 
pair value: 592
user_rev len:608 
pair value: 554
user_rev len:608 
pair value: 426
user_rev len:608 
pair value: 130
user_rev len:608 
pair value: 518
user_rev len:608 
pair value: 576
user_rev len:608 
pair value: 308
user_rev len:608 
pair value: 560
user_rev len:608 
pair value: 242
user_rev len:608 
pair value: 408
user_rev len:608 
pair value: 514
user_rev len:608 
pair value: 514
user_rev len:608 
pair value: 488
user_rev len:608 
pair value: 392
user_rev len:608 
pair value: 598
user_rev len:608 
pair value: 312
user_rev len:608 
pair value: 348
user_rev len:608 
pair value: 218
user_rev len:608 
pair value: 516
user_rev len:608 
pair value: 598
user_rev len:608 
pair value: 480
user_rev len:608 
pair value: 520
user_rev len:608 
pair value: 560
user_rev len:608 
pair value: 528
user_rev len:608 
pair value: 348
user_rev len:608 
pair value: 592
user_rev len:608 
pair value: 592
user_rev len:608 
pair value: 37