# Refining Profile Data
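Adds randomly generated categories (gender, age, religion, politics, and movie, music, sports and social-media preferences) to each profile bio.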

### Library and Data Imports

In [21]:
import pandas as pd
import _pickle as pickle
import numpy as np
from scipy.stats import halfnorm

In [22]:
# Read the pickled profiles table from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load a file that comes
# from a trusted source.
with open("profiles.pkl", "rb") as profiles_file:
    df = pickle.load(profiles_file)

In [23]:
# Keep only the free-text 'Bios' column (the columns being dropped are the
# numerical data).  The explicit .copy() makes `df` an independent frame, so
# the column assignments in later cells modify it directly and do not trigger
# pandas' SettingWithCopyWarning.
df = df[['Bios']].copy()

### Creating Lists for the Categories

In [47]:
def _split_pairs(pairs):
    """Split (label, probability) pairs into a label list and a probability list."""
    labels, weights = zip(*pairs)
    return list(labels), list(weights)


# Maps a category name to the sampling probabilities of its options.  Each
# probability list is index-aligned with the option list assigned beside it.
p = {}

# Gender
gender, p['Gender'] = _split_pairs([('male', 0.5), ('female', 0.5)])

# Movie genres
movies, p['Movies'] = _split_pairs([
    ('Adventure', 0.28),
    ('Action', 0.21),
    ('Drama', 0.16),
    ('Comedy', 0.14),
    ('Thriller', 0.09),
    ('Horror', 0.06),
    ('RomCom', 0.04),
    ('Musical', 0.01),
    ('Documentary', 0.01),
])

# TV genres (defined here, but not part of the `categories` / `names` lists
# assembled at the end of this cell)
tv, p['TV'] = _split_pairs([
    ('Comedy', 0.30),
    ('Drama', 0.23),
    ('Action/Adventure', 0.12),
    ('Suspense/Thriller', 0.12),
    ('Documentaries', 0.09),
    ('Crime/Mystery', 0.08),
    ('News', 0.03),
    ('SciFi', 0.02),
    ('History', 0.01),
])

# Religions (could potentially create a spectrum)
religion, p['Religion'] = _split_pairs([
    ('Catholic', 0.16),
    ('Christian', 0.16),
    ('Jewish', 0.01),
    ('Muslim', 0.19),
    ('Hindu', 0.11),
    ('Buddhist', 0.05),
    ('Spiritual', 0.10),
    ('Other', 0.09),
    ('Agnostic', 0.07),
    ('Atheist', 0.06),
])

# Music genres
music, p['Music'] = _split_pairs([
    ('Rock', 0.30),
    ('HipHop', 0.23),
    ('Pop', 0.20),
    ('Country', 0.10),
    ('Latin', 0.06),
    ('EDM', 0.04),
    ('Gospel', 0.03),
    ('Jazz', 0.02),
    ('Classical', 0.02),
])

# Sports
sports, p['Sports'] = _split_pairs([
    ('Football', 0.34),
    ('Baseball', 0.30),
    ('Basketball', 0.16),
    ('Hockey', 0.13),
    ('Soccer', 0.04),
    ('Other', 0.03),
])

# Politics (could also put on a spectrum)
politics, p['Politics'] = _split_pairs([
    ('Liberal', 0.26),
    ('Progressive', 0.11),
    ('Centrist', 0.11),
    ('Moderate', 0.15),
    ('Conservative', 0.37),
])

# Social media platforms
social, p['Social Media'] = _split_pairs([
    ('Facebook', 0.36),
    ('Youtube', 0.27),
    ('Twitter', 0.11),
    ('Reddit', 0.09),
    ('Instagram', 0.05),
    ('Pinterest', 0.03),
    ('LinkedIn', 0.03),
    ('SnapChat', 0.03),
    ('TikTok', 0.03),
])

# Age: one draw per profile from a half-normal distribution that starts at 18
# (loc) with scale 8; .astype(int) truncates each draw to a whole number.  The
# fixed random_state makes the generated ages reproducible between runs.
age = halfnorm.rvs(loc=18, scale=8, size=df.shape[0], random_state=42).astype(int)

# Column name -> option list (or, for 'Age', the pre-sampled values).  Writing
# each name next to its data keeps the two from drifting out of alignment.
# NOTE(review): `tv` / p['TV'] are defined above but are not included here, so
# no 'TV' column is produced -- confirm whether that omission is intentional.
combined = {
    'Gender': gender,
    'Movies': movies,
    'Religion': religion,
    'Music': music,
    'Politics': politics,
    'Social Media': social,
    'Sports': sports,
    'Age': age,
}

# Still exposed as separate lists, in the same order as the mapping.
names = list(combined)
categories = list(combined.values())

### Establishing random values for each category

In [48]:
# Fill one column per category.  A dedicated, seeded generator makes the
# synthetic values reproducible across kernel restarts.
rng = np.random.RandomState(42)
n_rows = df.shape[0]

for name, cats in combined.items():
    if name in ('Religion', 'Politics', 'Gender'):
        # Single-valued traits: one weighted draw per profile.
        df[name] = rng.choice(cats, n_rows, p=p[name])

    elif name == 'Age':
        # Ages were already sampled (half-normal distribution) when `age`
        # was built, so they are assigned as-is.
        df[name] = cats

    else:
        # Multi-valued interests: three draws *with replacement* per profile,
        # then de-duplicated, so each row ends up with 1-3 distinct values.
        # A category without an entry in `p` is sampled uniformly (p=None);
        # malformed probabilities raise instead of being silently ignored.
        picks = rng.choice(cats, size=(n_rows, 3), p=p.get(name))

        # dict.fromkeys removes repeats while keeping draw order, which is
        # deterministic (unlike the iteration order of a set of strings).
        df[name] = pd.Series(
            [list(dict.fromkeys(row)) for row in picks.tolist()],
            index=df.index,
        )
        

In [49]:
# Preview the first 20 rows to eyeball the generated columns.
df.head(n=20)

Unnamed: 0,Bios,Gender,Movies,Religion,Music,Politics,Social Media,Sports,Age
0,Evil communicator. Avid analyst. Freelance gam...,female,"[Adventure, Action]",Other,"[Rock, HipHop]",Moderate,"[TikTok, Twitter]","[Basketball, Baseball, Football]",19
1,Alcohol lover. Evil analyst. Infuriatingly hum...,female,"[Adventure, Drama, Action]",Christian,"[Rock, HipHop]",Conservative,"[Youtube, Reddit, Facebook]",[Football],18
2,Amateur pop culture ninja. Social media expert...,male,"[RomCom, Drama]",Muslim,"[Pop, Rock]",Liberal,"[Facebook, Youtube, Twitter]","[Basketball, Football]",20
3,Wannabe baconaholic. Music evangelist. Hardcor...,male,"[Thriller, Drama, Action]",Muslim,"[Jazz, HipHop, Rock]",Liberal,"[Youtube, Reddit, Twitter]","[Hockey, Football]",19
4,Wannabe creator. Student. Social media enthusi...,male,"[Comedy, Adventure, RomCom]",Christian,"[Pop, Rock, HipHop]",Conservative,"[Youtube, Facebook, SnapChat]","[Baseball, Hockey, Soccer]",22
5,Beer trailblazer. Avid troublemaker. Pop cultu...,female,"[Adventure, Drama, Action]",Agnostic,"[HipHop, Classical]",Centrist,"[Pinterest, Youtube, LinkedIn]","[Football, Soccer]",22
6,Infuriatingly humble coffee geek. Music enthus...,male,"[Horror, RomCom, Drama]",Spiritual,"[Pop, Rock, Latin]",Centrist,"[Youtube, Facebook, LinkedIn]","[Baseball, Football]",18
7,Extreme coffee evangelist. Social media enthus...,male,"[Comedy, Documentary, Action]",Muslim,"[Country, HipHop, Rock]",Progressive,"[LinkedIn, Twitter, Pinterest]","[Baseball, Basketball, Soccer]",24
8,Hardcore social media fanatic. Extreme coffee ...,female,"[Adventure, Drama]",Muslim,"[Pop, Rock]",Liberal,"[Youtube, Facebook]","[Baseball, Hockey]",19
9,Hipster-friendly alcoholaholic. Music speciali...,female,"[Comedy, Thriller, Action]",Catholic,"[HipHop, Latin]",Liberal,"[Youtube, Instagram]","[Basketball, Baseball, Football]",20


### Categorizing

In [50]:
# Convert the two single-valued text columns to ordered categoricals.  The
# category order is taken from the `religion` and `politics` option lists
# defined earlier in the notebook, so the ordering is declared in one place
# and stays in sync with the values that were sampled from those lists.
for column, ordering in (('Religion', religion), ('Politics', politics)):
    df[column] = pd.Categorical(df[column], categories=ordering, ordered=True)

### Exporting the DF

In [51]:
# Persist the enriched profiles to "refined_profiles.pkl" in the working
# directory for later use.
with open("refined_profiles.pkl", "wb") as out_file:
    pickle.dump(df, out_file)