# Set up

In [1]:
from textgenrnn import textgenrnn
from datetime import datetime
import os
import pandas as pd

In [2]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [82]:
model_cfg = {
    'word_level': True,   # set to True if want to train a word-level model (requires more data and smaller max_length)
    'rnn_size': 128,   # number of LSTM cells of each layer (128/256 recommended)
    'rnn_layers': 3,   # number of LSTM layers (>=2 recommended)
    'rnn_bidirectional': False,   # consider text both forwards and backward, can give a training boost
    'max_length': 30,   # number of tokens to consider before predicting the next (20-40 for characters, 5-10 for words recommended)
    'max_words': 10000,   # maximum number of words to model; the rest will be ignored (word-level model only)
}

train_cfg = {
    'line_delimited': False,   # set to True if each text has its own line in the source file
    'num_epochs': 50,   # set higher to train the model for longer
    'gen_epochs': 125,   # generates sample text from model after given number of epochs
    'train_size': 0.8,   # proportion of input data to train on: setting < 1.0 limits model from learning perfectly
    'dropout': 0.1,   # ignore a random proportion of source tokens each epoch, allowing model to generalize better
    'validation': False,   # If train__size < 1.0, test on holdout dataset; will make overall training slower
    'is_csv': False   # set to True if file is a CSV exported from Excel/BigQuery/pandas
}

# Data

In [15]:
raw_df = pd.read_csv('okcupid_text.csv')
raw_df.columns

Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3',
       ...
       'w199', 'w200', 'sd', 'ave_sentiment', 'Flesch_Kincaid',
       'Gunning_Fog_Index', 'Coleman_Liau', 'SMOG',
       'Automated_Readability_Index', 'Average_Grade_Level'],
      dtype='object', length=556)

In [58]:
raw_df.columns[1:50]

Index(['body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0', 'essay1',
       'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8',
       'essay9', 'ethnicity', 'height', 'income', 'job', 'last_online',
       'location', 'offspring', 'orientation', 'pets', 'religion', 'sex',
       'sign', 'smokes', 'speaks', 'status', 'uuid', 'age_group', 'body_fat',
       'body_fit', 'body_type_noanswer', 'body_thin', 'vegetarian',
       'halal_kosher', 'alcohol', 'drugs_rec', 'education_rec', 'grad_school',
       'black', 'asian', 'short', 'income_rec', 'haskids', 'wantskids', 'gay'],
      dtype='object')

In [59]:
df= raw_df[['essay0', 'sex']]
df= df.dropna()
df= df.sample(10000)
df.describe()

Unnamed: 0,essay0,sex
count,10000,10000
unique,9995,2
top,hi!,m
freq,3,5946


## Run Text Generation Model

In [61]:
df.essay0 = df.essay0.str.replace('<[^<]+?>', '') 
df.essay0= df.essay0.str.replace('\n', '') 
#essay0[1:10000].to_csv('essay0.csv')
#essay0[1:10000]
df.essay0
df_m = df[df.sex=='m']
df_f = df[df.sex=='f']

In [64]:
text_m =  df_m['essay0'].tolist()
text_f =  df_f['essay0'].tolist()

In [65]:
model_name = 'okcupid'
textgen_f = textgenrnn(name=model_name)
textgen_f.reset()

In [83]:
#train_function = textgen.train_from_file if train_cfg['line_delimited'] else textgen.train_from_largetext_file
textgen_f.train_new_model( 
    text_f,
    #context_labels=context_labels, 
    #context=True,
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=500,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    dim_embeddings= 100,
    word_level=model_cfg['word_level'])

Training new model w/ 3-layer, 128-cell LSTMs
Training on 483,254 word sequences.
  ...
    to  
  ['...']
Train for 966 steps
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [89]:
textgen_m = textgenrnn(name=model_name)
textgen_m.train_new_model( 
    text_m,
    #ontext_labels=context_labels, 
    #context=True,
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=500,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    dim_embeddings= 100,
    word_level=model_cfg['word_level'])

Training new model w/ 3-layer, 128-cell LSTMs
Training on 649,374 word sequences.
  ...
    to  
  ['...']
Train for 1298 steps
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# Generate profile intro for different gender

## Male 

In [95]:
textgen_m.generate(3, temperature=0.7)

 33%|████████████████████████████                                                        | 1/3 [00:00<00:01,  1.11it/s]

i'm a smart, financially - stable - athletic type of guy who's his journey as about it has the way.



 67%|████████████████████████████████████████████████████████                            | 2/3 [00:01<00:00,  1.21it/s]

i'm a college professor and working out in san francisco, but sometimes also i enjoy it.



100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.58s/it]

i'm a pretty easy going, laid back guy. i'm a pretty outgoing guy. i'm star wars or attitude. i used to be a spanish - year in japan and i enjoy the world. i moved to san francisco ago from a year ago. i am prettymuch a major of city life to the city. i love this city life, enjoy meeting new people and quality. i'm always looking for new people or gay, hanging out with a bar or drinks food, wine and listening to good wine. i am a calm person, but i also like to keep fit, and am always looking for good people to make new people. i'm a rebel at heart and gets pretty good at times. i have a lot of acquaintances but am i'm very passionate and a career at my own, but i'm not sure to be pretty good at. finally, i know i look out, however, it doesn't really get to know, although all i'm not saying.






## Female

In [96]:
textgen_f.generate(3, temperature=0.7)

 33%|████████████████████████████                                                        | 1/3 [00:02<00:05,  2.59s/it]

i am a lover of mustaches,, take walks of the city, and like a variety of bands in the us. the best is: i tend to enjoy a walk around the closer to the beach. i believe everyone has a purpose. i believe that in relationship and experience is the most is a turn starting to laugh and i would like to be. i'm very well. i am strong, loving, and caring



 67%|████████████████████████████████████████████████████████                            | 2/3 [00:08<00:03,  3.50s/it]

i'm an east coast transplant who's cute, fun, fun lives looking for family andfriends and learning. i like to go to the movies and have been active to get to dinner parties where other people came of the country. favorite food, wine, food, wine, food, wine, and food. i'd like to meet a man who knows how to ground and life. i am very honest, and mostly just comfortable with a good movie and there. my last relationship traveling and being out of the country. especially in urban church. i'm curious about a bit of a project that it's 4 - but i'm pretty sure the most't in this summer. in case, i'm a short person. i'm not in a car and soccer field. in the past i like being outdoors, camping, hiking, biking, swimming, shopping, playingtennis and and seeing movies.



100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:10<00:00,  3.51s/it]

i've lived in the bay area for almost a couple years now. i work as a tech - class; working in a used city, went an avid rock band geek, where's going to change for school. i love to get out more, whether it's all talking to just do where i am from the bay. i will never say where i am doing.




