# Setup

In [1]:
from textgenrnn import textgenrnn
from datetime import datetime
import os
import pandas as pd

In [2]:
import tensorflow as tf

# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. Every visible GPU is configured, and a machine without a GPU is
# handled gracefully rather than failing on physical_devices[0].
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, enable=True)
    except RuntimeError as err:
        # TensorFlow raises RuntimeError if memory growth is changed after the
        # GPUs have already been initialized (e.g. when this cell is re-run).
        print(f'Could not enable GPU memory growth: {err}')
else:
    print('No GPU detected; training will run on the CPU.')

In [62]:
# Architecture options for the text-generation model.
model_cfg = dict(
    word_level=True,           # True trains on word tokens rather than characters (needs more data and a smaller max_length)
    rnn_size=128,              # LSTM cells per layer (128 or 256 recommended)
    rnn_layers=3,              # number of stacked LSTM layers (at least 2 recommended)
    rnn_bidirectional=False,   # read text both forwards and backwards; can give a training boost
    max_length=30,             # tokens of context used to predict the next one (20-40 for characters, 5-10 for words recommended)
    max_words=10000,           # vocabulary cap; remaining words are ignored (word-level model only)
)

# Options controlling the training run.
train_cfg = dict(
    line_delimited=False,   # True if each text sits on its own line in the source file
    num_epochs=100,         # raise to train the model for longer
    gen_epochs=125,         # sample text is generated after this many epochs
                            # NOTE(review): larger than num_epochs, so presumably no samples are generated during training - confirm this is intended
    train_size=0.8,         # share of the input used for training; values < 1.0 keep the model from learning the data perfectly
    dropout=0.1,            # random proportion of source tokens ignored each epoch, to help the model generalize
    validation=False,       # if train_size < 1.0, evaluate on the holdout set; slows overall training
    is_csv=False,           # True if the file is a CSV exported from Excel/BigQuery/pandas
)

# Data

In [15]:
# Read the full profile export from the working directory, then list its
# column names as the cell output.
raw_df = pd.read_csv(filepath_or_buffer='okcupid_text.csv')
raw_df.columns

Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3',
       ...
       'w199', 'w200', 'sd', 'ave_sentiment', 'Flesch_Kincaid',
       'Gunning_Fog_Index', 'Coleman_Liau', 'SMOG',
       'Automated_Readability_Index', 'Average_Grade_Level'],
      dtype='object', length=556)

In [58]:
# Show the column names at positions 1 through 49; the full listing of
# raw_df.columns is truncated when displayed.
raw_df.columns[1:50]

Index(['body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0', 'essay1',
       'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8',
       'essay9', 'ethnicity', 'height', 'income', 'job', 'last_online',
       'location', 'offspring', 'orientation', 'pets', 'religion', 'sex',
       'sign', 'smokes', 'speaks', 'status', 'uuid', 'age_group', 'body_fat',
       'body_fit', 'body_type_noanswer', 'body_thin', 'vegetarian',
       'halal_kosher', 'alcohol', 'drugs_rec', 'education_rec', 'grad_school',
       'black', 'asian', 'short', 'income_rec', 'haskids', 'wantskids', 'gay'],
      dtype='object')

In [59]:
# Keep only the first essay and the author's sex, drop incomplete rows, and
# draw a reproducible random subset to keep training time manageable.
SAMPLE_SIZE = 10000   # maximum number of profiles to train on
RANDOM_SEED = 42      # fixed so a restart-and-run-all selects the same rows

essays_complete = raw_df[['essay0', 'sex']].dropna()
df = essays_complete.sample(
    n=min(SAMPLE_SIZE, len(essays_complete)),   # never request more rows than exist
    random_state=RANDOM_SEED,
)
df.describe()

Unnamed: 0,essay0,sex
count,10000,10000
unique,9995,2
top,hi!,m
freq,3,5946


## Run Text Generation Model

In [61]:
# Clean the essay text before training:
#   1. strip HTML tags - regex=True is passed explicitly because newer pandas
#      treats the pattern as a literal string by default, which would leave
#      every tag in place;
#   2. turn newlines into spaces;
#   3. collapse runs of whitespace and trim the ends.
# Tags and newlines become spaces (not empty strings) so the words on either
# side are not fused into a single token for the word-level model.
essay_clean = (
    df['essay0']
    .str.replace(r'<[^<]+?>', ' ', regex=True)
    .str.replace('\n', ' ', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
df = df.assign(essay0=essay_clean)

# Split by the author's sex so that one model can be trained per group.
df_m = df[df.sex == 'm']
df_f = df[df.sex == 'f']

In [64]:
# Plain Python lists of essay strings, one per group, used as training input.
text_m, text_f = (frame['essay0'].tolist() for frame in (df_m, df_f))

In [65]:
# Base name shared by the models in this notebook.
model_name = 'okcupid'

# Model for the female-authored essays. It gets its own name suffix because
# textgenrnn appears to derive the saved weights/vocab/config filenames from
# the model name, so two models with the same name would overwrite each
# other's files - TODO confirm against the textgenrnn source.
textgen_f = textgenrnn(name=f'{model_name}_f')
textgen_f.reset()

In [None]:
# Train a new model from scratch on the female-authored essays.
# Run options come from train_cfg and architecture options from model_cfg.
textgen_f.train_new_model(
    text_f,
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=500,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    # max_words was defined in model_cfg but never forwarded, so editing it
    # there had no effect on training.
    max_words=model_cfg['max_words'],
    dim_embeddings=100,
    word_level=model_cfg['word_level'])

Training new model w/ 3-layer, 128-cell LSTMs
Training on 483,992 word sequences.
  ...
    to  
  ['...']
Train for 967 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

In [None]:
# Model for the male-authored essays. It gets its own name suffix because
# textgenrnn appears to derive the saved weights/vocab/config filenames from
# the model name, so reusing the plain model_name would overwrite the files of
# any other model created with it - TODO confirm against the textgenrnn source.
textgen_m = textgenrnn(name=f'{model_name}_m')

# Train a new model from scratch on the male-authored essays.
# Run options come from train_cfg and architecture options from model_cfg.
textgen_m.train_new_model(
    text_m,
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=500,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    # max_words was defined in model_cfg but never forwarded, so editing it
    # there had no effect on training.
    max_words=model_cfg['max_words'],
    dim_embeddings=100,
    word_level=model_cfg['word_level'])

In [57]:
# Generate one sample from the model trained on the male-authored essays,
# using a sampling temperature of 0.3.
textgen_m.generate(1, temperature=0.3)

i'm a nice guy who is looking for a good friend. i'm a very nice guy. i love to laugh and have a good time. i'm also a total homebody, but i'm not into the bar scene. i'm a bit of a nerd. i'm a bit of a dork, but i'm not a snob. i'm a bit of a dork. i'm a bit of a romantic. i'm a bit of a romantic. i'm not a jock, but i love to cook, i'm not a fan of jeans and a. i'm a bit of a homebody, but i'm also a bit of a homebody, but i'm not afraid to go out with friends, but i'm also a bit of a nerd. i'm a bit of a nerd, but i'm also very sarcastic. i'm a bit of a romantic. i'm a bit of a freak, and i'm not a snob. i'm a bit of a romantic, but i'm not looking for a hook - up. i'm a very nice guy, but i'm not really interested in meeting people. i'm a bit of a dork, but i'm not a. i'm a bit of a nerd, but i'm not a. i'm not a fan of a night, but i'm



In [None]:
# Generate one sample from the model trained on the female-authored essays,
# using a sampling temperature of 0.3.
textgen_f.generate(1, temperature=0.3)