In [None]:
# %load_ext jupyter_black

# stdlib
import ast
import random
from random import randint

# third party
from faker import Faker
import pandas as pd

# Lift pandas' display limits: rows, columns and cell contents are rendered
# in full instead of being truncated.
for _display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.max_colwidth",
):
    pd.set_option(_display_option, None)

# third party
import numpy as np

In [None]:
# stdlib
# TODO: move this dataset to https://github.com/OpenMined/datasets
# and use a helper util to download and `autocache` it.
# Should we use a huge dataframe for the first example? Could we at least use zip, parquet, or something similar?
import os

if not os.path.exists("ages_dataset.csv"):
    !curl -O https://openminedblob.blob.core.windows.net/csvs/ages_dataset.csv

In [None]:
# Read the raw CSV and keep only the rows that have a value in every column.
data_path = "ages_dataset.csv"
df = pd.read_csv(data_path).dropna(how="any")
print(df.shape)
df.head()

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Summary statistics of the cleaned frame.
df.describe()

In [None]:
# Number of distinct ids and names (compare with the row count printed above).
print(df["Id"].nunique())
print(df["Name"].nunique())

In [None]:
# Frequency of each gender label.
df["Gender"].value_counts()

In [None]:
# Number of distinct occupations and the ten most frequent ones.
print("No. of unique Occupations:", df["Occupation"].nunique())
df["Occupation"].value_counts()[:10]

In [None]:
# Number of distinct country combinations and the ten most frequent ones.
print("No. of unique combinations of Countries:", df["Associated Countries"].nunique())
df["Associated Countries"].value_counts()[:10]

In [None]:
# Convert <string> type list of strings to python <list> type.
# Values that are not strings (i.e. already parsed) are left untouched, so
# this cell can be re-run without `ast.literal_eval` raising on a list.
def _parse_list_literal(value):
    """Return ``value`` parsed by ``ast.literal_eval`` if it is a string, else unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


for _list_column in (
    "Associated Countries",
    "Associated Country Coordinates (Lat/Lon)",
    "Associated Country Life Expectancy",
):
    df[_list_column] = df[_list_column].apply(_parse_list_literal)

In [None]:
# Separate countries from lists and calculate their individual value_counts() which will be
# used by random.choice function later as distributions


def value_counts_of_lists(series_with_lists):
    """Flatten an iterable of lists and count how often each item occurs.

    Args:
        series_with_lists: iterable whose elements are themselves iterables
            (e.g. a Series or array of lists).

    Returns:
        A tuple ``(flat_items, counts)``: ``flat_items`` is a plain list with
        every element of every sub-list in iteration order, and ``counts`` is
        the ``value_counts()`` Series computed over those elements.
    """
    flat_items = []
    for entries in series_with_lists:
        flat_items.extend(entries)

    counts = pd.Series(flat_items).value_counts()
    return flat_items, counts


# Create a dictionary where each unique country from all the lists in
# "Associated Countries" is a key and the life expectancy listed for it in
# "Associated Country Life Expectancy" is the value.

unpacked_cnt_list, cnt_value_counts = value_counts_of_lists(
    df["Associated Countries"].values
)

unpacked_exp_list, exp_value_counts = value_counts_of_lists(
    df["Associated Country Life Expectancy"].values
)

print(len(unpacked_cnt_list))
print(len(unpacked_exp_list))

# The two flattened lists are paired by position below, so they must line up.
assert len(unpacked_cnt_list) == len(
    unpacked_exp_list
), "country and life-expectancy lists have different lengths"

cnt_dict = {}
for country, life_expectancy in zip(unpacked_cnt_list, unpacked_exp_list):
    # Keep the first non-None life expectancy seen for each country.
    if cnt_dict.get(country) is None:
        cnt_dict[country] = life_expectancy

In [None]:
NUM_OF_ROWS = df.shape[0]  # 10000
SEED = 0

# Seed every random source this notebook draws from, not only Faker:
# the sampled columns use NumPy's global RNG and the stdlib `random` module.
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
faker = Faker()

In [None]:
# Replacement table applied to the sampled gender labels: each original label
# is swapped for a neutral "Gender N" placeholder. Compound labels (joined by
# "; ") map to the corresponding compound placeholder.
gender_encode_dict = {
    "Male": "Gender 1",
    "Female": "Gender 2",
    "Transgender Female": "Gender 3",
    "Transgender Male": "Gender 4",
    "Intersex": "Gender 5",
    "Eunuch; Male": "Gender 6; Gender 1",
    "Transgender Female; Female": "Gender 3; Gender 2",
}


def generate_random_choice_columns(
    df, num, country_life_exp=None, gender_encoding=None
):
    """Sample mock columns that follow the empirical distributions of ``df``.

    Args:
        df: source frame with the columns "Gender", "Age of death",
            "Associated Countries" (lists of country names) and
            "Manner of death".
        num: number of mock rows to generate.
        country_life_exp: mapping from country name to life expectancy.
            Defaults to the notebook-level ``cnt_dict``.
        gender_encoding: mapping used to replace the sampled gender labels.
            Defaults to the notebook-level ``gender_encode_dict``.

    Returns:
        A tuple of six Series of length ``num``:
        ``(fake_id, gender, age_of_death, assc_cnt, assc_life_exp,
        manner_of_death)``.
    """
    if country_life_exp is None:
        country_life_exp = cnt_dict
    if gender_encoding is None:
        gender_encoding = gender_encode_dict

    def _sample_like(column):
        """Draw ``num`` values following the empirical distribution of ``column``."""
        # value_counts() supplies both the candidate values (its index) and
        # their relative frequencies, so the two are aligned by construction.
        # Taking candidates from unique() instead would pair each value with
        # the probability of a different value, because unique() is ordered by
        # first appearance while value_counts() is ordered by frequency.
        dist = column.value_counts(normalize=True)
        return np.random.choice(
            dist.index.tolist(), size=num, replace=True, p=dist.values
        )

    # Generate Id: unique integers drawn from a pool slightly larger than
    # `num`, prefixed with "Q".
    id_list = np.arange(1, num + 2000)
    fake_id = np.random.choice(id_list, size=num, replace=False)
    fake_id = pd.Series(fake_id).apply(lambda x: "Q" + str(x))

    # Generate Gender
    gender = pd.Series(_sample_like(df["Gender"])).replace(gender_encoding)

    # Generate Age of death, add noise by adding a random int in [-5, 5]
    # (the upper bound of np.random.randint is exclusive). Ages are clipped at
    # zero so the noise cannot produce a negative age.
    sampled_age = _sample_like(df["Age of death"])
    age_noise = np.random.randint(-5, 6, size=num)
    age_of_death = (
        pd.Series(sampled_age + age_noise).astype("float64").clip(lower=0)
    )

    # Generate Associated Countries. Lists are unhashable, so the distribution
    # is computed on their string form and the samples are parsed back.
    assc_cnt = pd.Series(_sample_like(df["Associated Countries"].astype(str))).apply(
        ast.literal_eval
    )

    # Generate Life Expectancy by looking up every sampled country
    assc_life_exp = assc_cnt.apply(
        lambda countries: [country_life_exp[country] for country in countries]
    )

    # Generate Manner of death
    manner_of_death = pd.Series(_sample_like(df["Manner of death"]))

    return fake_id, gender, age_of_death, assc_cnt, assc_life_exp, manner_of_death


def make_faker_data(num):
    """Build ``num`` fake person records with the notebook-level ``faker``.

    Each record is a dict with the keys "Name", "Short description"
    (a two-sentence paragraph), "Occupation" and "Death year" (a float).

    Returns:
        A list of ``num`` such dicts.
    """
    records = []
    for _ in range(num):
        records.append(
            {
                "Name": faker.name(),
                "Short description": faker.paragraph(nb_sentences=2),
                "Occupation": faker.job(),
                "Death year": float(faker.year()),
            }
        )

    return records

In [None]:
# Columns sampled from the real data's distributions, in the order in which
# generate_random_choice_columns() returns them.
random_choice_column_names = [
    "Id",
    "Gender",
    "Age of death",
    "Associated Countries",
    "Associated Country Life Expectancy",
    "Manner of death",
]

mock_df = pd.DataFrame()
for column_name, column_values in zip(
    random_choice_column_names,
    generate_random_choice_columns(df, num=NUM_OF_ROWS),
):
    mock_df[column_name] = column_values

# Columns produced by Faker are appended after the sampled ones.
fake_data = pd.DataFrame(make_faker_data(num=NUM_OF_ROWS))

for col in fake_data.columns.to_list():
    mock_df[col] = fake_data[col]

# Generate Birth year by subtracting Age of death from Death year
death_year = mock_df["Death year"].astype(int)
age_at_death = mock_df["Age of death"].astype(int)
mock_df["Birth year"] = death_year - age_at_death

print(mock_df.shape)
mock_df.head()

In [None]:
# Columns that are not synthesised get the same placeholder in every row.
for placeholder_column in ("Country", "Associated Country Coordinates (Lat/Lon)"):
    mock_df[placeholder_column] = ["Not Available"] * len(mock_df)

In [None]:
# Shape and first rows of the finished mock frame.
print(mock_df.shape)
mock_df.head()

In [None]:
# Write the mock data to the working directory, without the index column.
mock_df.to_csv("ages_mock_df.csv", index=False)

In [None]:
# Compare real and mock data on the mock frame's columns: real data first ...
cols = mock_df.columns
df[cols].info()

In [None]:
# ... then the mock data.
mock_df[cols].info()

In [None]:
# Summary statistics of the real data ...
df[cols].describe()

In [None]:
# ... and of the mock data.
mock_df[cols].describe()

In [None]:
# Five most frequent manners of death in the real data ...
df["Manner of death"].value_counts()[:5]

In [None]:
# ... and in the mock data.
mock_df["Manner of death"].value_counts()[:5]

In [None]:
# third party
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "gpt2-medium"

# Load the tokenizer and the causal language model for `model_name`; the
# tokenizer's EOS token id is passed as the model's pad token id and the model
# is moved to `torch_device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, pad_token_id=tokenizer.eos_token_id
).to(torch_device)

In [None]:
# Zero-Shot Approach
# Directly asking the model to generate texts in response to a cue.


def generate_reviews(prompt, product, n_texts):
    """Sample ``n_texts`` continuations of ``prompt`` and record them.

    Uses the notebook-level ``tokenizer``, ``model`` and ``torch_device``.
    For every sample, ``product`` is appended to the notebook-level list
    ``product_names`` and the first line of the decoded text (prompt
    included) is appended to ``reviews``.

    Args:
        prompt: text the model is asked to continue.
        product: label stored alongside every generated review.
        n_texts: number of reviews to generate.

    Returns:
        None; results are collected in ``product_names`` and ``reviews``.
    """
    # The prompt is identical for every sample, so tokenize it only once.
    model_inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)

    for _ in range(n_texts):
        outputs = model.generate(
            **model_inputs,
            max_new_tokens=100,
            do_sample=True,
            top_k=10,
            top_p=0.95,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
        )

        review = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Keep only the first line; an empty decoded text yields an empty
        # review instead of an IndexError.
        review_lines = review.splitlines()
        product_names.append(product)
        reviews.append(review_lines[0] if review_lines else "")

In [None]:
products = [
    "64 inch Sony television",
    "Amazon Kindle",
    "Honda SUV car",
    "Dyson vacuum cleaner",
]
# Accumulators that generate_reviews() appends to.
product_names = []
reviews = []

# Only the first product is processed to keep the run short;
# use `for product in products:` to cover the whole list.
for product in products[0:1]:
    # generate positive reviews
    prompt_pos = f"Recently I bought a {product}. I am happy with the purchase because"
    generate_reviews(prompt_pos, product, n_texts=2)

    # generate negative reviews
    prompt_neg = (
        f"Recently I bought a {product}. I am disappointed with the purchase because"
    )
    generate_reviews(prompt_neg, product, n_texts=2)

In [None]:
# Pair every generated review with the product it was generated for.
mock_review_df = pd.DataFrame({"product": product_names, "review": reviews})
print(mock_review_df.shape)
mock_review_df.head()

In [None]:
# Switch to a Spanish GPT-2 checkpoint. This rebinds `tokenizer` and `model`,
# which generate_reviews() reads, so later calls use the Spanish model.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "datificate/gpt2-small-spanish"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# add the EOS token as PAD token to avoid warnings
model = AutoModelForCausalLM.from_pretrained(
    model_name, pad_token_id=tokenizer.eos_token_id
).to(torch_device)

In [None]:
products = [
    "Televisión Sony de 64 pulgadas",
    "Amazon Kindle",
    "auto SUV Honda",
    "aspiradora Dyson",
]
# Fresh accumulators, so the Spanish reviews are collected separately.
product_names = []
reviews = []

# Only the first product is processed to keep the run short;
# use `for product in products:` to cover the whole list.
for product in products[0:1]:
    # generate generic reviews
    prompt = f"Ayer compré un {product}. Fue"
    generate_reviews(prompt, product, n_texts=2)

In [None]:
# Pair every generated review with the product it was generated for.
mock_review_df = pd.DataFrame({"product": product_names, "review": reviews})
print(mock_review_df.shape)
mock_review_df.head()

In [None]:
# skip in CI

In [None]:
# !uv pip install medigan

In [None]:
# stdlib
import glob

# third party
from PIL import Image
import matplotlib.pyplot as plt
from medigan import Generators

In [None]:
# medigan entry point used below to run a generative model by id.
generators = Generators()

In [None]:
# generate 3 samples with model 19 (00019_PGGAN_CHEST_XRAY).
# Also, auto-install required model dependencies.
# NOTE(review): plot_images() below reads the result from "./output" —
# presumably medigan's default output location; confirm.

generators.generate(model_id=19, num_samples=3, install_dependencies=True)

In [None]:
def plot_images(out_dir, num_images=3):
    """Plot images from the most recent generator output.

    Args:
        out_dir: directory searched with the pattern ``<out_dir>/*/*/``.
        num_images: how many images, named ``batch_0_<i>.png`` with ``i``
            starting at 0, are shown side by side. Defaults to 3.

    Raises:
        FileNotFoundError: if no folder matches the search pattern, or one of
            the expected image files is missing.
    """
    # get the most recent output: the matching folders are sorted by path in
    # descending order (presumably the folder names sort chronologically —
    # TODO confirm against medigan's naming scheme)
    img_folders = sorted(glob.glob(f"{out_dir}/*/*/"), reverse=True)
    if not img_folders:
        raise FileNotFoundError(f"no generator output found under {out_dir!r}")
    img_folder = img_folders[0]

    # squeeze=False keeps `axes` two-dimensional even for a single image
    fig, axes = plt.subplots(1, num_images, figsize=(15, 15), squeeze=False)

    for i, ax in enumerate(axes[0]):
        img_path = f"{img_folder}batch_0_{i}.png"
        # the context manager closes the file handle once the image is drawn
        with Image.open(img_path) as img:
            print(img_path)

            # plotting images
            ax.axis("off")
            ax.imshow(img)

In [None]:
# Show the images of the most recent run found under ./output.
plot_images("./output")

In [None]:
!rm -r ./models ./config