In [None]:
#Question: If you have data from the UK Biobank on 10,000 South Asian individuals,
# how would you go about isolating nutritional phenotype information and performing further analysis?

#Answer:
#The broad steps to isolate nutritional phenotype data from the UK Biobank are as follows:

#1. Identify the variables in the UK Biobank dataset that relate to nutrition, such as dietary intake, nutrient levels, or anthropometric measures related to nutrition.

#2. Filter the dataset to only include data for South Asian individuals.

#3. Explore the dataset to understand the distribution of variables and check for missing values.

#4. Perform statistical analyses to investigate relationships between the nutritional variables and other variables of interest, such as demographic information, health outcomes, or genetic variants.

#5. Use data visualization tools to communicate findings and insights from the analysis.

In [None]:
#To filter the dataset in the UK Biobank to only include data for South Asian individuals, 
#you can use either self-reported ethnicity or genetic ancestry data. 

In [None]:
#Here are the steps for filtering by self-reported ethnicity.
#First, we create a simulated dataset.

In [1]:
#import libraries
import numpy as np
import pandas as pd

In [2]:
# Total number of simulated participants (one row each) to generate
num_individuals = 10_800

In [29]:
# Simulate one value per individual for every column of the toy dataset.
# Seeding the global NumPy RNG at the top of the cell makes the draws
# repeatable: re-running this cell (or Restart & Run All) yields the same data.
SEED = 42
np.random.seed(SEED)

# Generate the ID variable (1 .. num_individuals)
ids = np.arange(1, num_individuals + 1)

# Generate the Age variable (normal, mean=50, std=10); astype(int) truncates the fraction
ages = np.random.normal(loc=50, scale=10, size=num_individuals).astype(int)

# Generate the Sex variable (M or F, equally likely)
sexes = np.random.choice(['M', 'F'], size=num_individuals)

# Generate the Height variable (normal, mean=170, std=10), rounded to 1 decimal
heights = np.random.normal(loc=170, scale=10, size=num_individuals).round(1)

# Generate the Weight variable (normal, mean=70, std=10), rounded to 1 decimal
weights = np.random.normal(loc=70, scale=10, size=num_individuals).round(1)

# Generate the ethnicity variable (each label equally likely)
ethnicity = np.random.choice(["Indian", "Pakistani", "Bangladeshi", "Asian", "Chinese", "white_british", "white_irish"], size=num_individuals)

# Generate the Systolic Blood Pressure variable (normal, mean=120, std=10)
sbp = np.random.normal(loc=120, scale=10, size=num_individuals).astype(int)

# Generate the Diastolic Blood Pressure variable (normal, mean=80, std=10)
dbp = np.random.normal(loc=80, scale=10, size=num_individuals).astype(int)

# The intake and activity variables below cannot be negative in reality, but a
# normal draw can be (e.g. mean=1000/std=500 puts roughly 2% of values below 0),
# so each is floored at 0 with clip(min=0) before the integer cast / rounding.

# Generate calorie_intake variable (normal, mean=1000, std=500, floored at 0)
calorie = np.random.normal(loc=1000, scale=500, size=num_individuals).clip(min=0).astype(int)

# Generate protein_intake variable (normal, mean=60, std=20, floored at 0)
protein = np.random.normal(loc=60, scale=20, size=num_individuals).clip(min=0).astype(int)

# Generate carb_intake variable (normal, mean=200, std=70, floored at 0)
carb = np.random.normal(loc=200, scale=70, size=num_individuals).clip(min=0).astype(int)

# Generate fat_intake variable (normal, mean=80, std=25, floored at 0)
fat = np.random.normal(loc=80, scale=25, size=num_individuals).clip(min=0).astype(int)

# Generate the Smoking Status variable (Current Smoker, Former Smoker, or Non-Smoker)
smoking_status = np.random.choice(['Current Smoker', 'Former Smoker', 'Non-Smoker'], size=num_individuals)

# Generate the Alcohol Intake variable (normal, mean=20, std=10, floored at 0)
alcohol_intake = np.random.normal(loc=20, scale=10, size=num_individuals).clip(min=0).astype(int)

# Generate the Physical Activity variable (normal, mean=3, std=2, floored at 0), 1 decimal
physical_activity = np.random.normal(loc=3, scale=2, size=num_individuals).clip(min=0).round(1)

# Generate the Diabetes Status variable (Yes or No, equally likely)
diabetes_status = np.random.choice(['Yes', 'No'], size=num_individuals)

# Generate the Cancer Status variable (Yes or No, equally likely)
cancer_status = np.random.choice(['Yes', 'No'], size=num_individuals)


In [30]:
# Pair each output column header with its simulated array, in table order
data_dict = dict([
    ('ID', ids),
    ('Age', ages),
    ('Sex', sexes),
    ('Height_cm', heights),
    ('Weight_kg', weights),
    ('Ethnicity', ethnicity),
    ('Systolic_BP_mmHg', sbp),
    ('Diastolic_BP_mmHg', dbp),
    ('Calorie_intake', calorie),
    ('Protein_intake', protein),
    ('Carb_intake', carb),
    ('Fat_intake', fat),
    ('Smoking_Status', smoking_status),
    ('Alcohol_Intake_g/day', alcohol_intake),
    ('Physical_Activity_hours/week', physical_activity),
    ('Diabetes_Status', diabetes_status),
    ('Cancer_Status', cancer_status),
])

In [31]:
# Build the tabular dataset: one column per dictionary key, one row per individual
df = pd.DataFrame(data=data_dict)

In [32]:
# Persist the table as tab-separated text; the row index is not written out
df.to_csv(
    'simulated_uk_biobank_dataset.txt',
    index=False,
    sep='\t',
)

In [33]:
# Print a summary of df: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10800 entries, 0 to 10799
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ID                            10800 non-null  int32  
 1   Age                           10800 non-null  int32  
 2   Sex                           10800 non-null  object 
 3   Height_cm                     10800 non-null  float64
 4   Weight_kg                     10800 non-null  float64
 5   Ethnicity                     10800 non-null  object 
 6   Systolic_BP_mmHg              10800 non-null  int32  
 7   Diastolic_BP_mmHg             10800 non-null  int32  
 8   Calorie_intake                10800 non-null  int32  
 9   Protein_intake                10800 non-null  int32  
 10  Carb_intake                   10800 non-null  int32  
 11  Fat_intake                    10800 non-null  int32  
 12  Smoking_Status                10800 non-null  object 
 13  A

In [34]:
# Second, filter the dataset to individuals whose self-reported "Ethnicity" is one of
# the South Asian labels used in this simulation (Indian, Pakistani, Bangladeshi, or
# the generic "Asian" label) -- not only the literal value "Asian".
south_asian_labels = ["Indian", "Pakistani", "Bangladeshi", "Asian"]

# isin() builds the same row mask as OR-ing four equality tests.
# .copy() makes the subset an independent frame, so assigning new columns to it later
# does not emit SettingWithCopyWarning.
asian_df = df.loc[df["Ethnicity"].isin(south_asian_labels)].copy()

In [35]:
# Count the rows under each Ethnicity label kept by the filter (sanity check)
asian_df['Ethnicity'].value_counts()

Bangladeshi    1608
Asian          1573
Indian         1569
Pakistani      1523
Name: Ethnicity, dtype: int64