<a href="https://www.kaggle.com/code/tyjensen/fe-data-df2?scriptVersionId=130534340" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file under the Kaggle input directory so the
# available dataset files are visible in the cell output.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

- What are the current ferritin levels within our donor population?
- Are donor ferritin levels adversely affected by frequent donations?


## Dataframe excluding donors labelled as Therapeutic, AFN, MED and TTD

#### LOAD DATA

In [None]:
# Load the cleaned donation records. The first CSV column becomes the index,
# 'date_bled' is parsed as a day-first date, and 'race' is kept as text so its
# codes are not converted to numbers.
df2 = pd.read_csv(
    '../input/data-excl-therap-donors/clean_df2',
    index_col=[0],
    parse_dates=['date_bled'],
    dayfirst=True,
    dtype={'race': 'object'},
)
df2.info()

In [None]:
# Preview the first rows to sanity-check the parsed columns.
df2.head()

In [None]:
# Summary statistics for the ferritin ratio across all donation records.
df2['ferritin_ratio'].describe()

### What is the ferritin ratio among donors?

In [None]:
# Histogram of the ferritin ratio over all donation records, with the overall
# median and mean drawn as vertical reference lines.
fig, ax = plt.subplots(figsize=(20, 6))

sns.histplot(data=df2, x='ferritin_ratio', bins=100, ax=ax)

# The x-axis is clipped to 0-1000; ticks are generated every 25 units.
ax.set(
    xlabel='Ferritin Ratio',
    xticks=range(0, 2100, 25),
    xlim=(0, 1000),
    title='FERRITIN RATIO DISTRIBUTION IN DONOR POPULATION',
)

ferritin = df2['ferritin_ratio']
ax.axvline(x=ferritin.median(), color='m', label='Median', linestyle='--', linewidth=2)
ax.axvline(x=ferritin.mean(), color='b', label='Mean', linestyle='-', linewidth=2)

ax.legend()

plt.show()

In [None]:
# Summary statistics for the ferritin ratio, shown next to the histogram above.
df2['ferritin_ratio'].describe()

### What is the ferritin ratio in different donor demographics?

- 1 and 2: white male and female
- 3 and 4: coloured male and female
- 5 and 6: indian male and female
- 7 and 8: black male and female

In [None]:
# Ferritin ratio per race/gender code (bar height is seaborn's default
# estimator), with the overall median and mean as horizontal reference lines.
fig, ax = plt.subplots(figsize=(20, 6))

# Race codes are stored as text ('1.0' ... '8.0'); list them in numeric order.
race_order = [f'{code}.0' for code in range(1, 9)]
sns.barplot(x='race', y='ferritin_ratio', data=df2, order=race_order, ax=ax)

ferritin = df2['ferritin_ratio']
ax.axhline(y=ferritin.median(), color='m', label='Median', linestyle='--', linewidth=2)
ax.axhline(y=ferritin.mean(), color='b', label='Mean', linestyle='-', linewidth=2)

ax.set(
    xlabel='Race Categories',
    ylabel='Average Ferritin Ratio',
    title='AVERAGE FERRITIN RATIO VS GENDER / RACE',
)

ax.legend()

plt.show()

In [None]:
# Number of donation records per race/gender code.
df2['race'].value_counts()

### How many donors donated more than once?

In [None]:
# Number of donation records per donor. The Series is named explicitly after
# the column it will become, so the result does not depend on how
# value_counts() names its output (that changed between pandas versions).
donor_code_count = df2['donor_code'].value_counts().rename('donor_code_count')

# Attach each donor's count to every one of their rows. Any existing
# 'donor_code_count' column is dropped first so the cell can be re-run
# without raising or duplicating the column.
df2 = (
    df2.drop(columns='donor_code_count', errors='ignore')
    .join(donor_code_count, on='donor_code', how='left')
)

In [None]:
# Count of rows at each per-donor donation count.
# NOTE(review): this counts rows of df2, so a donor with k donations
# contributes k rows to the bar for k — confirm that is the intended reading
# of the 'Number of Donors' axis.
fig, ax = plt.subplots(figsize=(20, 6))

sns.countplot(x='donor_code_count', data=df2, ax=ax)
ax.set(
    xlabel='Number of donations in study period (7 months)',
    ylabel='Number of Donors',
    title="Donation Frequency",
)

plt.show()

In [None]:
# Tabulate how many rows fall at each per-donor donation count.
df2['donor_code_count'].value_counts()
# Alternative view as proportions instead of raw counts:
# df2['donor_code_count'].value_counts(normalize=True)

### What does our donor population look like in the ferritin categories?

- very low: <= 5.9 ug/L
- low: >6 and <=15.9 ug/L
- normal: >= 16 and <=500 ug/L
- high: >=501 and <=1000 ug/L
- very high: >1000 ug/L

In [None]:
def to_cat(x):
    """Map a ferritin value to its category label.

    The bands are contiguous, so every number falls into exactly one category:

    - below 6            -> 'very low'
    - 6 up to (not incl.) 16 -> 'low'
    - 16 up to 500       -> 'normal'
    - above 500 up to 1000 -> 'high'
    - above 1000         -> 'very high'

    Values between the published band edges (e.g. 5.95, 6.0, 15.95, 500.5)
    are assigned to the neighbouring band rather than falling through to
    'very high'.

    Parameters
    ----------
    x : float
        Ferritin value to classify.

    Returns
    -------
    str or float
        The category label, or NaN when ``x`` is missing, so missing
        measurements are not counted as 'very high'.
    """
    if pd.isna(x):
        return np.nan
    if x < 6:
        return 'very low'
    if x < 16:
        return 'low'
    if x <= 500:
        return 'normal'
    if x <= 1000:
        return 'high'
    return 'very high'

# Label every row with the category that to_cat assigns to its ferritin ratio.
df2['ferritin_cat'] = df2['ferritin_ratio'].map(to_cat)

In [None]:
# Number of rows in each ferritin category, ordered from lowest to highest band.
fig, ax = plt.subplots(figsize=(20, 6))

category_order = ['very low', 'low', 'normal', 'high', 'very high']
sns.countplot(data=df2, x='ferritin_cat', order=category_order, ax=ax)
ax.set(xlabel=None, ylabel='Count', title='FERRITIN CATEGORIES')

plt.show()

In [None]:
# Number of rows in each ferritin category.
df2['ferritin_cat'].value_counts()

### AVERAGE FERRITIN RATIO VS DONATION FREQUENCY WITHIN STUDY PERIOD

In [None]:
# Ferritin ratio grouped by how many times the donor donated in the study
# period, with the overall median and mean as horizontal reference lines.
fig, ax = plt.subplots(figsize=(10, 6))

sns.barplot(x='donor_code_count', y='ferritin_ratio', data=df2, ax=ax)

ferritin = df2['ferritin_ratio']
ax.axhline(y=ferritin.median(), color='m', label='Median', linestyle='--', linewidth=2)
ax.axhline(y=ferritin.mean(), color='b', label='Mean', linestyle='-', linewidth=2)

ax.set(
    xlabel='number of donations in study period(7 months)',
    ylabel='average ferritin ratio',
    title='AVERAGE FERRITIN RATIO VS DONATION FREQUENCY WITHIN STUDY PERIOD',
)

ax.legend()

plt.show()

### What do our donors look like in the low and very low ferritin categories?

In [None]:
# Split the records into three subsets by ferritin category.
ferritin_cat = df2['ferritin_cat']
low_df = df2[ferritin_cat.isin(['very low', 'low'])]
normal_df = df2[ferritin_cat == 'normal']
high_df = df2[ferritin_cat.isin(['very high', 'high'])]

In [None]:
# Histogram of the ferritin ratio within the low / very low subset, with that
# subset's own median and mean as vertical reference lines.
fig, ax = plt.subplots(figsize=(20, 6))

sns.histplot(data=low_df, x='ferritin_ratio', bins=80, ax=ax)

ax.set(xlabel='Ferritin Ratio', title='FERRITIN RATIO IN "LOW / VERY LOW" DONORS')

low_ferritin = low_df['ferritin_ratio']
ax.axvline(x=low_ferritin.median(), color='m', label='Median', linestyle='--', linewidth=2)
ax.axvline(x=low_ferritin.mean(), color='b', label='Mean', linestyle='-', linewidth=2)

ax.legend()

plt.show()

In [None]:
# Summary statistics for the ferritin ratio within the low / very low subset.
low_df['ferritin_ratio'].describe()

In [None]:
# Number of low / very low rows per race/gender code.
fig, ax = plt.subplots(figsize=(20, 6))

# Race codes are stored as text ('1.0' ... '8.0'); list them in numeric order.
race_order = [f'{code}.0' for code in range(1, 9)]
sns.countplot(x='race', data=low_df, order=race_order, ax=ax)

ax.set(
    xlabel=None,
    ylabel='Number of Donors',
    title='GENDER AND RACE IN "LOW / VERY LOW" DONORS',
)

plt.show()

In [None]:
# Number of low / very low rows per race/gender code.
low_df['race'].value_counts()

In [None]:
# Ferritin ratio by donor age within the low / very low subset.
fig, ax = plt.subplots(figsize=(20, 6))

sns.barplot(x='age', y='ferritin_ratio', data=low_df, ax=ax)

# One bar per distinct age: rotate the labels so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=90)

ax.set(
    xlabel='Age',
    ylabel='Ferritin Ratio',
    title='FERRITIN RATIO IN LOW AND VERY LOW CATEGORIES BY AGE',
)

plt.show()

In [None]:
# Number of low / very low rows at each donor age.
fig, ax = plt.subplots(figsize=(20, 5))

sns.countplot(x='age', data=low_df, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)

ax.set(
    xlabel='Age',
    ylabel='Number of Donors',
    title='AGE IN "LOW / VERY LOW" DONORS',
)

plt.show()

In [None]:
# Number of normal-category rows at each donor age.
fig, ax = plt.subplots(figsize=(20, 5))

sns.countplot(x='age', data=normal_df, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)

ax.set(
    xlabel='Age',
    ylabel='Number of Donors',
    title='NORMAL CATEGORY BY AGE',
)

plt.show()

In [None]:
# Number of high / very high rows at each donor age.
fig, ax = plt.subplots(figsize=(20, 5))

sns.countplot(x='age', data=high_df, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)

ax.set(
    xlabel='Age',
    ylabel='Number of Donors',
    title='HIGH / VERY HIGH CATEGORIES BY AGE',
)

plt.show()

In [None]:
# Ferritin ratio by donation count within the low / very low subset, with that
# subset's own median and mean as horizontal reference lines.
fig, ax = plt.subplots(figsize=(10, 6))

sns.barplot(x='donor_code_count', y='ferritin_ratio', data=low_df, ax=ax)

low_ferritin = low_df['ferritin_ratio']
ax.axhline(y=low_ferritin.median(), color='r', label='Low Median', linestyle='--', linewidth=2)
ax.axhline(y=low_ferritin.mean(), color='g', label='Low Mean', linestyle='-', linewidth=2)

ax.set(
    xlabel='number of donations in study period(7 months)',
    ylabel='average ferritin ratio',
    title='AVERAGE FERRITIN RATIO OF LOW AND VERY LOW CATEGORIES VS DONATION FREQUENCY WITHIN STUDY PERIOD',
)

ax.legend()

plt.show()

SUMMARY



EXTRAS

In [None]:
# Ferritin ratio against donation date for all records.
fig, ax = plt.subplots(figsize=(25, 6))

sns.lineplot(data=df2, x='date_bled', y='ferritin_ratio', ax=ax)

In [None]:
# Ferritin ratio against donation date for the low / very low subset.
fig, ax = plt.subplots(figsize=(25, 6))

sns.lineplot(data=low_df, x='date_bled', y='ferritin_ratio', ax=ax)

### How does the ferritin ratio change between donations?

In [None]:
# sort dataframe
# Order each donor's records chronologically so that consecutive rows of the
# same donor are consecutive donations.
df_Y = df2.sort_values(by=['donor_code', 'date_bled'])

# Days elapsed since the same donor's previous donation. A donor's first
# record has no predecessor, so its gap is recorded as 0. The explicit integer
# cast replaces the deprecated `downcast` argument of fillna.
date_diff = (
    df_Y.groupby('donor_code')['date_bled']
    .diff()
    .dt.days
    .fillna(0)
    .astype('int64')
)

# Store the gap alongside each record.
df_Y['date_diff'] = date_diff

In [None]:
# Select donors with exactly two donation records in the study period.
df4 = df_Y[df_Y['donor_code_count'] == 2]

# Mean ferritin ratio per donor per donation date (records on the same date
# collapse into a single visit). The result is sorted by donor, then date.
df5 = df4.groupby(['donor_code', 'date_bled'])[['ferritin_ratio']].mean()

# Wide table: one row per donor, one column per visit (0 = earliest). A single
# pivot replaces the per-row Python loop; a donor with fewer visits than
# expected gets NaN instead of making the construction fail.
visits = df5.reset_index()
visits['visit'] = visits.groupby('donor_code').cumcount()
ferritin_wide = (
    visits.pivot(index='donor_code', columns='visit', values='ferritin_ratio')
    .reindex(columns=range(2))
    .rename_axis(columns=None)
)

df_calc_2 = (
    ferritin_wide
    .rename(columns={0: '1_donation', 1: '2_donation'})
    .reset_index()
)

# Change in ferritin between consecutive visits (later minus earlier).
# The first column of the row-wise diff has no predecessor and is dropped.
ferritin_change = (
    ferritin_wide.diff(axis=1)
    .iloc[:, 1:]
    .rename(columns={1: 'ferr_diff_1_&_2'})
)

# Days between consecutive donations, taken from the rows that have a
# predecessor (date_diff > 0) and numbered in order within each donor.
repeat_rows = df4[df4['date_diff'] > 0]
interval_wide = (
    repeat_rows
    .assign(gap=repeat_rows.groupby('donor_code').cumcount())
    .pivot(index='donor_code', columns='gap', values='date_diff')
    .reindex(columns=range(1))
    .rename(columns={0: 'date_diff_1_&_2'})
    .rename_axis(columns=None)
)

# Join on donor_code so every interval lands on its own donor's row instead
# of relying on two independently built sequences having the same order.
df_calc_diff_2 = ferritin_change.join(interval_wide).reset_index()

In [None]:
# Select donors with exactly three donation records in the study period.
df4 = df_Y[df_Y['donor_code_count'] == 3]

# Mean ferritin ratio per donor per donation date (records on the same date
# collapse into a single visit). The result is sorted by donor, then date.
df5 = df4.groupby(['donor_code', 'date_bled'])[['ferritin_ratio']].mean()

# Wide table: one row per donor, one column per visit (0 = earliest). A single
# pivot replaces the per-row Python loop; a donor with fewer visits than
# expected gets NaN instead of making the construction fail.
visits = df5.reset_index()
visits['visit'] = visits.groupby('donor_code').cumcount()
ferritin_wide = (
    visits.pivot(index='donor_code', columns='visit', values='ferritin_ratio')
    .reindex(columns=range(3))
    .rename_axis(columns=None)
)

df_calc_3 = (
    ferritin_wide
    .rename(columns={0: '1_donation', 1: '2_donation', 2: '3_donation'})
    .reset_index()
)

# Change in ferritin between consecutive visits (later minus earlier).
# The first column of the row-wise diff has no predecessor and is dropped.
ferritin_change = (
    ferritin_wide.diff(axis=1)
    .iloc[:, 1:]
    .rename(columns={1: 'ferr_diff_1_&_2', 2: 'ferr_diff_2_&_3'})
)

# Days between consecutive donations, taken from the rows that have a
# predecessor (date_diff > 0) and numbered in order within each donor.
repeat_rows = df4[df4['date_diff'] > 0]
interval_wide = (
    repeat_rows
    .assign(gap=repeat_rows.groupby('donor_code').cumcount())
    .pivot(index='donor_code', columns='gap', values='date_diff')
    .reindex(columns=range(2))
    .rename(columns={0: 'date_diff_1_&_2', 1: 'date_diff_2_&_3'})
    .rename_axis(columns=None)
)

# Join on donor_code so every interval lands on its own donor's row instead
# of relying on independently built lists having the same order and length.
df_calc_diff_3 = ferritin_change.join(interval_wide).reset_index()

In [None]:
# Select donors with exactly four donation records in the study period.
df4 = df_Y[df_Y['donor_code_count'] == 4]

# Mean ferritin ratio per donor per donation date (records on the same date
# collapse into a single visit). The result is sorted by donor, then date.
df5 = df4.groupby(['donor_code', 'date_bled'])[['ferritin_ratio']].mean()

# Wide table: one row per donor, one column per visit (0 = earliest). A single
# pivot replaces the per-row Python loop; a donor with fewer visits than
# expected gets NaN instead of making the construction fail.
visits = df5.reset_index()
visits['visit'] = visits.groupby('donor_code').cumcount()
ferritin_wide = (
    visits.pivot(index='donor_code', columns='visit', values='ferritin_ratio')
    .reindex(columns=range(4))
    .rename_axis(columns=None)
)

df_calc_4 = (
    ferritin_wide
    .rename(columns={0: '1_donation', 1: '2_donation', 2: '3_donation', 3: '4_donation'})
    .reset_index()
)

# Change in ferritin between consecutive visits (later minus earlier).
# The first column of the row-wise diff has no predecessor and is dropped.
ferritin_change = (
    ferritin_wide.diff(axis=1)
    .iloc[:, 1:]
    .rename(columns={1: 'ferr_diff_1_&_2', 2: 'ferr_diff_2_&_3', 3: 'ferr_diff_3_&_4'})
)

# Days between consecutive donations, taken from the rows that have a
# predecessor (date_diff > 0) and numbered in order within each donor.
repeat_rows = df4[df4['date_diff'] > 0]
interval_wide = (
    repeat_rows
    .assign(gap=repeat_rows.groupby('donor_code').cumcount())
    .pivot(index='donor_code', columns='gap', values='date_diff')
    .reindex(columns=range(3))
    .rename(columns={0: 'date_diff_1_&_2', 1: 'date_diff_2_&_3', 2: 'date_diff_3_&_4'})
    .rename_axis(columns=None)
)

# Join on donor_code so every interval lands on its own donor's row instead
# of relying on independently built lists having the same order and length.
df_calc_diff_4 = ferritin_change.join(interval_wide).reset_index()

### Vertical Stack

In [None]:
# Stack the DataFrames on top of each other
# Stack the per-cohort difference tables into one frame. Columns a cohort does
# not have are filled with NaN. ignore_index gives the result a fresh unique
# row index instead of repeating 0..n-1 once per cohort.
ferr_date_diff_stack = pd.concat(
    [df_calc_diff_2, df_calc_diff_3, df_calc_diff_4],
    axis=0,
    ignore_index=True,
)

In [None]:
# Check column dtypes and non-null counts of the stacked difference table.
ferr_date_diff_stack.info()

In [None]:
# Change in ferritin ratio between the 1st and 2nd donation, plotted against
# the number of days between those donations.
f, ax = plt.subplots(figsize=(25, 5))
sns.stripplot(data=ferr_date_diff_stack, x='date_diff_1_&_2', y='ferr_diff_1_&_2', ax=ax)

plt.setp(ax.get_xticklabels(), rotation=90)
ax.set(
    xlabel='Days',
    ylabel='Difference in ferritin ratio',
    title='DIFFERENCE IN FERRITIN RATIO (1ST AND 2ND DONATIONS) WRT DONATION INTERVAL',
)

plt.show()

In [None]:
# Change in ferritin ratio between the 2nd and 3rd donation, plotted against
# the number of days between those donations.
f, ax = plt.subplots(figsize=(25, 5))
sns.stripplot(data=ferr_date_diff_stack, x='date_diff_2_&_3', y='ferr_diff_2_&_3', ax=ax)

plt.setp(ax.get_xticklabels(), rotation=90)
ax.set(
    xlabel='Days',
    ylabel='Difference in ferritin ratio',
    title='DIFFERENCE IN FERRITIN RATIO (2ND AND 3RD DONATIONS) WRT DONATION INTERVAL',
)

plt.show()

In [None]:
# Change in ferritin ratio between the 3rd and 4th donation, plotted against
# the number of days between those donations.
f, ax = plt.subplots(figsize=(25, 5))
sns.stripplot(data=ferr_date_diff_stack, x='date_diff_3_&_4', y='ferr_diff_3_&_4', ax=ax)

plt.setp(ax.get_xticklabels(), rotation=90)
ax.set(
    xlabel='Days',
    ylabel='Difference in ferritin ratio',
    title='DIFFERENCE IN FERRITIN RATIO (3RD AND 4TH DONATIONS) WRT DONATION INTERVAL',
)

plt.show()

In [None]:
# Stack the per-cohort ferritin tables into one frame. Donation columns a
# cohort does not have are filled with NaN. ignore_index gives the result a
# fresh unique row index instead of repeating 0..n-1 once per cohort.
vertical_stack_2 = pd.concat(
    [df_calc_2, df_calc_3, df_calc_4],
    axis=0,
    ignore_index=True,
)

In [None]:
# Check column dtypes and non-null counts of the stacked ferritin table.
vertical_stack_2.info()

In [None]:
# Ferritin ratio at each donation number (wide-form: one point per column),
# with the overall median and mean as horizontal reference lines.
f, ax = plt.subplots(figsize=(20, 5))

# Plot only the donation columns. donor_code is an identifier, and leaving it
# in wide-form data would plot it as a measurement whenever it is numeric.
sns.pointplot(data=vertical_stack_2.drop(columns='donor_code'), ax=ax)

ax.axhline(y=df2['ferritin_ratio'].median(), color='m', label='Median', linestyle='--', linewidth=2)
ax.axhline(y=df2['ferritin_ratio'].mean(), color='b', label='Mean', linestyle='-', linewidth=2)

ax.set_ylabel('Ferritin Ratio')
ax.set_title('DIFFERENCE IN FERRITIN RATIOs BETWEEN DONATIONS')

# Without a legend the Median / Mean labels on the reference lines are never shown.
ax.legend()

plt.show()