In [None]:
from covidcomp.data import RawRepresentation, DerivedRepresentation
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Create raw data representation from `./covid.csv` and auxiliary datasets

In [None]:
# Build the raw representation that the later cells query for the flat and
# partitioned views of the data.
COVID_CSV_PATH = "./covid.csv"
raw = RawRepresentation(COVID_CSV_PATH)

# Flat comparison
## Get flat data representation

In [None]:
# Ask the raw representation for its flat view and split the result into the
# input part and the target part.
flat_representation = raw.get_flat_representation()
flat_input, flat_target = flat_representation

# Bare last expression: the notebook renders the flat input under this cell.
flat_input

## Get derived (paired) flat representation

In [None]:
# Build the derived (paired) representation from the flat input/target and
# expose its two parts under their own names for the cells that follow.
flat_derived = DerivedRepresentation(flat_input, flat_target)
flat_derived_input, flat_derived_target = flat_derived.input, flat_derived.target

# Bare last expression: the notebook renders the derived input under this cell.
flat_derived_input

## Fit a model on the flat representation and test it

In [None]:
# Fit a logistic-regression classifier on a fixed-size random sample of the
# derived flat pairs and report its accuracy on a held-out split.
sample_size = 300

# Count pairs as rows of the derived input: the sample below draws
# `sample_size` rows and calls them pairs, so the row count is the comparable
# total. (`.size` on a DataFrame would report rows * columns instead.)
# The previous code read `derived.input`, a name that is never defined in this
# notebook and therefore failed on a fresh kernel.
print(f"Number of pairs in flat: {len(flat_derived_input)}")
print(f"Number of pairs sampled: {sample_size}")

# Input and target are sampled separately with the same `random_state`,
# presumably so that the same rows are drawn from both.
# NOTE(review): this relies on input and target having the same number of rows
# in the same order - confirm against DerivedRepresentation.
derived_input = flat_derived_input.sample(n=sample_size, random_state=5).to_numpy()
derived_target = flat_derived_target.sample(n=sample_size, random_state=5).to_numpy()

# Hold out 33% of the sampled pairs for scoring; seeds are fixed so the split
# and the fit are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    derived_input, derived_target, test_size=0.33, random_state=42
)
Model = LogisticRegression(max_iter=5000, random_state=1)
# ravel() flattens the target array to 1-D before fitting and scoring.
Model.fit(X_train, y_train.ravel())
print(f'Flat Accuracy - : {Model.score(X_test, y_test.ravel())}')

# Partition by Continent

## Generate raw and partitioned representations by continent. For each continent, get the derived representation and conduct the experiment.

In [None]:
# For every continent partition: build the derived representation, fit a
# logistic-regression classifier on a train split, score it on the held-out
# split, and finally report the unweighted mean accuracy over all continents.
partitioned_by_continent_dict = raw.get_partitioned_representation("continent")

accuracy_sum = 0

for continent in partitioned_by_continent_dict:
    raw_input, raw_target = partitioned_by_continent_dict[continent]
    derived_continent = DerivedRepresentation(raw_input, raw_target)

    # Report the number of rows of the derived input, matching how the flat
    # experiment counts pairs. The previous code printed `raw_input.size`,
    # i.e. the element count of the raw input before pairing.
    # NOTE(review): assumes one row of the derived input is one pair - confirm.
    print(f"\nNumber of pairs in {continent}: {len(derived_continent.input)}")

    derived_input = derived_continent.input.to_numpy()
    derived_target = derived_continent.target.to_numpy()

    # The target is flattened to 1-D once, at the split, so y_train / y_test
    # need no further ravel(). Seeds are fixed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        derived_input, derived_target.ravel(), test_size=0.33, random_state=42
    )
    Model = LogisticRegression(max_iter=5000, random_state=1)
    Model.fit(X_train, y_train)

    accuracy = Model.score(X_test, y_test)
    accuracy_sum += accuracy

    print(f'{continent} Accuracy - : {accuracy}')

# Unweighted mean: every continent counts equally regardless of its size.
print(f'\nMean Accuracy by continents - : {accuracy_sum/len(partitioned_by_continent_dict)}')

# Partition by Income Group

## Generate raw and partitioned representations by Income Group. For each Income Group, get the derived representation and conduct the experiment.

In [None]:
# For every income-group partition: build the derived representation, fit a
# logistic-regression classifier on a train split, score it on the held-out
# split, and finally report the unweighted mean accuracy over all groups.
partitioned_by_income = raw.get_partitioned_representation("income_group")

accuracy_sum = 0

for income_group in partitioned_by_income:
    raw_input, raw_target = partitioned_by_income[income_group]
    derived_income = DerivedRepresentation(raw_input, raw_target)

    # Report the number of rows of the derived input, matching how the flat
    # experiment counts pairs. The previous code printed `raw_input.size`,
    # i.e. the element count of the raw input before pairing.
    # NOTE(review): assumes one row of the derived input is one pair - confirm.
    print(f"\nNumber of pairs in {income_group}: {len(derived_income.input)}")

    derived_input = derived_income.input.to_numpy()
    derived_target = derived_income.target.to_numpy()

    # The target is flattened to 1-D once, at the split, so y_train / y_test
    # need no further ravel(). Seeds are fixed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        derived_input, derived_target.ravel(), test_size=0.33, random_state=42
    )
    Model = LogisticRegression(max_iter=5000, random_state=1)
    Model.fit(X_train, y_train)

    accuracy = Model.score(X_test, y_test)
    accuracy_sum += accuracy

    print(f'{income_group} Accuracy - : {accuracy}')

# Unweighted mean: every income group counts equally regardless of its size.
print(f'\nMean Accuracy by income group - : {accuracy_sum/len(partitioned_by_income)}')