## Correlation for proportion infected

In [15]:
import pandas as pd
df = pd.read_csv('covid_data_22-09-2021_2-full.csv')
print(df.shape)

(149, 13)


First of all, split the countries in to two categories: high and low. So the countries who are going to be in the category 'high', have a high proportion infected and the countries who are going to be in the category 'low', have a low proportion infected. (You can play around with the value that is now 4000 to make other ratios of low and high countries.)

In [16]:
high =  df[(df['proportion infected'] > 4000)]
low = df[(df['proportion infected'] < 4000)]

print(high.shape)
print(low.shape)

(76, 13)
(73, 13)


Then make a new column in the dataframe with the categories.

In [17]:
categories = list()
for i in df['proportion infected'].values:
    if i > 4000:
        categories.append('high')
    else:
        categories.append('low')
    
df.insert(4, "category", categories, True)
df

Unnamed: 0.1,Unnamed: 0,Country,Confirmed,proportion infected,category,Deaths,GDP,Population Density,2021 Population,Area,Literacy,Employment,Population 65 +,Foreigners
0,0,Albania,165096,5746.60,high,2601,14.80,100,2872933,28748,0.972489,0.707484,0.147591,0.018337
1,1,Algeria,202122,453.02,low,5739,145.00,19,44616624,2381741,0.774214,0.896187,0.053393,0.003512
2,2,Antigua and Barbuda,2625,2658.74,low,57,1.42,223,98731,442,0.990000,0.994403,0.088592,0.296995
3,3,Argentina,5245265,11501.30,high,114684,383.00,16,45605826,2780400,0.980849,0.938704,0.116861,0.045017
4,4,Armenia,254709,8581.47,high,5181,12.65,100,2968127,29743,0.996145,0.783675,0.119087,0.065531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,144,Uruguay,388068,11134.90,high,6049,53.63,19,3485151,181034,0.949552,0.936522,0.147933,0.023435
145,145,Venezuela,358462,1248.78,low,4346,482.00,31,28704954,916445,0.950790,0.911165,0.072738,0.042478
146,146,Vietnam,718963,732.37,low,17781,271.00,296,98168833,331212,0.938753,0.970942,0.077089,0.089552
147,147,Zambia,208676,1102.90,low,3639,19.32,25,18920651,752612,0.815428,0.877331,0.026461,0.009907


In [18]:
print(df[df['Country'] == 'Belgium'])

    Unnamed: 0  Country  Confirmed  proportion infected category  Deaths  \
12          12  Belgium    1229236             10567.41     high   25533   

      GDP  Population Density  2021 Population   Area  Literacy  Employment  \
12  515.0                 381         11632326  30528      0.99    0.917029   

    Population 65 +  Foreigners  
12         0.189032    0.148065  


Then we make a dataframe with only the data, the features, that we want to use to do the logistic regression with. And we also make a dataframe of only the categories, these are the labels.

In [19]:
from sklearn.preprocessing import normalize
data = df[['GDP', 'Population Density', 'Literacy', 'Employment', 'Population 65 +', 'Foreigners']]
labels = df['category']

Then we split up the data and the labels into a training and testing set. (Here you can also play with different amount of training and testing sets) 

In [29]:
train_set_data = data[:70]
train_set_labels = labels[:70]
test_set_data = data[70:]
test_set_labels = labels[70:]

Now we fit the training data in the model.

In [30]:
from sklearn.linear_model import LogisticRegression
X, y = train_set_data, train_set_labels
model = LogisticRegression(multi_class = 'multinomial')
model.fit(X, y)

LogisticRegression(multi_class='multinomial')

Now we are going to calculate the coefficients to see where there are correlations in the features and the categories.

In [31]:
print(model.classes_)
print(model.coef_)

['high' 'low']
[[ 8.88663035e-05 -9.29588382e-05 -1.45263869e+00  7.10454311e-01
  -5.70554126e-01 -3.11009066e-01]]


We also going to test the model with the testing set and count how many countries are wrong categorised.

In [32]:
test_set_labels_new = model.predict(test_set_data)
count = 0
countries = df['Country'].values
test_set_labels = test_set_labels.values
for i in range(len(test_set_data)):
    print("X=%s, Predicted=%s, Reel=%s" % (countries[i+70], test_set_labels_new[i], test_set_labels[i]))
    if test_set_labels_new[i] != test_set_labels[i]:
        count += 1
print(count)

X=Laos, Predicted=high, Reel=low
X=Latvia, Predicted=high, Reel=high
X=Lesotho, Predicted=low, Reel=low
X=Liberia, Predicted=low, Reel=low
X=Libya, Predicted=high, Reel=high
X=Liechtenstein, Predicted=high, Reel=high
X=Lithuania, Predicted=high, Reel=high
X=Luxembourg, Predicted=high, Reel=high
X=Malawi, Predicted=low, Reel=low
X=Malaysia, Predicted=high, Reel=high
X=Maldives, Predicted=high, Reel=high
X=Mali, Predicted=low, Reel=low
X=Malta, Predicted=high, Reel=high
X=Marshall Islands, Predicted=low, Reel=low
X=Mauritania, Predicted=low, Reel=low
X=Mauritius, Predicted=high, Reel=low
X=Mexico, Predicted=low, Reel=low
X=Micronesia, Predicted=high, Reel=low
X=Moldova, Predicted=high, Reel=high
X=Mongolia, Predicted=high, Reel=high
X=Montenegro, Predicted=high, Reel=high
X=Morocco, Predicted=low, Reel=low
X=Mozambique, Predicted=low, Reel=low
X=Namibia, Predicted=high, Reel=high
X=Nepal, Predicted=low, Reel=low
X=Netherlands, Predicted=high, Reel=high
X=New Zealand, Predicted=high, Reel

Then we make a score to see what the accuracy is of the model.

In [33]:
print(model.score(test_set_data, test_set_labels))

0.7215189873417721


In [39]:
print(model.predict([[23.59, 20, 0.5458, 0.896, 0.049, 0.08955]])[0])

low


In [41]:
from ipywidgets import interact

def machine(gdp, popdens, lit, emp, old, foreign):
    print(model.predict([[gdp, popdens, lit, emp, old, foreign]])[0])

interact(machine, gdp=(0, 1000, 1), popdens = (0, 500, 1), lit = (0, 1, 0.01), emp = (0, 1, 0.01), old =  (0, 1, 0.01), foreign =  (0, 1, 0.01))

interactive(children=(IntSlider(value=500, description='gdp', max=1000), IntSlider(value=250, description='pop…

<function __main__.machine(gdp, popdens, lit, emp, old, foreign)>

## Correlation with deaths

Exactly the same as above but now with the proportions of deaths in a country. So I first made another column that gives you the proportion of deaths.

In [114]:
import pandas as pd
df = pd.read_csv('covid_data_22-09-2021_2-full.csv')
print(df.shape)
proportion_deaths = list()
popu = df['2021 Population'].values
for index, i in enumerate(df['Deaths'].values):
    proportion_deaths.append((i/popu[index])*100000)
    
df.insert(4, "deaths_prop", proportion_deaths, True)

(149, 13)


In [115]:
high =  df[(df['deaths_prop'] >60)]
low = df[(df['deaths_prop'] < 60)]

print(high.shape)
print(low.shape)

(74, 14)
(75, 14)


In [116]:
categories = list()
for i in df['deaths_prop'].values:
    if i > 60:
        categories.append('high')
    else:
        categories.append('low')
    
df.insert(4, "category", categories, True)
df

Unnamed: 0.1,Unnamed: 0,Country,Confirmed,proportion infected,category,deaths_prop,Deaths,GDP,Population Density,2021 Population,Area,Literacy,Employment,Population 65 +,Foreigners
0,0,Albania,165096,5746.60,high,90.534656,2601,14.80,100,2872933,28748,0.972489,0.707484,0.147591,0.018337
1,1,Algeria,202122,453.02,low,12.862919,5739,145.00,19,44616624,2381741,0.774214,0.896187,0.053393,0.003512
2,2,Antigua and Barbuda,2625,2658.74,low,57.732627,57,1.42,223,98731,442,0.990000,0.994403,0.088592,0.296995
3,3,Argentina,5245265,11501.30,high,251.467872,114684,383.00,16,45605826,2780400,0.980849,0.938704,0.116861,0.045017
4,4,Armenia,254709,8581.47,high,174.554525,5181,12.65,100,2968127,29743,0.996145,0.783675,0.119087,0.065531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,144,Uruguay,388068,11134.90,high,173.564933,6049,53.63,19,3485151,181034,0.949552,0.936522,0.147933,0.023435
145,145,Venezuela,358462,1248.78,low,15.140244,4346,482.00,31,28704954,916445,0.950790,0.911165,0.072738,0.042478
146,146,Vietnam,718963,732.37,low,18.112673,17781,271.00,296,98168833,331212,0.938753,0.970942,0.077089,0.089552
147,147,Zambia,208676,1102.90,low,19.232953,3639,19.32,25,18920651,752612,0.815428,0.877331,0.026461,0.009907


In [117]:
data = df[['GDP', 'Population Density', '2021 Population', 'Literacy', 'Employment', 'Population 65 +', 'Foreigners']]
labels = df['category']

In [118]:
train_set_data = data[:80]
train_set_labels = labels[:80]
test_set_data = data[80:]
test_set_labels = labels[80:]

In [119]:
from sklearn.linear_model import LogisticRegression
X, y = train_set_data, train_set_labels
model = LogisticRegression(multi_class='multinomial', max_iter = 1000)
model.fit(X, y)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [120]:
print(model.classes_)
print(model.coef_)

['high' 'low']
[[-1.47676925e-13 -3.69895685e-14  1.42574607e-09 -2.88011584e-16
  -1.37780738e-16 -4.83246804e-17 -6.05242566e-17]]


In [121]:
test_set_labels_new = model.predict(test_set_data)
count = 0
countries = df['Country'].values
test_set_labels = test_set_labels.values
for i in range(len(test_set_data)):
    print("X=%s, Predicted=%s, Reel=%s" % (countries[i+80], test_set_labels_new[i], test_set_labels[i]))
    if test_set_labels_new[i] != test_set_labels[i]:
        count += 1
print(count)

X=Maldives, Predicted=low, Reel=low
X=Mali, Predicted=low, Reel=low
X=Malta, Predicted=low, Reel=high
X=Marshall Islands, Predicted=low, Reel=low
X=Mauritania, Predicted=low, Reel=low
X=Mauritius, Predicted=low, Reel=low
X=Mexico, Predicted=low, Reel=high
X=Micronesia, Predicted=low, Reel=low
X=Moldova, Predicted=low, Reel=high
X=Mongolia, Predicted=low, Reel=low
X=Montenegro, Predicted=low, Reel=high
X=Morocco, Predicted=low, Reel=low
X=Mozambique, Predicted=low, Reel=low
X=Namibia, Predicted=low, Reel=high
X=Nepal, Predicted=low, Reel=low
X=Netherlands, Predicted=low, Reel=high
X=New Zealand, Predicted=low, Reel=low
X=Nicaragua, Predicted=low, Reel=low
X=Niger, Predicted=low, Reel=low
X=Nigeria, Predicted=low, Reel=low
X=North Macedonia, Predicted=low, Reel=high
X=Norway, Predicted=low, Reel=low
X=Oman, Predicted=low, Reel=high
X=Pakistan, Predicted=low, Reel=low
X=Palau, Predicted=low, Reel=low
X=Panama, Predicted=low, Reel=high
X=Papua New Guinea, Predicted=low, Reel=low
X=Paraguay

In [122]:
print(model.score(test_set_data, test_set_labels))

0.5507246376811594
