<a href="https://colab.research.google.com/github/WoradeeKongthong/association_rule_learning/blob/master/01_Covid19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Association Rule Learning

Simple learning of Covid-19 symptoms that occur together.  
  
Note :  
The dataset I use in this learning is part of https://www.kaggle.com/bitsofishan/covid19-patient-symptoms ' dataset.  
I choose only the record that the corona result is 1.

In [0]:
#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_colwidth", 10000)

In [0]:
df = pd.read_csv('https://raw.githubusercontent.com/WoradeeKongthong/association_rule_learning/master/corona_symptoms.csv',usecols=[i for i in range(1,16) if ((i!=6)&(i!=11))])

# Dataset : covid19 symptoms

In [5]:
df

Unnamed: 0,sour throat,weakness,breathing problem,drowsiness,pain in chest,diabetes,heart disease,lung disease,stroke or reduced immunity,high blood pressue,kidney disease,change in appetide,Loss of sense of smell
0,1,1,1,1,1,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,0,1,0
2,1,0,0,1,0,1,1,0,1,0,1,0,0
3,1,1,0,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,1,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,0,0,0,1,0,0,0,0,0,0,1,0,0
65,1,1,1,1,1,1,0,0,1,0,0,1,1
66,0,0,0,1,1,1,0,0,0,0,0,1,1
67,0,0,0,1,1,1,0,0,0,1,0,1,0


# Data Proprocessor

input of a priori model is a list of lists of symptoms in each case

In [0]:
symptoms = []
for i in range(len(df)):
  temp = df.iloc[i,:]
  symptoms.append(list(temp[temp==1].index.values))

In [42]:
# example of symptoms list
symptoms[:5]

[['sour throat',
  'weakness',
  'breathing problem',
  'drowsiness',
  'pain in chest',
  'change in appetide'],
 ['weakness', 'change in appetide'],
 ['sour throat',
  'drowsiness',
  'diabetes',
  'heart disease',
  'stroke or reduced immunity',
  'kidney disease'],
 ['sour throat', 'weakness', 'change in appetide'],
 ['weakness', 'pain in chest', 'kidney disease', 'change in appetide']]

# Apriori model

In [8]:
! pip install apyori

Collecting apyori
  Downloading https://files.pythonhosted.org/packages/5e/62/5ffde5c473ea4b033490617ec5caa80d59804875ad3c3c57c0976533a21a/apyori-1.1.2.tar.gz
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-cp36-none-any.whl size=5975 sha256=12c877282f653bd893337f73d5efd36e98e512ad4b2c805d0c46d367877d53bb
  Stored in directory: /root/.cache/pip/wheels/5d/92/bb/474bbadbc8c0062b9eb168f69982a0443263f8ab1711a8cad0
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [0]:
from apyori import apriori
rules = apriori(symptoms, min_support=0.0145, min_confidence=0.001, min_lift=1.0001, max_length=2)

Note  
- consider every pair of symptoms, even one that occur in one case  
    min_support = 1/69 = 0.0145
- let's set the minimum likelihood that the symptom B is occured if symptom A is occur to very low value (0.001) to capture all possibilities
- set min_lift = 1.0001 to consider the symptoms that more likely to occur together


# Results of the learning

In [0]:
results = list(rules)

In [99]:
len(results)

39

In [0]:
results1 = []
for i in range(len(results)):
  items = list(results[i].items)
  support = results[i].support
  confidence = results[i].ordered_statistics[1][2]
  lift = results[i].ordered_statistics[1][3]

  row = (items, support, confidence, lift)
  results1.append(row)

labels = ['rule','support','confidence','lift']
result_df = pd.DataFrame(results1, columns=labels)

In [107]:
result_df.sort_values('support', ascending=False)

Unnamed: 0,rule,support,confidence,lift
25,"[sour throat, drowsiness]",0.318841,0.594595,1.13964
15,"[drowsiness, change in appetide]",0.318841,0.611111,1.505952
37,"[sour throat, weakness]",0.275362,0.59375,1.107264
5,"[sour throat, Loss of sense of smell]",0.275362,0.513514,1.181081
11,"[breathing problem, pain in chest]",0.26087,0.62069,1.381535
0,"[breathing problem, Loss of sense of smell]",0.246377,0.548387,1.26129
7,"[weakness, Loss of sense of smell]",0.246377,0.53125,1.221875
12,"[breathing problem, sour throat]",0.246377,0.459459,1.022668
13,"[breathing problem, weakness]",0.246377,0.53125,1.18246
35,"[sour throat, pain in chest]",0.231884,0.432432,1.028891


# Discussion

Top 10 rules that occur in the record

In [108]:
result_df.sort_values('support', ascending=False)[:10]

Unnamed: 0,rule,support,confidence,lift
25,"[sour throat, drowsiness]",0.318841,0.594595,1.13964
15,"[drowsiness, change in appetide]",0.318841,0.611111,1.505952
37,"[sour throat, weakness]",0.275362,0.59375,1.107264
5,"[sour throat, Loss of sense of smell]",0.275362,0.513514,1.181081
11,"[breathing problem, pain in chest]",0.26087,0.62069,1.381535
0,"[breathing problem, Loss of sense of smell]",0.246377,0.548387,1.26129
7,"[weakness, Loss of sense of smell]",0.246377,0.53125,1.221875
12,"[breathing problem, sour throat]",0.246377,0.459459,1.022668
13,"[breathing problem, weakness]",0.246377,0.53125,1.18246
35,"[sour throat, pain in chest]",0.231884,0.432432,1.028891


Note :  
- 31.88 % of the records have both sour throat and drowsiness.  
  59.46 % of the records that contain sour throat also have drowsiness

- 31.88 % of the records have both drowsiness and change in appetide.  
  61.11 % of the records that contain drowsiness also have change in appetide

In [111]:
result_df.sort_values('lift', ascending=False)[:5]

Unnamed: 0,rule,support,confidence,lift
27,"[heart disease, kidney disease]",0.028986,0.285714,3.942857
19,"[diabetes, high blood pressue]",0.173913,0.6,2.435294
33,"[stroke or reduced immunity, lung disease]",0.057971,0.307692,2.358974
20,"[diabetes, kidney disease]",0.057971,0.571429,2.319328
31,"[kidney disease, stroke or reduced immunity]",0.043478,0.230769,2.274725


The maximum lift items : heart disease and kidney disease 
support = 0.029 means that 2.9% of the records (or 2 out of 69) contains  heart disease and kidney disease.  
confidence = 0.286 means that 28.6% of the records that have heart disease also have kidney disease.  
lift = 3.943 means that the likelihood of heart disease and kidney disease occur together is 3.943 times more than only kidney disease occur alone.

# Symptoms checker
- we will make an association rule of maximum 4 symptoms
- we will focus only on the support of the set of symptoms

In [0]:
checker_rules = apriori(symptoms, min_support=0.0145, min_confidence=0.001, min_lift=1.0001, max_length=4)

In [0]:
checker_result = list(checker_rules)

In [117]:
len(checker_result)

391

In [0]:
results2 = []
for i in range(len(checker_result)):
  items = list(checker_result[i].items)
  support = checker_result[i].support

  row = (items, support)
  results2.append(row)

labels2 = ['rule','support']
checker_result_df = pd.DataFrame(results2, columns=labels2)

In [121]:
checker_result_df.sort_values('support',ascending=False)

Unnamed: 0,rule,support
25,"[sour throat, drowsiness]",0.318841
15,"[drowsiness, change in appetide]",0.318841
37,"[sour throat, weakness]",0.275362
5,"[sour throat, Loss of sense of smell]",0.275362
11,"[breathing problem, pain in chest]",0.260870
...,...,...
273,"[diabetes, breathing problem, high blood press...",0.028986
274,"[diabetes, breathing problem, pain in chest, c...",0.028986
275,"[diabetes, breathing problem, sour throat, cha...",0.028986
276,"[diabetes, stroke or reduced immunity, breathi...",0.028986


In [0]:
# function to investigate list of symptoms

def symptoms_checker(symptoms):
  symptom_occur_list = np.ones((len(checker_result_df),), dtype=bool)

  for symptom in symptoms :
    temp_array = np.array([symptom in items for items in checker_result_df['rule']])
    symptom_occur_list = symptom_occur_list & temp_array
    
  return checker_result_df[symptom_occur_list].sort_values('support', ascending=False)

Let's look at the records contain 'heart disease'.

In [134]:
symptom_df = symptoms_checker(['heart disease'])
symptom_df

Unnamed: 0,rule,support
28,"[heart disease, sour throat]",0.057971
9,"[heart disease, breathing problem]",0.043478
170,"[heart disease, stroke or reduced immunity, kidney disease]",0.028986
380,"[heart disease, stroke or reduced immunity, drowsiness, sour throat]",0.028986
379,"[heart disease, stroke or reduced immunity, kidney disease, drowsiness]",0.028986
378,"[heart disease, kidney disease, drowsiness, sour throat]",0.028986
316,"[heart disease, breathing problem, sour throat, pain in chest]",0.028986
172,"[heart disease, stroke or reduced immunity, sour throat]",0.028986
171,"[heart disease, sour throat, pain in chest]",0.028986
169,"[heart disease, kidney disease, sour throat]",0.028986


consider 'heart disease' and 'drowsiness'

In [135]:
symptom_df = symptoms_checker(['heart disease','drowsiness'])
symptom_df

Unnamed: 0,rule,support
154,"[heart disease, kidney disease, drowsiness]",0.028986
155,"[heart disease, drowsiness, sour throat]",0.028986
156,"[heart disease, stroke or reduced immunity, drowsiness]",0.028986
378,"[heart disease, kidney disease, drowsiness, sour throat]",0.028986
379,"[heart disease, stroke or reduced immunity, kidney disease, drowsiness]",0.028986
380,"[heart disease, stroke or reduced immunity, drowsiness, sour throat]",0.028986
