In this example, we analyze the "South_African_Heart_Disease" dataset. First, we visualize the relationships between the variables and the presence of coronary heart disease (the `chd` column); then we build a classifier based on logistic regression.

In [1]:
import pandas as pd

In [2]:
# Load the dataset from a text file next to the notebook; read_csv is called with
# its default settings, so the file is parsed as comma-separated with a header row.
data = pd.read_csv("./South_African_Heart_Disease.txt")

In [3]:
# Display the loaded frame to check that it parsed as expected
# (the notebook shows only the first and last rows of a long frame).
data

Unnamed: 0,row.names,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,1,160,12.00,5.73,23.11,Present,49,25.30,97.20,52,1
1,2,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1
2,3,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0
3,4,170,7.50,6.41,38.03,Present,51,31.99,24.26,58,1
4,5,134,13.60,3.50,27.78,Present,60,25.99,57.34,49,1
...,...,...,...,...,...,...,...,...,...,...,...
457,459,214,0.40,5.98,31.72,Absent,64,28.45,0.00,58,0
458,460,182,4.20,4.41,32.10,Absent,52,28.61,18.72,52,1
459,461,108,3.00,1.59,15.23,Absent,40,20.09,26.64,55,0
460,462,118,5.40,11.61,30.79,Absent,64,27.35,23.97,40,0


In [4]:
# `plt` is the conventional alias for the pyplot module. The top-level `matplotlib`
# package does not provide the plotting functions (plt.plot, plt.subplots, ...),
# so aliasing it as `plt` would make any later plt.* plotting call fail.
import matplotlib.pyplot as plt

In [5]:
import plotly.express as px

In [6]:
from plotly.offline import init_notebook_mode 


In [7]:
# Set up plotly for inline rendering in the notebook.
# NOTE(review): connected=True presumably loads plotly.js from the online CDN instead
# of embedding it in the notebook — confirm against the plotly.offline documentation.
init_notebook_mode(connected=True)


In [45]:
# Pairwise scatter plots of sbp, ldl, adiposity and age, with points colored by chd.
# NOTE(review): this cell's execution count (45) is out of sequence with the cells
# around it; re-run the notebook top to bottom to confirm it works on a fresh kernel.
px.scatter_matrix(data, dimensions=["sbp", "ldl", "adiposity", "age"], color="chd").show()


In [8]:
# One grouped histogram per variable, with bars split by heart-disease status (chd).
# "row.names" is skipped: it is just a running row identifier, so its distribution
# carries no information about the disease.
for col in data.columns:
    if col == "row.names":
        continue
    px.histogram(
        data,
        x=col,
        color="chd",
        barmode="group",
        labels={"chd": "Disease"},  # legend title (was misspelled "Desease")
    ).show()

In [9]:
# One histogram per variable, drawn as separate panels (facet rows) for each chd value
# so the two groups can be compared on a shared x axis.
# "row.names" is skipped: it is just a running row identifier, so its distribution
# carries no information about the disease.
for col in data.columns:
    if col == "row.names":
        continue
    px.histogram(data, x=col, facet_row="chd").show()

In [10]:
import statsmodels.api as sm


In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# Predictor matrix for the classifier: every row, restricted to the three
# columns tobacco, alcohol and age. Printed as plain text for a quick check.
x = data.loc[:, ['tobacco', 'alcohol', 'age']]
print(x)

     tobacco  alcohol  age
0      12.00    97.20   52
1       0.01     2.06   63
2       0.08     3.81   46
3       7.50    24.26   58
4      13.60    57.34   49
..       ...      ...  ...
457     0.40     0.00   58
458     4.20    18.72   52
459     3.00    26.64   55
460     5.40    23.97   40
461     0.00     0.00   46

[462 rows x 3 columns]


In [13]:
# List the column names available in the dataset.
data.columns

Index(['row.names', 'sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea',
       'obesity', 'alcohol', 'age', 'chd'],
      dtype='object')

In [14]:
# Target variable for the classifier: the chd column (0/1 in the rows displayed above).
y = data["chd"]

In [15]:
# Logistic-regression classifier with scikit-learn's default settings (no arguments passed).
model = LogisticRegression() 

In [16]:
# Fit the classifier on the three predictors (x) against the chd target (y).
# NOTE(review): the cells shown here fit on the full dataset; no train/test split is made.
model.fit(x, y)

LogisticRegression()

In [17]:
# Echo the fitted estimator to show its configuration.
model

LogisticRegression()

In [18]:
# Predicted class labels for the same rows the model was fitted on (in-sample predictions).
model.predict(x)

array([1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [19]:
# Predicted class probabilities for the rows the model was fitted on: one row per
# observation, one column per class.
# NOTE(review): column order presumably follows model.classes_ (0, then 1) — confirm.
model.predict_proba(x)

array([[0.39195769, 0.60804231],
       [0.49615247, 0.50384753],
       [0.71020238, 0.28979762],
       [0.41399175, 0.58600825],
       [0.40969892, 0.59030108],
       [0.6142785 , 0.3857215 ],
       [0.73520966, 0.26479034],
       [0.48356791, 0.51643209],
       [0.86087375, 0.13912625],
       [0.62894994, 0.37105006],
       [0.40907446, 0.59092554],
       [0.53352204, 0.46647796],
       [0.92225415, 0.07774585],
       [0.92960184, 0.07039816],
       [0.44447551, 0.55552449],
       [0.68133198, 0.31866802],
       [0.53236941, 0.46763059],
       [0.42529873, 0.57470127],
       [0.44792039, 0.55207961],
       [0.29218123, 0.70781877],
       [0.89788799, 0.10211201],
       [0.59308937, 0.40691063],
       [0.6607265 , 0.3392735 ],
       [0.55218967, 0.44781033],
       [0.32632059, 0.67367941],
       [0.69198734, 0.30801266],
       [0.58213691, 0.41786309],
       [0.34715058, 0.65284942],
       [0.51065412, 0.48934588],
       [0.49494937, 0.50505063],
       [0.

In [20]:
# Keep only the second probability column.
# NOTE(review): presumably this is the probability of chd == 1 — confirm via model.classes_.
model.predict_proba(x)[:,1]

array([0.60804231, 0.50384753, 0.28979762, 0.58600825, 0.59030108,
       0.3857215 , 0.26479034, 0.51643209, 0.13912625, 0.37105006,
       0.59092554, 0.46647796, 0.07774585, 0.07039816, 0.55552449,
       0.31866802, 0.46763059, 0.57470127, 0.55207961, 0.70781877,
       0.10211201, 0.40691063, 0.3392735 , 0.44781033, 0.67367941,
       0.30801266, 0.41786309, 0.65284942, 0.48934588, 0.50505063,
       0.31773542, 0.30914714, 0.57955954, 0.37845502, 0.40922834,
       0.18192742, 0.20978634, 0.37015067, 0.13885627, 0.65337353,
       0.23966749, 0.14635945, 0.07034067, 0.39632237, 0.07774585,
       0.39268775, 0.58006591, 0.18894081, 0.07641347, 0.18612424,
       0.2462899 , 0.37095259, 0.25885505, 0.26342013, 0.43890843,
       0.43993198, 0.13285466, 0.28276405, 0.08206433, 0.49171082,
       0.17019896, 0.28442481, 0.6214695 , 0.21512678, 0.07787977,
       0.53851128, 0.19965088, 0.13215519, 0.22237387, 0.15940208,
       0.07037623, 0.13667277, 0.27447035, 0.09111911, 0.18043

In [21]:
# Fitted coefficients of the logistic regression, one value per predictor.
# NOTE(review): presumably in the column order of x (tobacco, alcohol, age) — confirm.
model.coef_

array([[0.07773226, 0.00090601, 0.05404614]])

In [22]:
# Bar chart of the fitted logistic-regression coefficients, one bar per predictor.
# Names and values are read from x and the fitted model rather than pasted in by
# hand, so the chart stays in sync if the predictors or the data change.
# NOTE(review): the predictors are not standardised anywhere in the cells shown, so
# the raw coefficient sizes depend on each variable's units; read "importance" with care.
px.bar(
    x=list(x.columns),
    y=model.coef_[0],  # coef_ has one row for this binary problem; take that row
    labels={"x": "predictors", "y": "importance"},
).show()