In [1]:
import pandas as pd

# Reading Data

In [2]:
# Load the training data from a CSV file located in the current working directory.
df = pd.read_csv('Training_data.csv')

In [3]:
df.head()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
0,99,1,15,1,0,0
1,103,1,57,1,0,1
2,100,1,34,1,-1,1
3,104,1,5,0,-1,0
4,102,0,70,0,1,1


In [4]:
df.tail()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
2494,102,0,14,0,1,0
2495,102,1,85,0,0,0
2496,98,0,39,1,0,0
2497,101,0,63,0,0,0
2498,101,0,80,0,-1,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2499 entries, 0 to 2498
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   fever          2499 non-null   int64
 1   bodyPain       2499 non-null   int64
 2   age            2499 non-null   int64
 3   runnyNose      2499 non-null   int64
 4   diffBreath     2499 non-null   int64
 5   infectionProb  2499 non-null   int64
dtypes: int64(6)
memory usage: 117.3 KB


In [6]:
df['diffBreath'].value_counts()

 0    873
 1    846
-1    780
Name: diffBreath, dtype: int64

In [7]:
df.describe()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
count,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0
mean,101.027211,0.502601,50.043617,0.498599,0.026411,0.479392
std,1.97362,0.500093,29.189864,0.500098,0.806364,0.499675
min,98.0,0.0,1.0,0.0,-1.0,0.0
25%,99.0,0.0,25.0,0.0,-1.0,0.0
50%,101.0,1.0,50.0,0.0,0.0,0.0
75%,103.0,1.0,75.0,1.0,1.0,1.0
max,104.0,1.0,100.0,1.0,1.0,1.0


# Train-Test Splitting


In [8]:
import numpy as np

In [9]:
def data_split(data, ratio, seed=42):
    """Randomly split ``data`` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; they are selected by position with ``.iloc``.
    ratio : float
        Fraction of rows (between 0 and 1) assigned to the test set. The test
        set holds ``int(len(data) * ratio)`` rows.
    seed : int, optional
        Seed for the shuffle. The default, 42, reproduces the split that the
        previously hard-coded ``np.random.seed(42)`` produced.

    Returns
    -------
    tuple
        ``(train, test)``: two row subsets of ``data`` with no row in common.

    Raises
    ------
    ValueError
        If ``ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got {!r}".format(ratio))
    # A private generator keeps the split reproducible without reseeding
    # NumPy's global random state as a side effect of calling this function.
    rng = np.random.RandomState(seed)
    # Random ordering of the row positions 0 .. len(data) - 1.
    shuffled = rng.permutation(len(data))
    test_set_size = int(len(data) * ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
    

In [10]:
np.random.permutation(10)  # example: returns the integers 0-9 in a random order

array([8, 5, 9, 4, 7, 1, 3, 6, 2, 0])

In [11]:
# Hold out 20% of the rows as the test set; the remaining rows form the training set.
train, test = data_split(df, 0.2)

In [12]:
train

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
461,104,0,13,0,0,1
109,102,1,53,1,1,1
2296,100,0,33,1,1,1
354,104,0,90,0,0,0
266,100,1,88,1,1,1
...,...,...,...,...,...,...
1638,101,0,47,0,0,0
1095,101,1,95,1,1,0
1130,99,1,6,1,0,1
1294,103,1,23,0,0,0


In [13]:
test

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
2319,99,1,14,1,-1,0
1865,99,0,88,1,-1,0
902,100,1,69,0,0,0
2240,98,1,13,0,1,0
1285,103,1,86,1,-1,0
...,...,...,...,...,...,...
1037,103,0,80,0,1,0
2054,99,1,97,0,-1,0
1860,99,1,30,1,1,1
1862,103,0,36,0,0,0


In [14]:
# Build the feature matrices for both partitions from the same column list,
# so the column order of X_train and X_test cannot drift apart.
X_train, X_test = (
    part[['fever', 'bodyPain', 'age', 'runnyNose', 'diffBreath']].to_numpy()
    for part in (train, test)
)

In [15]:
X_train

array([[104,   0,  13,   0,   0],
       [102,   1,  53,   1,   1],
       [100,   0,  33,   1,   1],
       ...,
       [ 99,   1,   6,   1,   0],
       [103,   1,  23,   0,   0],
       [100,   1,  63,   1,  -1]], dtype=int64)

In [16]:
X_test

array([[ 99,   1,  14,   1,  -1],
       [ 99,   0,  88,   1,  -1],
       [100,   1,  69,   0,   0],
       ...,
       [ 99,   1,  30,   1,   1],
       [103,   0,  36,   0,   0],
       [102,   0,  53,   0,   1]], dtype=int64)

In [17]:
# Select the label column as a Series so to_numpy() already yields a 1-D array.
# This avoids hard-coding the row counts of one particular split (2000 / 499),
# which would raise as soon as the dataset size or the split ratio changes.
Y_train = train['infectionProb'].to_numpy()
Y_test = test['infectionProb'].to_numpy()

In [18]:
Y_train

array([1, 1, 1, ..., 1, 0, 1], dtype=int64)

# Training the ML model (here, logistic regression)

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Fit a logistic-regression classifier with scikit-learn's default settings
# on the five training features.
# NOTE(review): X_test / Y_test are not used to evaluate the model in the cells
# visible here — confirm whether an evaluation step exists elsewhere.
clf = LogisticRegression()
clf.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# One sample; values follow the training column order:
# fever, bodyPain, age, runnyNose, diffBreath.
clf.predict([[102, 1, 22, 1, 1]])

array([0], dtype=int64)

In [22]:
# Class probabilities for the same sample (fever, bodyPain, age, runnyNose,
# diffBreath); one row per sample, one column per class.
clf.predict_proba([[102, 1, 22, 1, 1]])

array([[0.52216532, 0.47783468]])

In [23]:
# Feature values in training column order: fever, bodyPain, age, runnyNose, diffBreath.
inputFeatures = [98, 0, 15, 1, -1]
# predict_proba returns one row per sample; row 0, column 1 is the probability
# of the second class (label 1 for this 0/1 target).
infProb = clf.predict_proba([inputFeatures])[0, 1]

In [24]:
infProb

0.4427837759221556

In [25]:
# Report the probability as a percentage. print() separates its arguments with
# single spaces, which yields the double space after the colon and the space before "%".
print("Probability of infection : ",infProb*100,"%")

Probability of infection :  44.27837759221556 %
