In [25]:
import pandas as pd

# Reading Data

In [26]:
df = pd.read_csv('covid19.csv')

In [27]:
df.head()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,travelledFor,infectionProb
0,99.512365,0,50,1,1,1,0
1,100.729504,0,97,1,1,1,1
2,101.030982,1,87,0,1,0,0
3,98.994444,0,2,0,0,1,0
4,100.644429,0,59,1,1,0,0


In [28]:
df.tail()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,travelledFor,infectionProb
2624,100.816708,1,94,0,1,0,0
2625,100.075287,1,51,1,1,1,0
2626,100.420244,1,87,1,-1,1,0
2627,98.085696,1,93,0,-1,0,1
2628,99.92421,1,28,1,-1,1,0


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2629 entries, 0 to 2628
Data columns (total 7 columns):
fever            2629 non-null float64
bodyPain         2629 non-null int64
age              2629 non-null int64
runnyNose        2629 non-null int64
diffBreath       2629 non-null int64
travelledFor     2629 non-null int64
infectionProb    2629 non-null int64
dtypes: float64(1), int64(6)
memory usage: 143.8 KB


In [30]:
df['diffBreath'].value_counts()

-1    911
 1    891
 0    827
Name: diffBreath, dtype: int64

In [31]:
df.describe()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,travelledFor,infectionProb
count,2629.0,2629.0,2629.0,2629.0,2629.0,2629.0,2629.0
mean,99.9737,0.503994,49.583492,0.496386,-0.007607,0.517687,0.497528
std,1.146333,0.500079,28.925999,0.500082,0.828031,0.499782,0.500089
min,98.001974,0.0,1.0,0.0,-1.0,0.0,0.0
25%,98.959719,0.0,24.0,0.0,-1.0,0.0,0.0
50%,99.974704,1.0,49.0,0.0,0.0,1.0,0.0
75%,100.919134,1.0,75.0,1.0,1.0,1.0,1.0
max,101.999897,1.0,100.0,1.0,1.0,1.0,1.0


# Train Test Splitting

In [32]:
import numpy as np

In [33]:
def data_split(data, ratio):
    np.random.seed(42)
    shuffled = np.random.permutation(len(data))
    test_set_size = int(len(data) * ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

    
    

In [34]:
np.random.permutation(7)

array([4, 3, 5, 2, 1, 0, 6])

In [35]:
train, test = data_split(df, 0.2)

In [36]:
train

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,travelledFor,infectionProb
1220,98.442873,1,77,0,-1,1,1
1208,101.194703,0,36,0,0,1,1
2356,99.024384,1,18,0,1,1,1
482,100.074136,1,5,0,1,0,0
1199,101.489087,1,92,1,-1,0,1
...,...,...,...,...,...,...,...
1638,101.625553,0,83,1,1,1,0
1095,98.904372,1,99,0,1,1,1
1130,100.063068,1,63,0,1,1,0
1294,101.254393,0,87,1,-1,1,1


In [37]:
test

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,travelledFor,infectionProb
221,98.571725,0,22,1,0,0,1
318,98.226784,0,24,1,-1,1,0
926,98.386153,1,13,0,-1,0,1
2489,98.073497,1,80,1,-1,0,0
1420,100.153051,1,98,0,0,0,0
...,...,...,...,...,...,...,...
508,101.111773,1,52,0,1,1,1
76,101.856296,0,30,0,0,0,1
2581,98.534799,0,75,1,1,1,1
620,99.003363,0,55,1,0,0,0


In [38]:
x_train = train[['fever', 'bodyPain', 'age', 'runnyNose', 'diffBreath', 'travelledFor']].to_numpy()

In [39]:
x_train

array([[ 98.44287303,   1.        ,  77.        ,   0.        ,
         -1.        ,   1.        ],
       [101.1947033 ,   0.        ,  36.        ,   0.        ,
          0.        ,   1.        ],
       [ 99.02438412,   1.        ,  18.        ,   0.        ,
          1.        ,   1.        ],
       ...,
       [100.0630677 ,   1.        ,  63.        ,   0.        ,
          1.        ,   1.        ],
       [101.2543927 ,   0.        ,  87.        ,   1.        ,
         -1.        ,   1.        ],
       [101.6194081 ,   1.        ,   4.        ,   0.        ,
          0.        ,   0.        ]])

In [40]:
x_test = test[['fever', 'bodyPain', 'age', 'runnyNose', 'diffBreath', 'travelledFor']].to_numpy()

In [41]:
x_test

array([[98.57172464,  0.        , 22.        ,  1.        ,  0.        ,
         0.        ],
       [98.22678386,  0.        , 24.        ,  1.        , -1.        ,
         1.        ],
       [98.38615282,  1.        , 13.        ,  0.        , -1.        ,
         0.        ],
       ...,
       [98.53479939,  0.        , 75.        ,  1.        ,  1.        ,
         1.        ],
       [99.00336267,  0.        , 55.        ,  1.        ,  0.        ,
         0.        ],
       [99.84294098,  0.        , 64.        ,  0.        ,  0.        ,
         0.        ]])

In [42]:
y_train = train[['infectionProb']].to_numpy().reshape(2104 ,)
y_test = test[['infectionProb']].to_numpy().reshape(525 ,)

In [43]:
y_train

array([1, 1, 1, ..., 0, 1, 0], dtype=int64)

In [44]:
y_test

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
clf = LogisticRegression()
clf.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
inputFeatures = [100, 1, 25, -1, 1, 1]
infProb = clf.predict_proba([inputFeatures])[0][1]

In [48]:
infProb

0.5260360106633324