In [73]:
import pandas as pd

# Reading Data

In [74]:
df = pd.read_csv('data.csv')

In [75]:
df.head()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
0,98.833794,1,39,1,0,0
1,98.028211,1,30,1,1,1
2,100.976153,1,24,1,-1,1
3,100.430455,0,58,0,0,0
4,99.728631,0,2,0,-1,0


In [76]:
df.tail()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
2508,100.030243,0,20,1,1,0
2509,98.343038,0,89,1,-1,1
2510,101.565289,0,52,0,1,0
2511,99.251392,1,44,1,-1,0
2512,98.370763,1,55,1,1,1


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2513 entries, 0 to 2512
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fever          2513 non-null   float64
 1   bodyPain       2513 non-null   int64  
 2   age            2513 non-null   int64  
 3   runnyNose      2513 non-null   int64  
 4   diffBreath     2513 non-null   int64  
 5   infectionProb  2513 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 117.9 KB


In [78]:
df['diffBreath'].value_counts()

 0    864
-1    833
 1    816
Name: diffBreath, dtype: int64

In [79]:
df.describe()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
count,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0
mean,99.973646,0.500597,49.603661,0.500199,-0.006765,0.516116
std,1.152883,0.500099,28.570795,0.500099,0.810187,0.49984
min,98.00018,0.0,1.0,0.0,-1.0,0.0
25%,98.960967,0.0,25.0,0.0,-1.0,0.0
50%,99.988252,1.0,49.0,1.0,0.0,1.0
75%,100.940414,1.0,74.0,1.0,1.0,1.0
max,101.999274,1.0,100.0,1.0,1.0,1.0


# Train Test Splitting

In [80]:
import numpy as np

In [81]:
def data_split(data,ratio):
    np.random.seed(42)
    shuffled = np.random.permutation(len(data))
    test_set_size = int(len(data) * ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]
    

In [82]:
np.random.permutation(10)

array([8, 1, 9, 5, 3, 6, 4, 7, 0, 2])

In [83]:
train,test = data_split(df,0.2)

In [84]:
train

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
2461,99.571805,0,71,0,0,1
1446,100.262692,1,92,1,-1,0
2102,98.648122,1,95,1,0,0
1990,100.672790,0,56,0,1,0
2401,101.322620,0,39,0,1,1
...,...,...,...,...,...,...
1638,99.960319,1,93,1,1,0
1095,98.899945,1,68,1,1,1
1130,98.755711,1,81,1,-1,0
1294,100.249877,1,30,0,0,1


In [85]:
test

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
700,100.126664,0,39,0,-1,0
433,99.843327,0,44,0,-1,0
1070,99.415479,1,21,1,0,1
1133,99.169246,1,18,1,1,1
93,98.998012,0,87,0,1,0
...,...,...,...,...,...,...
1307,98.414036,0,5,1,1,0
1932,99.343738,0,42,0,-1,1
2213,98.250107,1,63,1,0,1
289,99.883045,0,95,0,0,1


In [86]:
X_train = train[['fever','bodyPain','age','runnyNose','diffBreath']]

In [87]:
X_train

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath
2461,99.571805,0,71,0,0
1446,100.262692,1,92,1,-1
2102,98.648122,1,95,1,0
1990,100.672790,0,56,0,1
2401,101.322620,0,39,0,1
...,...,...,...,...,...
1638,99.960319,1,93,1,1
1095,98.899945,1,68,1,1
1130,98.755711,1,81,1,-1
1294,100.249877,1,30,0,0


In [88]:
X_train = train[['fever','bodyPain','age','runnyNose','diffBreath']].to_numpy()
X_test = train[['fever','bodyPain','age','runnyNose','diffBreath']].to_numpy()

In [89]:
X_train

array([[ 99.57180509,   0.        ,  71.        ,   0.        ,
          0.        ],
       [100.2626918 ,   1.        ,  92.        ,   1.        ,
         -1.        ],
       [ 98.64812156,   1.        ,  95.        ,   1.        ,
          0.        ],
       ...,
       [ 98.75571103,   1.        ,  81.        ,   1.        ,
         -1.        ],
       [100.2498775 ,   1.        ,  30.        ,   0.        ,
          0.        ],
       [ 99.17359605,   0.        ,  99.        ,   0.        ,
          1.        ]])

In [90]:
X_test

array([[ 99.57180509,   0.        ,  71.        ,   0.        ,
          0.        ],
       [100.2626918 ,   1.        ,  92.        ,   1.        ,
         -1.        ],
       [ 98.64812156,   1.        ,  95.        ,   1.        ,
          0.        ],
       ...,
       [ 98.75571103,   1.        ,  81.        ,   1.        ,
         -1.        ],
       [100.2498775 ,   1.        ,  30.        ,   0.        ,
          0.        ],
       [ 99.17359605,   0.        ,  99.        ,   0.        ,
          1.        ]])

In [100]:
Y_train = train[['infectionProb']].to_numpy().reshape(2011,)
Y_test = train[['infectionProb']].to_numpy().reshape(2011,)

In [101]:
Y_train

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

In [102]:
Y_test

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

In [95]:
from sklearn.linear_model import LogisticRegression

In [96]:
clf = LogisticRegression()
clf.fit(X_train,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [97]:
inputFeatures = [100,1,22,-1,1]
infProb = clf.predict_proba([inputFeatures])[0][1]

In [98]:
infProb

0.5585203850619324