In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

Creating Medical Diagnosis Dataset


In [2]:
np.random.seed(42)
patient_ids = np.arange(1, 101)
test_results = np.random.rand(100)
disease_statuses = np.random.choice(['Has Disease', 'No Disease'], size=100, p=[0.3, 0.7])
medical_data = pd.DataFrame({
    'patient_id': patient_ids,
    'test_result': test_results,
    'disease_status': disease_statuses
})
medical_data.to_csv('medical_data.csv', index=False)

Creating Physiological Diagnosis Dataset

In [35]:
individual_ids = np.arange(1, 101)
heart_rates = np.random.normal(70, 10, 100)
blood_pressures = [f'{int(np.random.normal(120, 15))}/{int(np.random.normal(80, 10))}' for _ in range(100)]
cholesterols = np.random.normal(180, 25, 100)
glucoses = np.random.normal(90, 15, 100)
physiological_data = pd.DataFrame({
    'individual_id': individual_ids,
    'heart_rate': heart_rates,
    'blood_pressure': blood_pressures,
    'cholesterol': cholesterols,
    'glucose': glucoses
})
physiological_data.to_csv('physiological_data.csv', index=False)

Creating Height Dataset

In [4]:
heights = np.random.normal(170, 10, 100)
height_data = pd.DataFrame({
    'individual_id': individual_ids,
    'height': heights
})
height_data.to_csv('height_data.csv', index=False)

In [6]:
data = pd.read_csv('medical_data.csv')
data.head()

Unnamed: 0,patient_id,test_result,disease_status
0,1,0.37454,Has Disease
1,2,0.950714,No Disease
2,3,0.731994,No Disease
3,4,0.598658,No Disease
4,5,0.156019,No Disease


In [7]:
X = data[['test_result']]
y = data['disease_status'].apply(lambda x: 1 if x == 'Has Disease' else 0)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

In [9]:
# Predicting probabilities
y_pred_prob = model.predict_proba(X_test)[:, 1]

Medical Diagnosis with Logistic Regression and Bayesian Inference

In [37]:
# Applying Bayesian Inference

P_A = y.mean()  #probabilities before using bayes theroem
P_not_A = 1 - P_A

print(f"Probability:\n{P_A}")

# Likelihoods from logistic regression
P_B_given_A = y_pred_prob
P_B_given_not_A = 1 - y_pred_prob

Probability:
0.32


In [11]:
# Posterior probability using Bayes' Theorem
P_A_given_B = (P_B_given_A * P_A) / (P_B_given_A * P_A + P_B_given_not_A * P_not_A)

print(f"Posterior probabilities:\n{P_A_given_B}")

Posterior probabilities:
[0.20089252 0.17342733 0.17728269 0.18079034 0.19416248 0.1880645
 0.19302956 0.17441869 0.20239723 0.19025332 0.18833724 0.18256811
 0.17591586 0.17174831 0.19894305 0.19768351 0.17731361 0.20052662
 0.17538106 0.19718359 0.17258565 0.17366488 0.19618186 0.20191297
 0.17057573 0.19674048 0.19885835 0.18533721 0.17932765 0.20292661]


Eigenvalues and Eigenvectors using PCA

In [13]:
from sklearn.decomposition import PCA

physiological_data = pd.read_csv('physiological_data.csv')
physiological_data.head()

Unnamed: 0,individual_id,heart_rate,blood_pressure,cholesterol,glucose
0,1,63.199753,151/90,155.356849,107.516731
1,2,72.322537,97/75,192.601163,93.816313
2,3,72.930725,139/72,166.74356,95.06404
3,4,62.856486,126/87,160.178179,83.821846
4,5,88.657745,106/79,177.324241,82.685907


In [15]:
physiological_data.describe()

Unnamed: 0,individual_id,heart_rate,cholesterol,glucose
count,100.0,100.0,100.0,100.0
mean,50.5,71.080334,175.736187,90.907458
std,29.011492,10.057188,24.569199,15.676794
min,1.0,49.748574,118.208887,49.5467
25%,25.75,62.854105,158.484223,82.79783
50%,50.5,71.798176,174.699813,91.086632
75%,75.25,76.812634,193.302016,101.406866
max,100.0,108.527315,256.97202,126.68628


In [16]:
physiological_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   individual_id   100 non-null    int64  
 1   heart_rate      100 non-null    float64
 2   blood_pressure  100 non-null    object 
 3   cholesterol     100 non-null    float64
 4   glucose         100 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 4.0+ KB


In [19]:
physiological_data[['systolic','diastolic']]=(physiological_data['blood_pressure'].str.split('/', expand=True).astype(int))

In [20]:
physiological_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   individual_id   100 non-null    int64  
 1   heart_rate      100 non-null    float64
 2   blood_pressure  100 non-null    object 
 3   cholesterol     100 non-null    float64
 4   glucose         100 non-null    float64
 5   systolic        100 non-null    int64  
 6   diastolic       100 non-null    int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 5.6+ KB


In [22]:
cols = physiological_data.columns.difference(['blood_pressure'])

In [26]:
# Standardize the data (mean=0, variance=1)
physiological_data_standardized = (physiological_data[cols] - physiological_data[cols].mean()) / physiological_data[cols].std()

In [27]:
# Perform PCA
pca = PCA()
pca.fit(physiological_data_standardized)

In [28]:
# Extract eigenvalues and eigenvectors
eigenvalues = pca.explained_variance_
eigenvectors = pca.components_

print("Eigenvalues:", eigenvalues)
print("Eigenvectors:\n", eigenvectors)

Eigenvalues: [1.30062959 1.23097979 0.98457305 0.87443825 0.84138223 0.7679971 ]
Eigenvectors:
 [[ 0.3445786  -0.22475778 -0.2706883   0.46083172 -0.46519554  0.57332766]
 [-0.37478893 -0.60271929  0.62274737 -0.05698068 -0.31620338  0.07222908]
 [ 0.63470509 -0.14727265  0.01801939 -0.57009096 -0.4014867  -0.29822851]
 [-0.22610046 -0.19840165 -0.35417156 -0.64810597  0.19579737  0.57070125]
 [-0.38783433  0.61793006  0.00575806 -0.17661367 -0.6547042   0.08879103]
 [-0.36933915 -0.37858075 -0.64274272  0.09027677 -0.2354629  -0.49351225]]


 Determinant and Inverse of a Matrix

In [29]:
A = np.array([[4, 1, 2], [1, 3, 0], [2, 0, 2]])

# Calculated the determinant
det_A = np.linalg.det(A)

print("Determinant:", det_A)

# Calculated the inverse, if the determinant is not zero
if det_A != 0:
    inverse_A = np.linalg.inv(A)
    print("Inverse:\n", inverse_A)
else:
    print("Matrix is singular and does not have an inverse.")


Determinant: 10.000000000000002
Inverse:
 [[ 0.6 -0.2 -0.6]
 [-0.2  0.4  0.2]
 [-0.6  0.2  1.1]]


Normal Distribution: Hypothesis Testing and Probability Calculation


In [30]:
from scipy.stats import norm, shapiro

height_data = pd.read_csv('height_data.csv')

In [31]:
height_data.head()

Unnamed: 0,individual_id,height
0,1,167.311113
1,2,158.934741
2,3,195.733598
3,4,170.592184
4,5,170.139293


In [38]:
heights = height_data['height']

# Performed Shapiro-Wilk test for normality
stat, p_value = shapiro(heights)

print(f"Shapiro-Wilk test statistic: {stat}, p-value: {p_value}")

# If p-value > 0.05, we can't reject the null hypothesis (data is normally distributed)

Shapiro-Wilk test statistic: 0.996555507183075, p-value: 0.9973596334457397


In [34]:
mean_height = heights.mean()
std_height = heights.std()

x = 170
probability = norm.cdf(x, mean_height, std_height)

print(f"Probability that height is less than {x} cm: {probability}")

Probability that height is less than 170 cm: 0.5496011350430713
