In [2]:
!pip install --upgrade pandas
!pip install --upgrade fsspec
!pip install --upgrade xgboost
!pip install -U sagemaker
!pip install tornado==6.4
!pip install --upgrade typing-extensions
!pip install scikit-learn
!pip install pydantic
!pip install pydantic-settings
!pip install matplotlib reportlab
!pip install --upgrade scikit-learn

import pandas as pd
import numpy as np 
import boto3
import sagemaker
import json
import joblib
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
s3fs 2024.3.1 requires fsspec==2024.3.1, but you have fsspec 2024.6.1 which is incompatible.



Collecting fsspec
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
   ---------------------------------------- 0.0/177.6 kB ? eta -:--:--
   ------ -------------------------------- 30.7/177.6 kB 640.0 kB/s eta 0:00:01
   --------------------------- ------------ 122.9/177.6 kB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 177.6/177.6 kB 1.8 MB/s eta 0:00:00
Installing collected packages: fsspec
Successfully installed fsspec-2024.6.1
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting sagemaker
  Downloading sagemaker-2.229.0-py3-none-any.whl.metadata (4.1 kB)
Collecting boto3<2.0,>=1.34.142 (from sagemaker)
  Downloading boto3-1.35.7-py3-none-any.whl.metadata (6.6 kB)
Collecting docker (from sagemaker)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting importlib-me

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.12.3 requires botocore<1.34.70,>=1.34.41, but you have botocore 1.35.7 which is incompatible.
s3fs 2024.3.1 requires fsspec==2024.3.1, but you have fsspec 2024.6.1 which is incompatible.


Defaulting to user installation because normal site-packages is not writeable
Collecting tornado==6.4
  Downloading tornado-6.4-cp38-abi3-win_amd64.whl.metadata (2.6 kB)
Downloading tornado-6.4-cp38-abi3-win_amd64.whl (436 kB)
   ---------------------------------------- 0.0/437.0 kB ? eta -:--:--
   -- ------------------------------------ 30.7/437.0 kB 660.6 kB/s eta 0:00:01
   ----------- ---------------------------- 122.9/437.0 kB 1.8 MB/s eta 0:00:01
   ---------------------------------------  430.1/437.0 kB 4.5 MB/s eta 0:00:01
   ---------------------------------------- 437.0/437.0 kB 3.9 MB/s eta 0:00:00
Installing collected packages: tornado
Successfully installed tornado-6.4
Defaulting to user installation because normal site-packages is not writeable
Collecting typing-extensions
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Installing collected packages: typing-extensions
Successfully ins

In [None]:
# Load the raw medical dataset from S3 into a DataFrame.
df = pd.read_csv('s3://medicaldata01/Medical dataset.csv')

# Report the size and show a short preview instead of printing the whole frame:
# the bare last expression uses the notebook's rich table rendering.
print(df.shape)
df.head()

In [None]:
# Mean-impute missing values in every numeric column of `df`.
# `numeric_columns` is reused later when the dataset is exported.
# NOTE(review): the means are computed over the full dataset, before any train/test
# split, so held-out rows influence the imputed values -- confirm this is acceptable.
numeric_frame = df.select_dtypes(include=[np.number])
numeric_columns = numeric_frame.columns.tolist()
df[numeric_columns] = numeric_frame.fillna(numeric_frame.mean())

In [None]:
# One-hot encode SEX (dropping the first category) and build `df_encoded`,
# a copy of `df` with SEX replaced by its encoded column(s), for the EDA cells below.
encoder = OneHotEncoder(drop='first')
sex_encoded = encoder.fit_transform(df[['SEX']])

# Keep the original 'SEX_encoded' name for the usual two-category case; fall back to
# the encoder's own feature names if SEX turns out to have a different number of
# categories (a single hard-coded name would raise a shape mismatch otherwise).
if sex_encoded.shape[1] == 1:
    sex_columns = ['SEX_encoded']
else:
    sex_columns = list(encoder.get_feature_names_out(['SEX']))

# Reuse df's index so the concat aligns row-for-row even if df is not on a default RangeIndex.
sex_frame = pd.DataFrame(sex_encoded.toarray(), columns=sex_columns, index=df.index)
df_encoded = pd.concat([df.drop('SEX', axis=1), sex_frame], axis=1)

In [None]:
# Draw one histogram per numeric column of `df` on a single 15x10 inch figure.
hist_axes = df.hist(figsize=(15, 10))
plt.show()

In [None]:
# Correlation matrix of the encoded dataset, rendered as an annotated heatmap.
# numeric_only=True keeps this working on pandas >= 2.0, where non-numeric columns
# are no longer silently dropped and would make corr() raise.
corr_matrix = df_encoded.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Feature correlation matrix')
plt.show()

In [None]:
# Per-column skewness and kurtosis of the encoded dataset.
# numeric_only=True avoids a TypeError on pandas >= 2.0 if any non-numeric column remains.
print(df_encoded.skew(numeric_only=True))
print(df_encoded.kurt(numeric_only=True))

In [None]:
# Separate the SOURCE label from the feature columns.
X = df.drop(columns=['SOURCE'])
y = df['SOURCE']

# First hold out 20% of the rows, then halve that hold-out into test and validation
# parts (80% / 10% / 10% overall). Both splits use a fixed seed for reproducibility.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# Fit the SEX one-hot encoder on the training split only, then apply the fitted
# encoder to both the training and the test split (dense arrays).
encoder = OneHotEncoder(drop='first')
encoder.fit(X_train[['SEX']])
X_train_encoded = encoder.transform(X_train[['SEX']]).toarray()
X_test_encoded = encoder.transform(X_test[['SEX']]).toarray()

In [None]:
# Replace the raw SEX column with its one-hot encoding and convert both splits to
# plain NumPy arrays (remaining columns first, encoded SEX column(s) appended last).
# The drop is non-mutating: an in-place drop on the frames returned by
# train_test_split can emit SettingWithCopyWarning and alters them as a side effect.
# NOTE: X_train / X_test are rebound from DataFrames to arrays here, so the split
# cell must be re-run before this cell can be executed again.
X_train = np.hstack((X_train.drop(columns=['SEX']).values, X_train_encoded))
X_test = np.hstack((X_test.drop(columns=['SEX']).values, X_test_encoded))

In [None]:
# Standardise the features: the scaler is fitted on the training matrix only and
# the same fitted scaler is then applied to both the training and the test matrix.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Sweep the number of trees in a random forest and compare training accuracy with
# accuracy on the held-out validation split.

# The validation split is still a raw DataFrame at this point, so give it the same
# preprocessing as the training data, reusing the encoder and scaler that were
# fitted on the training split. Scoring on the validation split (rather than the
# test split, as before) matches the plot's "Validation Accuracy" label and keeps
# the test split untouched for the final evaluation.
X_val_encoded = encoder.transform(X_val[['SEX']]).toarray()
X_val_scaled = scaler.transform(
    np.hstack((X_val.drop(columns=['SEX']).values, X_val_encoded))
)

n_estimators_values = range(10, 201, 10)
train_scores = []
val_scores = []
for n in n_estimators_values:
    clf = RandomForestClassifier(n_estimators=n, random_state=42)
    clf.fit(X_train_scaled, y_train)
    train_scores.append(clf.score(X_train_scaled, y_train))
    val_scores.append(clf.score(X_val_scaled, y_val))

# Create the figure once, after the sweep. It used to be created inside the loop,
# which opened a new empty figure on every iteration.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_estimators_values, train_scores, label='Training Accuracy', marker='o')
ax.plot(n_estimators_values, val_scores, label='Validation Accuracy', marker='x')
ax.set_xlabel('n_estimators')
ax.set_ylabel('Accuracy')
ax.set_title('Training vs Validation Accuracy for Random Forest')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Train the final random forest (default hyper-parameters, fixed seed) and evaluate
# it on the test split: classification report, ROC AUC, and the ROC curve.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
# Probability of the second class in clf.classes_, used as the ranking score for ROC.
y_score = clf.predict_proba(X_test_scaled)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))
# ROC AUC must be computed from scores, not hard labels: with 0/1 predictions the
# curve collapses to a single operating point and the value disagrees with the
# area of the curve plotted below.
print("ROC AUC Score:", roc_auc_score(y_test, y_score))

fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: the expected ROC of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Split the numeric columns into train / validation / test sets (64% / 16% / 20%),
# write each one to a local CSV file and upload the files to S3.
train_data, test_data = train_test_split(df[numeric_columns], test_size=0.2, random_state=42)
train_data, validation_data = train_test_split(train_data, test_size=0.2, random_state=42)

# NOTE(review): the CSVs are written with a header row and in df's column order;
# confirm this matches what the downstream training job expects.
dataset_splits = {
    'train.csv': train_data,
    'test.csv': test_data,
    'validation.csv': validation_data,
}
for file_name, frame in dataset_splits.items():
    frame.to_csv(file_name, index=False)

s3 = boto3.client('s3')
bucket_name = 'medicaldata01'
s3_prefix = 'Medical_data/dataset'
# Use bucket_name (previously defined but unused) instead of repeating the literal.
for file_name in dataset_splits:
    s3.upload_file(file_name, bucket_name, f'{s3_prefix}/{file_name}')


In [None]:
# Score the test matrix with the fitted classifier: hard labels first, then the
# per-class probability estimates, and print both.
y_pred = clf.predict(X_test_scaled)
y_prob = clf.predict_proba(X_test_scaled)

print("Predicted Labels:", y_pred)
print("Probability Estimates:", y_prob)