In [2]:
#Data Cleaning
import pandas as pd
import numpy as np
data = pd.read_csv("diabetic_data.csv")

In [3]:
data.shape

(101766, 50)

In [4]:
print("Shape of the data:", data.shape)

Shape of the data: (101766, 50)


In [5]:
data.drop(columns=['encounter_id'], inplace=True)

In [6]:
data.shape

(101766, 49)

In [7]:
data.replace('?', np.nan, inplace=True)

In [8]:
print("Summary of missing values before replacement:")
print(data.isnull().sum())

Summary of missing values before replacement:
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
ac

In [9]:
data['readmitted'].replace({'<30': 1, '>30': 0, 'NO': 0}, inplace=True)

In [10]:
print(data.dtypes)

patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide       

In [11]:
missing_percentage = (data.isnull().sum()/len(data)) * 100
columns_to_drop = missing_percentage[missing_percentage > 90].index
print(columns_to_drop)
data.drop(columns=columns_to_drop, inplace=True)

Index(['weight', 'max_glu_serum'], dtype='object')


In [12]:
data.shape

(101766, 47)

In [13]:
zero_variance = ['repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 
                   'acetohexamide', 'tolbutamide', 'acarbose', 'miglitol', 
                   'troglitazone', 'tolazamide', 'glyburide-metformin', 
                   'glipizide-metformin', 'glimepiride-pioglitazone', 
                   'metformin-rosiglitazone', 'metformin-pioglitazone', 
                   'examide', 'citoglipton']

data.drop(columns=zero_variance, inplace= True)

data.shape





(101766, 30)

In [14]:
data.head()

Unnamed: 0,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,8222157,Caucasian,Female,[0-10),6,25,1,1,,Pediatrics-Endocrinology,...,,No,No,No,No,No,No,No,No,0
1,55629189,Caucasian,Female,[10-20),1,1,7,3,,,...,,No,No,No,No,No,Up,Ch,Yes,0
2,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,,,...,,No,Steady,No,No,No,No,No,Yes,0
3,82442376,Caucasian,Male,[30-40),1,1,7,2,,,...,,No,No,No,No,No,Up,Ch,Yes,0
4,42519267,Caucasian,Male,[40-50),1,1,7,1,,,...,,No,Steady,No,No,No,Steady,Ch,Yes,0


In [15]:
data.dtypes

patient_nbr                  int64
race                        object
gender                      object
age                         object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
A1Cresult                   object
metformin                   object
glipizide                   object
glyburide                   object
pioglitazone                object
rosiglitazone               object
insulin                     object
change                      object
diabetesMed         

In [16]:
data.dropna(inplace=True)

In [17]:
data.shape

(4166, 30)

In [18]:
data.head()

Unnamed: 0,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
23879,7239654,Caucasian,Female,[70-80),1,3,6,12,UN,InternalMedicine,...,>8,No,No,No,No,No,Steady,No,Yes,0
24069,14244093,Caucasian,Female,[30-40),1,1,6,13,MC,Nephrology,...,>8,No,No,No,No,No,Steady,No,Yes,0
24117,537543,Caucasian,Female,[30-40),2,1,4,11,MC,Nephrology,...,>8,No,No,No,Steady,No,Steady,Ch,Yes,0
24177,354474,Caucasian,Male,[70-80),2,3,1,12,MC,InternalMedicine,...,>7,No,Down,No,No,No,No,Ch,Yes,0
24228,1186533,AfricanAmerican,Female,[30-40),1,1,6,6,HM,InternalMedicine,...,Norm,No,No,No,No,No,No,No,No,0


In [19]:
data.dtypes

patient_nbr                  int64
race                        object
gender                      object
age                         object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
A1Cresult                   object
metformin                   object
glipizide                   object
glyburide                   object
pioglitazone                object
rosiglitazone               object
insulin                     object
change                      object
diabetesMed         

In [20]:
admit = (data['readmitted']==1).sum()
no_admit = (data['readmitted']==0).sum()

no_admit,admit

(3770, 396)

In [21]:
data.drop(columns=['patient_nbr'], inplace=True)

In [22]:
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
# Print numerical columns
print("Numerical columns in the dataset:")
print(numerical_columns)
print(" ______________________________________________________")
print("categorical columns in the dataset")
print(categorical_columns)

Numerical columns in the dataset:
['admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'readmitted']
 ______________________________________________________
categorical columns in the dataset
['race', 'gender', 'age', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'A1Cresult', 'metformin', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change', 'diabetesMed']


In [23]:
data.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,...,A1Cresult,metformin,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
23879,Caucasian,Female,[70-80),1,3,6,12,UN,InternalMedicine,77,...,>8,No,No,No,No,No,Steady,No,Yes,0
24069,Caucasian,Female,[30-40),1,1,6,13,MC,Nephrology,75,...,>8,No,No,No,No,No,Steady,No,Yes,0
24117,Caucasian,Female,[30-40),2,1,4,11,MC,Nephrology,43,...,>8,No,No,No,Steady,No,Steady,Ch,Yes,0
24177,Caucasian,Male,[70-80),2,3,1,12,MC,InternalMedicine,68,...,>7,No,Down,No,No,No,No,Ch,Yes,0
24228,AfricanAmerican,Female,[30-40),1,1,6,6,HM,InternalMedicine,50,...,Norm,No,No,No,No,No,No,No,No,0


In [24]:
data.dtypes

race                        object
gender                      object
age                         object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
A1Cresult                   object
metformin                   object
glipizide                   object
glyburide                   object
pioglitazone                object
rosiglitazone               object
insulin                     object
change                      object
diabetesMed                 object
readmitted          

In [25]:
admit = (data['readmitted']==1).sum()
no_admit = (data['readmitted']==0).sum()

no_admit,admit

(3770, 396)

In [26]:
proportion_admitted = (data['readmitted'] == 1).mean()
proportion_not_admitted = (data['readmitted'] == 0).mean()

# Print the proportions
print("Proportion of Admitted Cases:", proportion_admitted)
print("Proportion of Not Admitted Cases:", proportion_not_admitted)

Proportion of Admitted Cases: 0.09505520883341334
Proportion of Not Admitted Cases: 0.9049447911665867


In [27]:
!pip uninstall scikit-learn --yes

Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2


ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\rahul\anaconda3\Lib\site-packages\pip\_internal\cli\base_command.py", line 180, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "C:\Users\rahul\anaconda3\Lib\site-packages\pip\_internal\commands\uninstall.py", line 110, in run
    uninstall_pathset.commit()
  File "C:\Users\rahul\anaconda3\Lib\site-packages\pip\_internal\req\req_uninstall.py", line 432, in commit
    self._moved_paths.commit()
  File "C:\Users\rahul\anaconda3\Lib\site-packages\pip\_internal\req\req_uninstall.py", line 278, in commit
    save_dir.cleanup()
  File "C:\Users\rahul\anaconda3\Lib\site-packages\pip\_internal\utils\temp_dir.py", line 173, in cleanup
    rmtree(self._path)
  File "C:\Users\rahul\anaconda3\Lib\site-packages\pip\_vendor\tenacity\__init__.py", line 291, in wrapped_f
    return self(f, *args, **kw)
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\rahul\anaconda3\Lib\site-packages\pip\_vend

In [28]:
!pip uninstall imblearn --yes

Found existing installation: imblearn 0.0
Uninstalling imblearn-0.0:
  Successfully uninstalled imblearn-0.0


In [29]:
!pip install scikit-learn==1.2.2

Collecting scikit-learn==1.2.2
  Obtaining dependency information for scikit-learn==1.2.2 from https://files.pythonhosted.org/packages/db/98/169b46a84b48f92df2b5e163fce75d471f4df933f8b3d925a61133210776/scikit_learn-1.2.2-cp311-cp311-win_amd64.whl.metadata
  Using cached scikit_learn-1.2.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Using cached scikit_learn-1.2.2-cp311-cp311-win_amd64.whl (8.3 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.2.2


In [30]:
!pip install imblearn

Collecting imblearn
  Obtaining dependency information for imblearn from https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl.metadata
  Using cached imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Using cached imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [31]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:  # Replace with actual column names
    data[col] = encoder.fit_transform(data[col])


# Training Model before Oversampling

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# Assuming 'data' is your DataFrame with features and target variable
# Assuming 'X' contains features and 'y' contains the target variable
X = data.drop(columns=['readmitted'])
y = data['readmitted']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# If you want to access the coefficients and intercept:
coefficients = model.coef_
intercept = model.intercept_
print("\nCoefficients:", coefficients)
print("Intercept:", intercept)


Accuracy: 0.920863309352518

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       767
           1       0.67      0.03      0.06        67

    accuracy                           0.92       834
   macro avg       0.79      0.51      0.51       834
weighted avg       0.90      0.92      0.89       834


Confusion Matrix:
[[766   1]
 [ 65   2]]

Coefficients: [[-0.05354664 -0.06501161 -0.03152556 -0.22647587  0.02999935 -0.04626285
   0.00346851 -0.03314615 -0.00523081 -0.00603668  0.05994557 -0.00791564
   0.02507358  0.11430374  0.29285922 -0.00123506 -0.00103378 -0.00044416
   0.05874479 -0.09558904 -0.18112724 -0.14571316 -0.06489835 -0.07547958
  -0.06353851 -0.11533786 -0.07217375 -0.00254671]]
Intercept: [-0.07461032]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
from imblearn.over_sampling import SMOTE


X = data.drop(columns=['readmitted'])
y = data['readmitted']
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)
oversampled_data = pd.DataFrame(X_resampled, columns=X.columns)
oversampled_data['readmitted'] = y_resampled
print("Class distribution after oversampling:")
print(oversampled_data['readmitted'].value_counts())

Class distribution after oversampling:
readmitted
0    3770
1    3770
Name: count, dtype: int64


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# Assuming 'data' is your DataFrame with features and target variable
# Assuming 'X' contains features and 'y' contains the target variable
X = oversampled_data.drop(columns=['readmitted'])
y = oversampled_data['readmitted']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# If you want to access the coefficients and intercept:
coefficients = model.coef_
intercept = model.intercept_
print("\nCoefficients:", coefficients)
print("Intercept:", intercept)


Accuracy: 0.7015915119363395

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.71      0.71       769
           1       0.70      0.69      0.69       739

    accuracy                           0.70      1508
   macro avg       0.70      0.70      0.70      1508
weighted avg       0.70      0.70      0.70      1508


Confusion Matrix:
[[546 223]
 [227 512]]

Coefficients: [[-0.18643864 -0.40516949  0.08000357 -0.47957009  0.01438421 -0.09601533
   0.03947885 -0.04684926  0.00433535  0.01922328 -0.18228447 -0.004787
  -0.1426607   0.13177887  0.30037325  0.00188139  0.0007157   0.00179925
   0.1511732  -0.3800716  -0.29319787 -0.18565168 -0.05964387 -0.06312568
  -0.03276213 -0.31715724 -0.36792637  0.02762967]]
Intercept: [0.05139592]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
