<h1>Model trainer </h1>

In [96]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score , precision_score , recall_score, classification_report

import warnings
warnings.filterwarnings('ignore')

<h1>Exploratory Data Analysis (EDA) and Feature Engineering (FE)</h1>

In [97]:
# Read the patient records and discard the 'Unnamed: 32' column in one chain.
df = pd.read_csv('data/patientsdata.csv').drop(columns=['Unnamed: 32'])

In [98]:
# Encode the target labels as integers ('M' -> 1, 'B' -> 0).
# The guard makes the cell safe to re-run: mapping the already-encoded 0/1
# values through the dict a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Correlation of every column with the target, strongest first.
df.corr()['diagnosis'].sort_values(ascending=False)

diagnosis                  1.000000
concave points_worst       0.793566
perimeter_worst            0.782914
concave points_mean        0.776614
radius_worst               0.776454
perimeter_mean             0.742636
area_worst                 0.733825
radius_mean                0.730029
area_mean                  0.708984
concavity_mean             0.696360
concavity_worst            0.659610
compactness_mean           0.596534
compactness_worst          0.590998
radius_se                  0.567134
perimeter_se               0.556141
area_se                    0.548236
texture_worst              0.456903
smoothness_worst           0.421465
symmetry_worst             0.416294
texture_mean               0.415185
concave points_se          0.408042
smoothness_mean            0.358560
symmetry_mean              0.330499
fractal_dimension_worst    0.323872
compactness_se             0.292999
concavity_se               0.253730
fractal_dimension_se       0.077972
id                         0

<h1>Checking cancerous and non-cancerous patients</h1>

In [99]:
# Split the rows by encoded target value: 1 -> cancerous, 0 -> non_cancerous.
cancerous = df.loc[df['diagnosis'].eq(1)]
non_cancerous = df.loc[df['diagnosis'].eq(0)]

<h2>List of input columns to drop if their correlation value is > 0.92</h2>

In [100]:
# Pairwise absolute correlations between the input features only. The target
# ('diagnosis') and the 'id' column are left out so that neither of them can
# end up in `to_drop`: a flagged target would be deleted from the frame, and a
# flagged 'id' would make a later explicit drop of 'id' raise KeyError.
# errors='ignore' keeps the cell runnable when 'id' is no longer present.
corr_matrix = df.drop(columns=['id', 'diagnosis'], errors='ignore').corr().abs()

# Hide the diagonal and the upper triangle so each pair is inspected once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
tr_df = corr_matrix.mask(mask)

# Flag a column when it correlates above 0.92 with any column listed after it.
to_drop = [col for col in tr_df.columns if any(tr_df[col] > .92)]
to_drop


['radius_mean',
 'perimeter_mean',
 'area_mean',
 'concavity_mean',
 'radius_se',
 'perimeter_se',
 'radius_worst',
 'perimeter_worst']

In [101]:
# Remove the highly correlated features together with the 'id' column.
# One non-mutating drop replaces the two in-place calls, and errors='ignore'
# lets the cell be re-run once the columns are already gone (the in-place
# version raised KeyError on a second run).
df = df.drop(columns=[*to_drop, 'id'], errors='ignore')

In [102]:
# Display (rows, columns) of the frame after the column drops in the previous cell.
df.shape

(569, 23)

In [103]:
from sklearn.impute  import SimpleImputer ## Handling the Missing Values
from sklearn.preprocessing import StandardScaler #Handling Featue Scalling 
from sklearn.preprocessing import OrdinalEncoder #handling Encoding 
## pipeline 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

<h1>Pipeline</h1>

In [None]:
# Name of the target column; it must not be listed among the input features.
target_column = 'diagnosis'

# Every numeric column except the target, in the frame's own column order.
numerical_cols = [
    name
    for name in df.select_dtypes(include=['number']).columns
    if name != target_column
]

print(numerical_cols)

['texture_mean', 'smoothness_mean', 'compactness_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'texture_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'texture_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']


<h2>As all our input features are numerical, we don't need a categorical pipeline</h2>

In [105]:
## Numerical Pipeline
# Fill missing values with the column median, then standardise each feature.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Only the numerical pipeline is registered. The categorical pipeline that was
# kept here as a bare string literal (plus its commented-out transformer entry)
# was dead code and has been removed.
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
])

In [106]:
## Train/test split
# Separate the target from the inputs, then hold out 20% of the rows for testing.
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

In [107]:
from sklearn.preprocessing import StandardScaler
# Fit the preprocessor on the training split only, then apply the same fitted
# transform to the test split.
#
# The original row labels are passed back in as the index so the transformed
# frames stay aligned with y_train / y_test; rebuilding them with a fresh
# 0..n-1 index would misalign any later pandas operation that combines
# features and target by label.
#
# The unused `sc = StandardScaler()` was removed: it was never fitted, and
# scaling already happens inside `preprocessor`.
#
# NOTE: this cell overwrites X_train / X_test with prefixed column names, so it
# must be run exactly once after the split cell.
X_train_values = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(X_train_values, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names, index=X_test.index)

In [108]:
# Preview the first rows of the preprocessed training features.
X_train.head()

Unnamed: 0,num_pipeline__texture_mean,num_pipeline__smoothness_mean,num_pipeline__compactness_mean,num_pipeline__concave points_mean,num_pipeline__symmetry_mean,num_pipeline__fractal_dimension_mean,num_pipeline__texture_se,num_pipeline__area_se,num_pipeline__smoothness_se,num_pipeline__compactness_se,...,num_pipeline__symmetry_se,num_pipeline__fractal_dimension_se,num_pipeline__texture_worst,num_pipeline__area_worst,num_pipeline__smoothness_worst,num_pipeline__compactness_worst,num_pipeline__concavity_worst,num_pipeline__concave points_worst,num_pipeline__symmetry_worst,num_pipeline__fractal_dimension_worst
0,-0.435319,0.780573,0.718921,-0.11915,1.092662,2.458173,-0.016052,-0.474761,0.838365,3.251027,...,2.621166,2.061208,-0.476309,-0.973968,0.722894,1.186732,4.672828,0.932012,2.097242,1.88645
1,1.733026,1.319843,3.426275,2.665032,2.127004,1.558396,-0.812687,0.87717,-0.896053,1.181222,...,-0.317717,0.529636,1.311279,2.137405,0.761928,3.265601,1.928621,2.698947,1.891161,2.497838
2,-1.249622,-1.332645,-0.307355,-0.696502,1.930333,0.954379,1.96306,-0.350779,0.572766,0.739499,...,2.615041,0.718928,-1.040811,-0.999715,-1.438693,-0.548564,-0.644911,-0.970239,0.597602,0.057894
3,1.416222,0.05939,-0.596788,-0.845115,0.313264,0.074041,0.536473,-0.49659,0.065475,-0.822404,...,0.123299,-0.431547,1.59353,-0.742947,0.796624,-0.729392,-0.77495,-0.809483,0.798928,-0.134497
4,-1.010259,1.269511,-0.439002,-0.9306,3.394436,0.950213,0.440382,-0.115532,0.171911,-0.78797,...,2.604015,0.765981,-1.334616,-0.896549,-0.174876,-0.995079,-1.209146,-1.354582,1.033544,-0.205732


<h1>Logistic Regression</h1>

In [165]:
# Logistic regression with library-default settings, fitted on the
# preprocessed training split.
log_reg=LogisticRegression()
log_reg.fit(X_train,y_train)

In [166]:
# Class predictions of the logistic regression on the test split.
y_predtest=log_reg.predict(X_test)


In [167]:
# Header line followed by the logistic-regression confusion matrix on the test split.
print("Confusion Matrix (Test):", confusion_matrix(y_test, y_predtest), sep="\n")

Confusion Matrix (Test):
[[70  1]
 [ 2 41]]


<h1>K-Fold Cross-Validation and Random Forest</h1>

In [168]:
##cross validation 
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import cross_val_score 

# 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
k_fold = KFold(n_splits= 5, shuffle=True, random_state=42)

In [169]:
# Random forest: 100 trees, entropy criterion, depth capped at 4, fixed seed.
model = RandomForestClassifier(n_estimators=100 , criterion = "entropy" , max_depth= 4 ,random_state= 0)
# Accuracy over the k_fold splits. NOTE(review): this scores on the full X / y
# from the split cell, not on the preprocessed X_train used for the fit below,
# so the held-out test rows take part in the cross-validation.
scores = cross_val_score(model, X, y, cv=k_fold, scoring='accuracy')
print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean()*100)  # printed as a percentage
print("Standard deviation:", scores.std())  # printed as a fraction, not a percentage
# Final fit on the preprocessed training split; `model` is what the next cells evaluate.
model.fit(X_train, y_train)

Cross-validation scores: [0.95614035 0.95614035 0.94736842 0.98245614 0.95575221]
Mean accuracy: 95.9571495109455
Standard deviation: 0.011922704611312757


In [170]:
# Random-forest predictions on the test split.
y_predTest = model.predict(X_test)

# Each entry pairs a heading with its already-formatted metric line.
for heading, line in (
    ("\nAccuracy Score:", f"Test Accuracy: {accuracy_score(y_test, y_predTest)}"),
    ("\nRecall Score:", f"Test Recall: {recall_score(y_test, y_predTest)}"),
):
    print(heading)
    print(line)



Accuracy Score:
Test Accuracy: 0.956140350877193

Recall Score:
Test Recall: 0.9069767441860465


In [171]:
# Header line followed by the random-forest confusion matrix on the test split.
print("Confusion Matrix (Test):", confusion_matrix(y_test, y_predTest), sep="\n")

Confusion Matrix (Test):
[[70  1]
 [ 4 39]]


<h1>Training multiple models</h1>


In [172]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score a fitted classifier on a labelled dataset.
def evaluate_model(X_test, y_test, model):
    """Return accuracy, precision, recall and F1 for ``model`` on the given data.

    Parameters
    ----------
    X_test : features handed to ``model.predict``.
    y_test : true labels the predictions are compared against.
    model : object exposing a ``predict`` method.

    Returns
    -------
    dict
        Keys 'Accuracy', 'Precision', 'Recall' and 'F1 Score', in that order,
        each holding the value returned by the matching scoring function.
    """
    predictions = model.predict(X_test)

    # (label, scoring function) pairs; the order fixes the key order of the result.
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    return {label: scorer(y_test, predictions) for label, scorer in scorers}

In [173]:
# Test-split metrics for the model currently bound to `model`.
metrics = evaluate_model(X_test, y_test, model)

In [177]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [178]:
## Train multiple models
# Model Evaluation
# Candidate classifiers, keyed by the name printed in the report below.
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=4, random_state=0),
    'SVC': SVC(kernel='linear', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}
trained_model_list = []  # fitted estimators, in the same order as model_list
model_list = []          # model names, in training order

# Iterate over name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # evaluate_model runs the prediction on the test split itself, so the
    # separate (unused) model.predict call was removed.
    metrics = evaluate_model(X_test, y_test, model)

    print(model_name)  # printing the model name

    model_list.append(model_name)
    # Previously this list was declared but never filled.
    trained_model_list.append(model)

    # NOTE: despite the heading text, these numbers come from X_test / y_test.
    print("Model Training Performance")
    print(f"Accuracy: {metrics['Accuracy']:.2f}")
    print(f"Precision: {metrics['Precision']:.2f}")
    print(f"Recall: {metrics['Recall']:.2f}")
    print(f"F1 Score: {metrics['F1 Score']:.2f}")

    print('='*35)
    print('\n')


    

LogisticRegression
Model Training Performance
Accuracy: 0.97
Precision: 0.98
Recall: 0.95
F1 Score: 0.96


RandomForestClassifier
Model Training Performance
Accuracy: 0.96
Precision: 0.97
Recall: 0.91
F1 Score: 0.94


SVC
Model Training Performance
Accuracy: 0.96
Precision: 0.95
Recall: 0.95
F1 Score: 0.95


KNN
Model Training Performance
Accuracy: 0.94
Precision: 0.93
Recall: 0.91
F1 Score: 0.92


