In [None]:
# Count the missing (NaN/None) entries in every column of df
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Report how many rows are exact duplicates of an earlier row.
# The count is printed explicitly because a notebook only displays the last
# expression of a cell; a bare `df.duplicated().sum()` mid-cell is discarded.
n_duplicated_rows = int(df.duplicated().sum())
print(f"Duplicated rows: {n_duplicated_rows}")

# Drop the duplicated rows (re-running this cell is a no-op once they are gone)
df = df.drop_duplicates()
df

In [None]:
# Preprocessing pipeline for the numerical features:
#   1. fill missing values with the column median
#   2. scale with StandardScaler(with_mean=False), i.e. without centring
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scalar", StandardScaler(with_mean=False)),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Column-wise preprocessor. Only the numerical pipeline is registered here,
# and it is applied to the columns listed in `numerical_columns`.
preprocessor = ColumnTransformer(
    transformers=[("num_pipeline", num_pipeline, numerical_columns)]
)


In [None]:
# Read the train/test splits produced earlier in the workflow
train_df = pd.read_csv('../artifacts/train.csv')
test_df = pd.read_csv('../artifacts/test.csv')

# Name of the column holding the label to predict
target_column_name = 'prognosis'

# Input features: every column except the target
input_feature_train_df = train_df.drop(columns=target_column_name)
input_feature_test_df = test_df.drop(columns=target_column_name)

# Target: kept as a single-column DataFrame (double brackets) for the encoder
target_feature_train_df = train_df[[target_column_name]]
target_feature_test_df = test_df[[target_column_name]]

# One-hot encode the target. The encoder is fitted on the training labels only;
# handle_unknown='ignore' is set for labels that appear only in the test split.
one = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
).set_output(transform='pandas')

output_feature_train_df = one.fit_transform(target_feature_train_df)
output_feature_test_df = one.transform(target_feature_test_df)

# Names of the one-hot output columns, in encoder order
diseases = np.array(list(output_feature_train_df.columns))
diseases


In [None]:
# Fit the preprocessor, transform both splits, and persist the fitted object.
import dill

# Fit on the training features only, then reuse the fitted transformer for the
# test features. These arrays are what the later cells consume as X_train / X_test.
input_feature_train_arr = preprocessor.fit_transform(input_feature_train_df)
input_feature_test_arr = preprocessor.transform(input_feature_test_df)

# Serialise after fitting so the saved object can transform new data directly
with open('../artifacts/preprocessor_v1.pkl', "wb") as file_obj:
    dill.dump(preprocessor, file_obj)


In [None]:
# Assemble the arrays used for model training and evaluation:
# preprocessed input features plus the one-hot encoded targets as NumPy arrays.
X_train = input_feature_train_arr
y_train = np.array(output_feature_train_df)

X_test = input_feature_test_arr
y_test = np.array(output_feature_test_df)

print("Saved preprocesssing object.")

In [None]:

from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVC

# Candidate regression models to compare, keyed by the label used in the report.
# The stochastic estimators get a fixed seed so that their scores, and therefore
# the model picked as "best", are reproducible across kernel restarts.
MODEL_SEED = 42

models = {
        "Random Forest" : RandomForestRegressor(random_state=MODEL_SEED),
        "Decision Tree" : DecisionTreeRegressor(random_state=MODEL_SEED),
        "Linear Regression" : LinearRegression(),
        "K-Neighbors Regressor" : KNeighborsRegressor(),
        "XGBRegressor" : XGBRegressor(random_state=MODEL_SEED),
        # Disabled candidates, kept for reference:
        # "Gradient Boosting" : GradientBoostingRegressor(),
        # "CatBoosting Regressor" : CatBoostRegressor(verbose = False),
        # "AdaBoost Regressor" : AdaBoostRegressor(),
        # "SVM" : SVC(kernel="rbf", gamma=0.5, C=1.0),
}

 
# Evaluate every candidate in `models`; the result is indexed by model name
# in the cells that follow.
model_report: dict = evaluate_models(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
)

In [None]:
# Select the best performing model from the evaluation report.
# `model_report` maps model name -> score; `models` maps the same names to the
# estimator objects.
MIN_ACCEPTABLE_SCORE = 0.6  # below this, no candidate is considered good enough

if not model_report:
    raise ValueError("model_report is empty; no models were evaluated")

# Pick the name with the highest score. On ties, max() returns the first such
# name in the dict's insertion order.
best_model_name = max(model_report, key=model_report.get)
best_model_score = model_report[best_model_name]

best_model = models[best_model_name]

# Print exactly one outcome message: the success message must not follow
# the "no best model" message.
if best_model_score < MIN_ACCEPTABLE_SCORE:
    print("No best model found")
else:
    print("Best found model on both training and testing dataset")


In [None]:
# Predict on the test features with the selected model and score the result
predicted = best_model.predict(X_test)
r2_sco = r2_score(y_test, predicted)

# Show the model's name followed by its R-squared score, one per line
print(best_model_name, r2_sco, sep="\n")