Import Libraries and Load Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error
import numpy as np

# Load the dataset
import io

from google.colab import files

# files.upload() returns a {filename: file contents (bytes)} mapping.  Colab
# renames an upload whose name already exists on disk (e.g. "Dataset (1).csv"),
# so reading the hard-coded 'Dataset.csv' can silently load a stale copy from
# an earlier upload.  Parse the bytes that were just uploaded instead.
uploaded = files.upload()
if uploaded:
    uploaded_name = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a copy already on disk.
    df = pd.read_csv('Dataset.csv')

# Display the first few rows of the dataset
print(df.head())


Saving Dataset.csv to Dataset (1).csv
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


Preprocess the data

In [2]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

# Encode categorical variables.
# NOTE: this overwrites the string columns of `df` with integer codes; the
# fitted encoders are kept in `label_encoders` so the codes can be mapped back.
# Re-running this cell without reloading `df` would refit the encoders on the
# integer codes rather than on the original labels.
label_encoders = {}
categorical_columns = ['sex', 'smoker', 'region']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Split the data into features and target variables.
# 'smoker' is excluded from X because it is the classification target.
X = df.drop(['charges', 'smoker'], axis=1)  # Features for regression
y_regression = df['charges']                # Target for regression
y_classification = df['smoker']             # Target for classification

# Split the data into training and testing sets
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_regression, test_size=0.2, random_state=42)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y_classification, test_size=0.2, random_state=42)

# Scale the features.
# Each task gets its own scaler, fitted on that task's training split only.
# Previously a single scaler was fitted on the regression split and then
# refitted on the classification split, discarding the first fit; that was
# only harmless because both splits currently share the same X and seed.
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)

scaler_clf = StandardScaler()
X_train_clf = scaler_clf.fit_transform(X_train_clf)
X_test_clf = scaler_clf.transform(X_test_clf)

# Backward-compatible alias: `scaler` used to end up fitted on the
# classification training split.
scaler = scaler_clf


Missing values:
 age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


Evaluate Classification Models

In [6]:
# Initialize models.
# The tree-based estimators are randomized; fixing random_state (same seed as
# the train/test split) makes the reported metrics reproducible on re-run.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Support Vector Classifier": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42)
}

# Function to evaluate models
def evaluate_classification_models(models, X_train, y_train, X_test, y_test):
    """Fit every classifier and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    X_train, y_train :
        Training features and labels passed to ``fit``.
    X_test, y_test :
        Held-out features passed to ``predict`` and the labels the
        predictions are scored against.

    Returns
    -------
    dict
        ``{name: {"Accuracy", "Precision", "Recall", "F1-Score"}}`` in the
        iteration order of ``models``. Precision, recall and F1 are computed
        with ``zero_division=0``.
    """
    def _score(estimator):
        # Train on the training split, then predict the held-out split.
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)

        scores = {}
        scores["Accuracy"] = accuracy_score(y_test, predicted)
        scores["Precision"] = precision_score(y_test, predicted, zero_division=0)
        scores["Recall"] = recall_score(y_test, predicted, zero_division=0)
        scores["F1-Score"] = f1_score(y_test, predicted, zero_division=0)
        return scores

    return {label: _score(estimator) for label, estimator in models.items()}

# Score every candidate classifier on the held-out classification split
classification_results = evaluate_classification_models(
    models=classifiers,
    X_train=X_train_clf,
    y_train=y_train_clf,
    X_test=X_test_clf,
    y_test=y_test_clf,
)
print("Classification Results:\n", classification_results)


Classification Results:
 {'Logistic Regression': {'Accuracy': 0.8008849557522124, 'Precision': 0.0, 'Recall': 0.0, 'F1-Score': 0.0}, 'Decision Tree Classifier': {'Accuracy': 0.7168141592920354, 'Precision': 0.2564102564102564, 'Recall': 0.2222222222222222, 'F1-Score': 0.23809523809523808}, 'Random Forest Classifier': {'Accuracy': 0.7610619469026548, 'Precision': 0.2, 'Recall': 0.06666666666666667, 'F1-Score': 0.1}, 'Support Vector Classifier': {'Accuracy': 0.8008849557522124, 'Precision': 0.0, 'Recall': 0.0, 'F1-Score': 0.0}, 'K-Nearest Neighbors': {'Accuracy': 0.7566371681415929, 'Precision': 0.2916666666666667, 'Recall': 0.15555555555555556, 'F1-Score': 0.2028985507246377}, 'Gradient Boosting Classifier': {'Accuracy': 0.7876106194690266, 'Precision': 0.0, 'Recall': 0.0, 'F1-Score': 0.0}}


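The Logistic Regression, Support Vector Classifier, and Gradient Boosting Classifier have 0 values for Precision, Recall, and F1-Score because none of their predictions were true positives. Logistic Regression and the Support Vector Classifier predicted the negative class for every test sample (their accuracy equals the majority-class share, 181/226 ≈ 0.801), while the Gradient Boosting Classifier made a few positive predictions, all of which were incorrect (accuracy 178/226 ≈ 0.788).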

**Regression Model Implementation and Evaluation**

Next, let's implement and evaluate regression models using RMSE and MAE.

In [7]:
# Initialize models.
# The tree-based estimators are randomized; fixing random_state (same seed as
# the train/test split) makes the reported errors reproducible on re-run.
regressors = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42)
}

# Function to evaluate models
def evaluate_regression_models(models, X_train, y_train, X_test, y_test):
    """Fit every regressor and report its test-set RMSE and MAE.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    X_train, y_train :
        Training features and targets passed to ``fit``.
    X_test, y_test :
        Held-out features passed to ``predict`` and the targets the
        predictions are compared against.

    Returns
    -------
    dict
        ``{name: {"RMSE": ..., "MAE": ...}}`` in the iteration order of
        ``models``. RMSE is the square root of the mean squared error.
    """
    def _test_errors(regressor):
        # Train on the training split, then predict the held-out split.
        regressor.fit(X_train, y_train)
        predicted = regressor.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, predicted))
        mae = mean_absolute_error(y_test, predicted)
        return {"RMSE": rmse, "MAE": mae}

    return {label: _test_errors(regressor) for label, regressor in models.items()}

# Score every candidate regressor on the held-out regression split
regression_results = evaluate_regression_models(
    models=regressors,
    X_train=X_train_reg,
    y_train=y_train_reg,
    X_test=X_test_reg,
    y_test=y_test_reg,
)
print("Regression Results:\n", regression_results)


Regression Results:
 {'Linear Regression': {'RMSE': 11867.524254463204, 'MAE': 9316.655594881582}, 'Decision Tree Regressor': {'RMSE': 16585.933312486257, 'MAE': 10690.64210823009}, 'Random Forest Regressor': {'RMSE': 12876.298414879664, 'MAE': 9871.579044561813}, 'Support Vector Regressor': {'RMSE': 13395.245571079342, 'MAE': 8470.780287429792}, 'K-Nearest Neighbors': {'RMSE': 13067.717125874671, 'MAE': 9666.096928884072}, 'Gradient Boosting Regressor': {'RMSE': 12406.023698867106, 'MAE': 9388.583554887198}}


# New Section

I have mentioned all the stats of the different models in the report. Based on that report and the overall performance metrics, let's proceed with the **Decision Tree Classifier** for the **classification task** and the **Support Vector Regressor** for the **regression task**.

Decision Tree Classifier: Detailed Evaluation

In [8]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# Hyperparameter tuning: candidate settings for the decision tree
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# The tree is seeded so that the selected parameters and the scores below are
# reproducible across runs; an unseeded tree can pick a different "best" grid
# point each time the cell is executed.
# NOTE(review): candidates are ranked by accuracy, which on this imbalanced
# target is dominated by the majority class — consider scoring='f1'.
grid_search_clf = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search_clf.fit(X_train_clf, y_train_clf)

# Best parameters and score (mean cross-validated accuracy of the best candidate)
best_params_clf = grid_search_clf.best_params_
best_score_clf = grid_search_clf.best_score_

print("Best parameters for Decision Tree Classifier: ", best_params_clf)
print("Best cross-validation accuracy: ", best_score_clf)

# Evaluate the best estimator found by the search on the held-out test set
best_clf = grid_search_clf.best_estimator_
y_pred_clf = best_clf.predict(X_test_clf)

print("Test Set Accuracy: ", accuracy_score(y_test_clf, y_pred_clf))
print("Confusion Matrix:\n", confusion_matrix(y_test_clf, y_pred_clf))
print("Classification Report:\n", classification_report(y_test_clf, y_pred_clf))


Best parameters for Decision Tree Classifier:  {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best cross-validation accuracy:  0.7313934929404542
Test Set Accuracy:  0.7212389380530974
Confusion Matrix:
 [[158  23]
 [ 40   5]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.83       181
           1       0.18      0.11      0.14        45

    accuracy                           0.72       226
   macro avg       0.49      0.49      0.49       226
weighted avg       0.67      0.72      0.70       226



Support Vector Regressor: Detailed Evaluation

In [9]:

# Hyperparameter tuning: candidate kernels, C values and epsilon values for SVR
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.1, 0.2, 0.5, 0.3, 0.05],
}

# 5-fold cross-validated grid search, ranked by negated mean squared error
grid_search_reg = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search_reg.fit(X_train_reg, y_train_reg)

# Best parameters and score
best_params_reg, best_score_reg = (
    grid_search_reg.best_params_,
    grid_search_reg.best_score_,
)

print("Best parameters for Support Vector Regressor: ", best_params_reg)
# The score is a negated MSE, so flip the sign before taking the square root.
print("Best cross-validation RMSE: ", np.sqrt(-best_score_reg))

# Evaluate the best estimator found by the search on the held-out test set
best_reg = grid_search_reg.best_estimator_
y_pred_reg = best_reg.predict(X_test_reg)

print(
    "Test Set RMSE: ",
    np.sqrt(mean_squared_error(y_test_reg, y_pred_reg)),
)
print(
    "Test Set MAE: ",
    mean_absolute_error(y_test_reg, y_pred_reg),
)

Best parameters for Support Vector Regressor:  {'C': 10, 'epsilon': 0.5, 'kernel': 'sigmoid'}
Best cross-validation RMSE:  12512.581295742752
Test Set RMSE:  13357.820176534256
Test Set MAE:  8098.752940231096
