In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the insurance CSV into the working DataFrame.
# NOTE(review): the /content/ prefix looks like a Colab session path - adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/insurance.csv')

In [None]:
# Peek at the first five rows to confirm the file parsed as expected
df.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
df.info()

In [None]:
# Summary statistics for the numeric columns
df.describe()

In [None]:
# (rows, columns) of the frame as loaded, before duplicates are dropped
df.shape

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Drop fully duplicated rows, modifying `df` in place.
# NOTE(review): because `df` is mutated, outputs rendered by the cells above
# describe the frame as it was before this cell ran.
df.drop_duplicates(inplace=True)

In [None]:
# Re-check (rows, columns) to see how many rows the de-duplication removed
df.shape

In [None]:
# List the distinct values found in every column of the frame
print("Unique values in each column:")
for name, series in df.items():
    print(f"{name}: {series.unique()}")

# **EDA**

In [None]:
# Pie chart showing the share of each value of the 'sex' column
fig1 = px.pie(
    data_frame=df,
    names='sex',
    title='Gender-wise Distribution',
)
fig1.show()

In [None]:
# Pie chart showing the share of each value of the 'smoker' column
fig2 = px.pie(
    data_frame=df,
    names='smoker',
    title='Smoker Distribution',
)
fig2.show()

In [None]:
# Histogram of the 'children' column
fig3 = px.histogram(
    data_frame=df,
    x='children',
    title='Distribution of Number of Children',
)
fig3.show()

In [None]:
# Violin plot of BMI split by sex, with an inner box plot and every point drawn
fig4 = px.violin(
    data_frame=df,
    x='sex',
    y='bmi',
    box=True,
    points='all',
    title='Sex vs. BMI',
)
fig4.show()

In [None]:
# Row count per region; bars follow the order in which regions first appear in the data
fig5 = px.histogram(
    data_frame=df,
    x='region',
    title='Region-wise Distribution',
    category_orders={'region': df['region'].unique()},
)
fig5.show()

In [None]:
# Scatter of charges against BMI with an ordinary-least-squares trendline overlaid
fig6 = px.scatter(
    data_frame=df,
    x='bmi',
    y='charges',
    trendline='ols',
    title='Charges vs. BMI',
)
fig6.show()

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Replace each categorical column with integer codes and keep the fitted
# encoder per column so raw labels can be mapped to the same codes later.
# NOTE: this overwrites the columns of `df`; re-running the cell would refit
# the encoders on the integer codes instead of the original labels.
categorical_columns = ['sex', 'smoker', 'region']
label_encoders = {name: LabelEncoder() for name in categorical_columns}

for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])


In [None]:
# Preview the frame after the categorical columns were replaced by integer codes
df.head()

In [None]:
# Pairwise correlations between all (now numeric) columns, drawn as an annotated heatmap
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# One box plot per column on a 3x3 grid, before any outlier filtering
fig = plt.figure(figsize=(15, 10))

for position, name in enumerate(df.columns, start=1):
    ax = fig.add_subplot(3, 3, position)
    sns.boxplot(y=df[name], ax=ax)
    ax.set_title(f'Box Plot of {name}')

plt.tight_layout()
plt.show()

In [None]:
# Function to remove outliers using IQR
def remove_outliers(df, column, iqr_multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``
    (both inclusive), where Q1/Q3 are the 25th/75th percentiles of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : label
        Name of the numeric column the fences are computed on.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` within the fences, with their original index labels.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    spread = q3 - q1
    lower_bound = q1 - iqr_multiplier * spread
    upper_bound = q3 + iqr_multiplier * spread
    # between() is inclusive on both ends, matching `>= lower` and `<= upper`.
    return df[df[column].between(lower_bound, upper_bound)]

# Remove outliers from the non-categorical feature columns only.
# - 'charges' is the target, so its extreme values are kept.
# - The label-encoded categorical columns are skipped: their integer codes are
#   arbitrary labels, and on a lopsided binary column (e.g. 'smoker' when fewer
#   than 25% of rows are smokers) Q1 == Q3, so IQR == 0 and the filter would
#   drop every row of the minority class, leaving the feature constant.
excluded_columns = set(categorical_columns) | {'charges'}

df2 = df.copy()
for column in df.columns:
    if column not in excluded_columns:
        df2 = remove_outliers(df2, column)

# Box plots for each column after removing outliers
plt.figure(figsize=(15, 10))

for i, column in enumerate(df2.columns, 1):
    plt.subplot(3, 3, i)
    sns.boxplot(y=df2[column])
    plt.title(f'Box Plot of {column} (Outliers Removed)')

plt.tight_layout()
plt.show()


# **Multiple Linear Regression**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression

# Target vector and feature matrix, both taken from the outlier-filtered frame
y = df2['charges']
X = df2.drop(columns='charges')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear model on the training split
model_lr = LinearRegression().fit(X_train, y_train)

# Predicted charges for the held-out rows
y_pred = model_lr.predict(X_test)

# Test-set metrics (accuracy_lr holds the R^2 score)
mse_lr = mean_squared_error(y_test, y_pred)
rmse_lr = np.sqrt(mse_lr)
accuracy_lr = r2_score(y_test, y_pred)
mape_lr = mean_absolute_percentage_error(y_test, y_pred)

# Report the metrics
print(f"R^2 Score (Linear Regression): {accuracy_lr}")
print(f"Mean Squared Error (MSE) (Linear Regression): {mse_lr}")
print(f"Root Mean Squared Error (RMSE) (Linear Regression): {rmse_lr}")
print(f"Mean Absolute Percentage Error (MAPE) (Linear Regression): {mape_lr}")

# Actual and predicted charges drawn against the test-sample position
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Values', color='b')
ax.plot(y_pred, label='Predicted Values', color='r')
ax.set_title('Actual vs Predicted Insurance Charges (Linear Regression)')
ax.set_xlabel('Samples')
ax.set_ylabel('Charges')
ax.legend()
plt.show()


# **Random Forest**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Fit a 100-tree random forest on the same training split (fixed seed)
model_rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted charges for the held-out rows
y_pred = model_rf.predict(X_test)

# Test-set metrics (accuracy_rf holds the R^2 score)
mse_rf = mean_squared_error(y_test, y_pred)
rmse_rf = np.sqrt(mse_rf)
accuracy_rf = r2_score(y_test, y_pred)
mape_rf = mean_absolute_percentage_error(y_test, y_pred)

# Report the metrics
print(f"R^2 Score: {accuracy_rf}")
print(f"Mean Squared Error (MSE): {mse_rf}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf}")
print(f"Mean Absolute Percentage Error (MAPE): {mape_rf}")

# Actual and predicted charges drawn against the test-sample position
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Values', color='b')
ax.plot(y_pred, label='Predicted Values', color='r')
ax.set_title('Actual vs Predicted Insurance Charges')
ax.set_xlabel('Samples')
ax.set_ylabel('Charges')
ax.legend()
plt.show()

# **Gradient Boosting Machine (GBM)**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor: 100 stages, learning rate 0.1, fixed seed
gbm_params = {'n_estimators': 100, 'learning_rate': 0.1, 'random_state': 42}
model_gbm = GradientBoostingRegressor(**gbm_params)

# Fit on the training split
model_gbm.fit(X_train, y_train)

# Predicted charges for the held-out rows
y_pred = model_gbm.predict(X_test)

# Test-set metrics (accuracy_gbm holds the R^2 score)
mse_gbm = mean_squared_error(y_test, y_pred)
rmse_gbm = np.sqrt(mse_gbm)
accuracy_gbm = r2_score(y_test, y_pred)
mape_gbm = mean_absolute_percentage_error(y_test, y_pred)

# Report the metrics
print(f"R^2 Score (GBM): {accuracy_gbm}")
print(f"Mean Squared Error (MSE) (GBM): {mse_gbm}")
print(f"Root Mean Squared Error (RMSE) (GBM): {rmse_gbm}")
print(f"Mean Absolute Percentage Error (MAPE) (GBM): {mape_gbm}")

# Actual and predicted charges drawn against the test-sample position
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Values', color='b')
ax.plot(y_pred, label='Predicted Values', color='r')
ax.set_title('Actual vs Predicted Insurance Charges (GBM)')
ax.set_xlabel('Samples')
ax.set_ylabel('Charges')
ax.legend()
plt.show()


# **XGBOOST**

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor: 100 boosting rounds, learning rate 0.1, fixed seed
xgb_params = {'n_estimators': 100, 'learning_rate': 0.1, 'random_state': 42}
model_xgb = XGBRegressor(**xgb_params)

# Fit on the training split
model_xgb.fit(X_train, y_train)

# Predicted charges for the held-out rows
y_pred = model_xgb.predict(X_test)

# Test-set metrics (accuracy_xgb holds the R^2 score)
mse_xgb = mean_squared_error(y_test, y_pred)
rmse_xgb = np.sqrt(mse_xgb)
accuracy_xgb = r2_score(y_test, y_pred)
mape_xgb = mean_absolute_percentage_error(y_test, y_pred)

# Report the metrics
print(f"R^2 Score (XGBoost): {accuracy_xgb}")
print(f"Mean Squared Error (MSE) (XGBoost): {mse_xgb}")
print(f"Root Mean Squared Error (RMSE) (XGBoost): {rmse_xgb}")
print(f"Mean Absolute Percentage Error (MAPE) (XGBoost): {mape_xgb}")

# Actual and predicted charges drawn against the test-sample position
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Values', color='b')
ax.plot(y_pred, label='Predicted Values', color='r')
ax.set_title('Actual vs Predicted Insurance Charges (XGBoost)')
ax.set_xlabel('Samples')
ax.set_ylabel('Charges')
ax.legend()
plt.show()


In [None]:
# Test-set metrics computed in the model cells above, one entry per model
metrics = {
    'Model': ['MLR', 'RF', 'XGBoost', 'GBM'],
    'R^2 Score': [accuracy_lr, accuracy_rf, accuracy_xgb, accuracy_gbm],
    'MSE': [mse_lr, mse_rf, mse_xgb, mse_gbm],
    'RMSE': [rmse_lr, rmse_rf, rmse_xgb, rmse_gbm],
    'MAPE': [mape_lr, mape_rf, mape_xgb, mape_gbm]
}
df_metrics = pd.DataFrame(metrics)

# 2x2 grid of bar charts: (subplot position, metric column, bar colour, panel title)
panels = [
    (221, 'R^2 Score', 'b', 'R^2 Score Comparison'),
    (222, 'MSE', 'g', 'MSE Comparison'),
    (223, 'RMSE', 'r', 'RMSE Comparison'),
    (224, 'MAPE', 'purple', 'MAPE Comparison'),
]

plt.figure(figsize=(10, 6))
for position, metric_name, bar_color, panel_title in panels:
    plt.subplot(position)
    plt.bar(df_metrics['Model'], df_metrics[metric_name], color=bar_color)
    plt.title(panel_title)
    if metric_name == 'R^2 Score':
        # Only the R^2 panel gets a fixed 0-1 axis
        plt.ylim(0, 1)

plt.tight_layout()
plt.show()

# Plain-text dump of the comparison table
print("Metrics Comparison:")
print(df_metrics)


In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Use a dedicated estimator for cross-validation instead of rebinding `model_lr`.
# cross_val_score fits clones of the estimator it receives and leaves the passed
# object unfitted, so reassigning `model_lr` here would replace the fitted model
# from the training cell and break the prediction and joblib.dump cells below.
cv_model = LinearRegression()

# Perform 10-fold cross-validation on the full feature matrix / target
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(cv_model, X, y, cv=kf, scoring='r2')

# Print the R^2 score of each fold
print("Accuracy on each cross-validation iteration:")
for i, score in enumerate(cv_scores):
    print(f"Iteration {i+1}: {score}")

# Mean R^2 across folds
avg_accuracy = np.mean(cv_scores)
print(f"\nAverage Validated R^2 Score: {avg_accuracy}")

# Bar chart of the per-fold scores with the mean drawn as a horizontal line.
# The x positions come from the number of scores so they follow n_splits.
fold_numbers = range(1, len(cv_scores) + 1)
plt.figure(figsize=(8, 6))
plt.bar(fold_numbers, cv_scores, color='skyblue')
plt.axhline(y=avg_accuracy, color='r', linestyle='-', label=f'Average R^2 Score: {avg_accuracy:.2f}')
plt.title('Cross-Validation R^2 Scores')
plt.xlabel('Iteration')
plt.ylabel('R^2 Score')
plt.xticks(fold_numbers)
plt.legend()
plt.show()


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

# This cell relies on objects created by the training cells above; nothing is
# loaded from disk here:
#   - model_lr: the fitted LinearRegression model
#   - label_encoders: dict mapping each categorical column name
#     ('sex', 'smoker', 'region') to its fitted LabelEncoder

def predict_insurance_charges(age, sex, bmi, children, smoker, region, model_lr, label_encoders):
    """Predict the charge for one person described by raw feature values.

    ``sex``, ``smoker`` and ``region`` are passed through the matching entry of
    ``label_encoders`` (each must expose ``transform``), a single-row DataFrame
    with columns age, sex, bmi, children, smoker, region is built in that
    order, and the first element of ``model_lr.predict`` on it is returned.
    """
    raw_categoricals = {'sex': sex, 'smoker': smoker, 'region': region}
    encoded = {
        name: label_encoders[name].transform([value])[0]
        for name, value in raw_categoricals.items()
    }

    # Single observation, keyed in the column order the frame is built with
    row = {
        'age': age,
        'sex': encoded['sex'],
        'bmi': bmi,
        'children': children,
        'smoker': encoded['smoker'],
        'region': encoded['region'],
    }
    features = pd.DataFrame({name: [value] for name, value in row.items()})

    return model_lr.predict(features)[0]

# Example of interactive user input and prediction
def main():
    """Prompt for one person's details on stdin and print the predicted charge.

    Reads age, sex, BMI, number of children, smoker flag and region with
    ``input()``. If a numeric field cannot be parsed, or a categorical field is
    not one of the accepted values, a message is printed and the function
    returns without predicting. Otherwise it calls
    ``predict_insurance_charges`` with the module-level ``model_lr`` and
    ``label_encoders`` and prints the result.
    """

    def _ask_number(prompt, cast, hint):
        # Read one numeric answer; report malformed input instead of raising.
        raw = input(prompt)
        try:
            return cast(raw)
        except ValueError:
            print(f"Invalid input {raw!r}. Please enter {hint}.")
            return None

    print("Enter the following details to predict insurance charges:")
    age = _ask_number("Age: ", int, "a whole number for age")
    if age is None:
        return
    sex = input("Sex (male/female): ").strip().lower()
    bmi = _ask_number("BMI: ", float, "a number for BMI")
    if bmi is None:
        return
    children = _ask_number("Number of children: ", int, "a whole number of children")
    if children is None:
        return
    smoker = input("Smoker (yes/no): ").strip().lower()
    region = input("Region (northeast, northwest, southeast, southwest): ").strip().lower()

    # Ensure valid input for categorical variables
    if sex not in ['male', 'female']:
        print("Invalid input for sex. Please enter 'male' or 'female'.")
        return
    if smoker not in ['yes', 'no']:
        print("Invalid input for smoker. Please enter 'yes' or 'no'.")
        return
    if region not in ['northeast', 'northwest', 'southeast', 'southwest']:
        print("Invalid input for region. Please enter one of: northeast, northwest, southeast, southwest.")
        return

    # Call prediction function
    predicted_charge = predict_insurance_charges(age, sex, bmi, children, smoker, region, model_lr, label_encoders)

    # Display the prediction
    print(f"\nPredicted Insurance Charge: {predicted_charge:.2f} Rs")

# Run the interactive prompt when this code executes as the top-level module.
# NOTE(review): in a notebook kernel __name__ is normally "__main__", so this
# guard passes and the cell waits on input() during "Run All" - confirm intended.
if __name__ == "__main__":
    main()


In [None]:
import joblib

# Persist the linear regression model object to disk
joblib.dump(value=model_lr, filename='linear_regression_model.pkl')

# Persist the per-column label encoders next to it
joblib.dump(value=label_encoders, filename='label_encoders.pkl')