In [8]:
import sys
import joblib
import pandas as pd
import numpy as np
import boto3


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.ensemble import  GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [5]:
print(sys.version)

3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]


In [6]:
print(sys.executable)

/home/ec2-user/anaconda3/envs/python3/bin/python


In [10]:

s3_bucket = "sonbucket21"


def upload_to_s3(localpath, remotepath):
    boto3.client("s3").upload_file(Filename=localpath, Bucket=s3_bucket, Key=remotepath)


def download_from_s3(localpath, remotepath):
    boto3.client("s3").download_file(s3_bucket, remotepath, localpath)

In [14]:
download_from_s3("insurance.csv", "insurance.csv")

In [15]:
df = pd.read_csv("insurance.csv")

In [16]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.552
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.471
4,32,male,28.88,0,no,northwest,3866.855


In [17]:
df.duplicated().sum()
duplicate_rows_data = df[df.duplicated()]
df = df.drop_duplicates()

df = df.reset_index(drop=True)

In [18]:

def grab_col_names(dataframe, cat_th=10, car_th=20):

    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 1337
Variables: 7
cat_cols: 4
num_cols: 3
cat_but_car: 0
num_but_cat: 1


In [20]:

def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe

binary_cols = [col for col in df.columns if df[col].dtypes == "O" ]

for col in binary_cols:
    label_encoder(df, col)

In [21]:
df.corr()['charges'].sort_values()


region     -0.007
sex         0.058
children    0.067
bmi         0.198
age         0.298
smoker      0.787
charges     1.000
Name: charges, dtype: float64

In [22]:
def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe


binary_cols = [col for col in df.columns if df[col].dtypes == "O" and len(df[col].unique()) == 2]
cat_columns = [col for col in df.columns if df[col].dtypes == "O" and len(df[col].unique()) == 4]
for col in binary_cols:
    label_encoder(df, col)
for col in cat_columns:
    label_encoder(df, col)


In [23]:
cat_cols, num_cols, cat_but_car, = grab_col_names(df)

num_cols = [col for col in num_cols if "charges" not in col]
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])


Observations: 1337
Variables: 7
cat_cols: 4
num_cols: 3
cat_but_car: 0
num_but_cat: 4


In [24]:
y = df["charges"]
X = df.drop(["charges","region"], axis=1)

y = np.log1p(df['charges'])
X = df.drop(["charges"], axis=1)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=17)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1069, 6) (268, 6) (1069,) (268,)


In [26]:
gbm_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_pred = gbm_model.predict(X_test) #Zuerst kehren wir den Logarithmus des y-Werts um
y_pred = np.expm1(y_pred)
y_test = np.expm1(y_test)
y_train=np.expm1(y_train)
y = np.expm1(y)


In [27]:
rmse = np.mean(np.sqrt(-cross_val_score(gbm_model, X, y, cv=10, scoring="neg_mean_squared_error")))

In [28]:
gbm_model.get_params()
param_grid = {
    'n_estimators': [ 500],
    'learning_rate': [0.01],
    'max_depth': [3]
}
gbm_gs_best = GridSearchCV(gbm_model,
                            param_grid,
                            cv=10,
                            n_jobs=-1,
                            verbose=0).fit(X_train, y_train)

In [29]:
final_model = gbm_model.set_params(**gbm_gs_best.best_params_).fit(X, y)
rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y, cv=5, scoring="neg_mean_squared_error")))

In [30]:

y_pred_tr = final_model.predict(X_train)
r2_score(y_train,y_pred_tr)



0.8854443711318675

In [31]:
y_pred_tx=final_model.predict(X_test)
r2_score(y_test,y_pred_tx)

0.8805095341867151

In [32]:
joblib.dump(final_model, './model.pkl')


['./model.pkl']

In [33]:
upload_to_s3( 'model.pkl', 'medical_model.pkl')