# **Predict Stroke**

- Import Library

In [122]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from scipy.stats import skew
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

- Load Data

In [91]:
df = pd.read_csv('../dataset/housing.csv')

In [92]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Data Preprocessing

- Handling Missing & Duplicated Values

In [93]:
df.duplicated().sum()

np.int64(0)

In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [95]:
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [96]:
df = df.dropna()

In [97]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [98]:
numeric = df.select_dtypes(include=['number']).columns
for i in numeric:
    Q1 = df[i].quantile(0.25)
    Q3 = df[i].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    outliers = df[(df[i] < lower) | (df[i] > upper)]
    print(f"Kolom '{i}': {len(outliers)} outlier")

Kolom 'longitude': 0 outlier
Kolom 'latitude': 0 outlier
Kolom 'housing_median_age': 0 outlier
Kolom 'total_rooms': 1290 outlier
Kolom 'total_bedrooms': 1271 outlier
Kolom 'population': 1190 outlier
Kolom 'households': 1210 outlier
Kolom 'median_income': 670 outlier
Kolom 'median_house_value': 1064 outlier


- Outliers Handling

In [99]:
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: count, dtype: int64

In [100]:

def outliersHandling(series):
  Q1 = df[series].quantile(0.25)
  Q3 = df[series].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df.loc[df[series] > upper_bound, series] = upper_bound
  df.loc[df[series] < lower_bound, series] = lower_bound
  return df


In [101]:
numeric = df.select_dtypes(include=['number']).columns

for i in numeric:
  df = outliersHandling(i)

for i in numeric:
    Q1 = df[i].quantile(0.25)
    Q3 = df[i].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    outliers = df[(df[i] < lower) | (df[i] > upper)]
    print(f"Kolom '{i}': {len(outliers)} outlier")

Kolom 'longitude': 0 outlier
Kolom 'latitude': 0 outlier
Kolom 'housing_median_age': 0 outlier
Kolom 'total_rooms': 0 outlier
Kolom 'total_bedrooms': 0 outlier
Kolom 'population': 0 outlier
Kolom 'households': 0 outlier
Kolom 'median_income': 0 outlier
Kolom 'median_house_value': 0 outlier


- Feature Engineering

In [None]:
ohe = OneHotEncoder(sparse_output=False)
encoded_data = ohe.fit_transform(df[['ocean_proximity']])
encoded_df = pd.DataFrame(encoded_data, columns=ohe.get_feature_names_out(['ocean_proximity']))

- Data Splitting

In [112]:
X = encoded_df
y = df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [113]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Modelling & Evaluation

In [118]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1)  # Output layer untuk regresi
])

# Kompilasi model dengan optimizer dan loss function untuk regresi
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Menggunakan EarlyStopping untuk menghentikan training jika model tidak mengalami peningkatan
early = EarlyStopping(patience=5, restore_best_weights=True)

# Melatih model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32, callbacks=[early])

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 52819877888.0000 - mae: 199615.0156 - val_loss: 16087198720.0000 - val_mae: 88387.7891
Epoch 2/20
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 12181000192.0000 - mae: 81235.2969 - val_loss: 9985990656.0000 - val_mae: 76061.5859
Epoch 3/20
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 10738295808.0000 - mae: 78490.6016 - val_loss: 9922265088.0000 - val_mae: 76346.6250
Epoch 4/20
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 10970929152.0000 - mae: 79309.4141 - val_loss: 9936456704.0000 - val_mae: 76171.1641
Epoch 5/20
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 10900899840.0000 - mae: 79555.3438 - val_loss: 9969283072.0000 - val_mae: 75959.1406
Epoch 6/20
[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 11148991488.0000 - m

In [120]:
val_mae = history.history['val_mae'][-1]
mae_percent = (val_mae / y_test.mean()) * 100
print(f"MAE sekitar {mae_percent:.2f}% dari rata-rata harga rumah.")

MAE sekitar 36.87% dari rata-rata harga rumah.


In [129]:
X_train_cnn = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_cnn = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

cnn_model = Sequential([
    Conv1D(64, kernel_size=2, activation='relu', padding='same', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    BatchNormalization(),
    Conv1D(32, kernel_size=2, activation='relu', padding='same'),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1)
])

cnn_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
early = EarlyStopping(patience=5, restore_best_weights=True)
history_cnn = cnn_model.fit(X_train_cnn, y_train, epochs=20, validation_split=0.2, verbose=1, callbacks=[early])


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 55026593792.0000 - mae: 205572.5000 - val_loss: 45253390336.0000 - val_mae: 179941.1094
Epoch 2/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 33408432128.0000 - mae: 144439.1875 - val_loss: 15396417536.0000 - val_mae: 85321.2500
Epoch 3/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 11896930304.0000 - mae: 82310.7891 - val_loss: 12895101952.0000 - val_mae: 77109.6406
Epoch 4/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 10653019136.0000 - mae: 78051.5391 - val_loss: 11447840768.0000 - val_mae: 75582.0547
Epoch 5/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 10681857024.0000 - mae: 78246.3750 - val_loss: 11163105280.0000 - val_mae: 75357.8750
Epoch 6/20
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 10714515456.00

In [132]:
val_mae = history_cnn.history['val_mae'][-1]
mae_percent = (val_mae / y_test.mean()) * 100
print(f"MAE sekitar {mae_percent:.2f}% dari rata-rata harga rumah.")

MAE sekitar 36.27% dari rata-rata harga rumah.


## Predict

In [137]:
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [None]:
sample = pd.DataFrame([[-122.23, 37.88, 41.0, 880.0, 129.0, 322.0, 126.0, 3.0, 4]], columns=X.columns)
sample_scaled = scaler.transform(sample)

print(model.predict(sample_scaled)[0])