In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
class DataPreprocessor:
    """Small helper bundling loading, exploring and cleaning of a tabular dataset.

    The instance keeps slots for fitted transformers. Of the methods defined
    here, only ``handle_missing_values`` populates one of them (``imputer``).
    """

    def __init__(self):
        # Placeholders for fitted transformers / metadata.
        self.scaler = None
        self.label_encoders = {}
        self.imputer = None
        self.feature_names = None

    def load_data(self, file_path, file_type='csv', **kwargs):
        """Load a dataset from a CSV, Excel or JSON file.

        Args:
            file_path: Location handed to the matching pandas reader.
            file_type: One of 'csv', 'excel' or 'json' (case-insensitive).
            **kwargs: Forwarded unchanged to the pandas reader.

        Returns:
            The loaded DataFrame, or None when loading fails for any reason
            (including an unsupported ``file_type``). Failures are printed,
            not raised, so callers must check for None.
        """
        readers = {
            'csv': pd.read_csv,
            'excel': pd.read_excel,
            'json': pd.read_json,
        }
        try:
            reader = readers.get(file_type.lower())
            if reader is None:
                raise ValueError(f"Format file {file_type} tidak didukung")

            data = reader(file_path, **kwargs)
            print(f"Data berhasil dimuat: {data.shape}")
            return data
        except Exception as e:
            # Best-effort contract: report the problem and signal it with None.
            print(f"Error memuat data: {e}")
            return None

    def explore_data(self, data):
        """Print a basic overview of ``data`` and return it as a dict.

        Args:
            data: DataFrame to inspect.

        Returns:
            dict with keys 'shape', 'columns', 'missing_values' (per-column
            null counts) and 'dtypes'.
        """
        print("=== INFORMASI DATASET ===")
        print(f"Shape: {data.shape}")
        print(f"Columns: {list(data.columns)}")
        print("\n=== INFO DATA ===")
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        data.info()

        print("\n=== STATISTIK DESKRIPTIF ===")
        print(data.describe())

        print("\n=== MISSING VALUES ===")
        missing = data.isnull().sum()
        # Only list the columns that actually contain missing values.
        print(missing[missing > 0])

        print("\n=== TIPE DATA ===")
        print(data.dtypes)

        return {
            'shape': data.shape,
            'columns': list(data.columns),
            'missing_values': missing,
            'dtypes': data.dtypes
        }

    def handle_missing_values(self, data, strategy='mean', columns=None):
        """Return a copy of ``data`` with missing values handled.

        The input frame is never modified; always use the returned frame.

        Args:
            data: DataFrame to clean.
            strategy: 'mean', 'median' or 'most_frequent' (imputed with a
                SimpleImputer stored on ``self.imputer``), 'drop' (remove rows
                with missing values in ``columns``) or 'fill_zero'.
            columns: Column name or iterable of column names to process.
                Defaults to all numeric columns.

        Returns:
            A new DataFrame with the chosen strategy applied.

        Raises:
            ValueError: If ``strategy`` is not one of the supported values.
        """
        imputer_strategies = ('mean', 'median', 'most_frequent')
        if strategy not in imputer_strategies + ('drop', 'fill_zero'):
            raise ValueError(f"Strategy {strategy} tidak didukung")

        # Work on a copy so every strategy behaves the same way: the caller's
        # frame stays untouched and the result is only available via return.
        data = data.copy()

        if columns is None:
            columns = data.select_dtypes(include=[np.number]).columns
        elif isinstance(columns, str):
            # Accept a single column name as well as a collection of names.
            columns = [columns]
        columns = list(columns)

        # With nothing selected there is nothing to clean; skipping also avoids
        # fitting an imputer on a frame without columns.
        if columns:
            if strategy in imputer_strategies:
                self.imputer = SimpleImputer(strategy=strategy)
                data[columns] = self.imputer.fit_transform(data[columns])
            elif strategy == 'drop':
                data = data.dropna(subset=columns)
            else:  # 'fill_zero'
                data[columns] = data[columns].fillna(0)

        print(f"Missing values ditangani dengan strategy: {strategy}")
        return data

In [30]:
# Load the raw stroke dataset
preprocessor = DataPreprocessor()
df = preprocessor.load_data('healthcare-dataset-stroke-data.csv')

# load_data() reports failures by printing and returning None; stop here with a
# clear error instead of failing in a later cell with an AttributeError on None.
if df is None:
    raise RuntimeError("Gagal memuat dataset 'healthcare-dataset-stroke-data.csv'")

Data berhasil dimuat: (5110, 12)


In [31]:
# Remove the identifier column if present; it is not used as a feature
df = df.drop(columns=['id'], errors='ignore')

In [32]:
# Choose the columns to keep (the model inputs plus the 'stroke' target)
selected_features = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
    'stroke',
]
df = df.loc[:, selected_features]

In [33]:
# Remove fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [34]:
# Fill missing 'bmi' values with the column median
bmi_median = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(bmi_median)

In [35]:
# Label-encode the categorical features
label_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in label_cols:
    # Treat missing categories as their own 'Unknown' class before encoding
    df[col] = df[col].fillna('Unknown')
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    # Keep the fitted encoder so the integer codes can be mapped back to the
    # original labels (le.classes_ / inverse_transform) or reapplied to new data
    preprocessor.label_encoders[col] = le

In [36]:
# Standardize the continuous features with StandardScaler
numerical_cols = ['age', 'avg_glucose_level', 'bmi']
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [37]:
# Summarize the dataset after preprocessing
preprocessor.explore_data(data=df)

=== INFORMASI DATASET ===
Shape: (5110, 11)
Columns: ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']

=== INFO DATA ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   int32  
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   int32  
 5   work_type          5110 non-null   int32  
 6   Residence_type     5110 non-null   int32  
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   int32  
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int32(5), int64(3)
memo

{'shape': (5110, 11),
 'columns': ['gender',
  'age',
  'hypertension',
  'heart_disease',
  'ever_married',
  'work_type',
  'Residence_type',
  'avg_glucose_level',
  'bmi',
  'smoking_status',
  'stroke'],
 'missing_values': gender               0
 age                  0
 hypertension         0
 heart_disease        0
 ever_married         0
 work_type            0
 Residence_type       0
 avg_glucose_level    0
 bmi                  0
 smoking_status       0
 stroke               0
 dtype: int64,
 'dtypes': gender                 int32
 age                  float64
 hypertension           int64
 heart_disease          int64
 ever_married           int32
 work_type              int32
 Residence_type         int32
 avg_glucose_level    float64
 bmi                  float64
 smoking_status         int32
 stroke                 int64
 dtype: object}

In [21]:
# Preview the first five rows
# NOTE(review): the saved output of this cell carries an older execution count
# and still shows a missing 'bmi' value; re-run it after the preprocessing
# cells so the preview reflects the current pipeline.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,0.816895,0,1,1,2,1,0.801265,0.30126,1,1
1,0,0.743652,0,0,1,3,0,0.679023,,2,1
2,1,0.975586,0,1,1,2,0,0.234512,0.254296,2,1
3,0,0.597168,0,0,1,2,1,0.536008,0.27606,3,1
4,0,0.963379,1,0,1,3,0,0.549349,0.15693,2,1


In [38]:
# Persist the preprocessed data (row index is not written)
output_path = 'stroke_preprocessed.csv'
df.to_csv(output_path, index=False)