In [35]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
class DataPreprocessor:
    """Helper that bundles the notebook's loading, inspection and
    missing-value handling steps.

    The constructor only creates empty placeholders. Of these, the methods
    defined in this class assign only ``imputer`` (in
    ``handle_missing_values``).
    """

    def __init__(self):
        # Placeholders for fitted helpers; none is populated at construction.
        self.scaler = None
        self.label_encoders = {}
        self.imputer = None
        self.feature_names = None

    def load_data(self, file_path, file_type='csv', **kwargs):
        """Load a dataset from a CSV, Excel or JSON file.

        Args:
            file_path: Path (or anything the pandas reader accepts) to load.
            file_type: One of ``'csv'``, ``'excel'`` or ``'json'``
                (case-insensitive).
            **kwargs: Forwarded unchanged to the matching ``pd.read_*`` call.

        Returns:
            The loaded DataFrame, or ``None`` when loading fails. Failures are
            printed rather than raised; this includes an unsupported
            ``file_type``, because that ValueError is raised inside the same
            ``try`` block that catches it.
        """
        try:
            kind = file_type.lower()
            if kind == 'csv':
                data = pd.read_csv(file_path, **kwargs)
            elif kind == 'excel':
                data = pd.read_excel(file_path, **kwargs)
            elif kind == 'json':
                data = pd.read_json(file_path, **kwargs)
            else:
                raise ValueError(f"Format file {file_type} tidak didukung")

            print(f"Data berhasil dimuat: {data.shape}")
            return data
        except Exception as e:
            # Best-effort contract: report the problem and signal it with None.
            print(f"Error memuat data: {e}")
            return None

    def explore_data(self, data):
        """Print a basic overview of ``data`` and return the key facts.

        Prints the shape, column names, ``DataFrame.info()`` report,
        ``describe()`` statistics, the columns that contain missing values,
        and the dtypes.

        Args:
            data: The DataFrame to inspect.

        Returns:
            dict with keys ``'shape'``, ``'columns'``, ``'missing_values'``
            (null count for every column, not only the non-zero ones) and
            ``'dtypes'``.
        """
        print("=== INFORMASI DATASET ===")
        print(f"Shape: {data.shape}")
        print(f"Columns: {list(data.columns)}")
        print("\n=== INFO DATA ===")
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that emitted a stray "None" line).
        data.info()

        print("\n=== STATISTIK DESKRIPTIF ===")
        print(data.describe())

        print("\n=== MISSING VALUES ===")
        missing = data.isnull().sum()
        print(missing[missing > 0])

        print("\n=== TIPE DATA ===")
        print(data.dtypes)

        return {
            'shape': data.shape,
            'columns': list(data.columns),
            'missing_values': missing,
            'dtypes': data.dtypes
        }

    def handle_missing_values(self, data, strategy='mean', columns=None):
        """Handle missing values in the selected columns.

        The input frame is never modified; every strategy returns a new
        DataFrame, so re-running a cell that calls this method is safe.

        Args:
            data: The DataFrame to process.
            strategy: ``'mean'``, ``'median'`` or ``'most_frequent'`` (imputed
                with ``SimpleImputer``), ``'drop'`` (drop rows with a missing
                value in ``columns``) or ``'fill_zero'`` (fill with 0).
            columns: Column name or list of column names to process. Defaults
                to all numeric columns of ``data``.

        Returns:
            A new DataFrame with the missing values handled.

        Raises:
            ValueError: If ``strategy`` is not one of the supported values.
        """
        if columns is None:
            columns = data.select_dtypes(include=[np.number]).columns
        elif isinstance(columns, str):
            # A bare column name would select a Series (1-D), which the
            # imputer cannot fit; normalise it to a one-element list.
            columns = [columns]

        if strategy in ['mean', 'median', 'most_frequent']:
            # Work on a copy so the caller's frame is left untouched.
            data = data.copy()
            self.imputer = SimpleImputer(strategy=strategy)
            data[columns] = self.imputer.fit_transform(data[columns])
        elif strategy == 'drop':
            # dropna already returns a new frame.
            data = data.dropna(subset=columns)
        elif strategy == 'fill_zero':
            data = data.copy()
            data[columns] = data[columns].fillna(0)
        else:
            raise ValueError(f"Strategy {strategy} tidak didukung")

        print(f"Missing values ditangani dengan strategy: {strategy}")
        return data

In [36]:
# Load the raw dataset from diabetes.csv (path is relative to the working directory).
df = pd.read_csv('diabetes.csv')
    
    

In [37]:
# Choose the features that will be used.
# NOTE(review): this list has the same contents as `fitur_medis` defined in a
# later cell and is not referenced by any cell visible here - confirm whether
# it is still needed.
selected_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']


   

In [38]:
# Replace 0 with NaN in the medical measurement columns (Outcome and
# Pregnancies are deliberately left out), so that those zeros are treated as
# missing values by the imputation step that follows.
# NOTE(review): presumably 0 is not a plausible reading for these columns - confirm.
fitur_medis = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in fitur_medis:
    df[col] = df[col].replace(0, np.nan)


In [39]:
# Create the preprocessing helper (DataPreprocessor is defined in the first cell).
preprocessor = DataPreprocessor()

In [40]:
# Impute the missing values in the medical feature columns using the median strategy.
df = preprocessor.handle_missing_values(df, strategy='median', columns=fitur_medis)

Missing values ditangani dengan strategy: median


In [41]:
# Min-max normalise the medical features together with Age (not Age alone).
# NOTE(review): Pregnancies and DiabetesPedigreeFunction are not in the scaled
# column list and keep their original scale - confirm this is intended.
# The fitted scaler lives in the notebook-level `scaler` variable; it is not
# stored on `preprocessor.scaler`.
scaler = MinMaxScaler()
df[fitur_medis + ['Age']] = scaler.fit_transform(df[fitur_medis + ['Age']])

In [42]:
# Inspect the data after preprocessing. explore_data prints the overview and,
# as the cell's last expression, its returned summary dict is displayed too.
preprocessor.explore_data(df)

=== INFORMASI DATASET ===
Shape: (768, 9)
Columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

=== INFO DATA ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    float64
 8   Outcome                   768 non-null    int64  
dtypes: float64(7), int64(2)
memory usage: 54.1 KB
None

=== STATISTIK DESKRIPTIF ===
       Pre

{'shape': (768, 9),
 'columns': ['Pregnancies',
  'Glucose',
  'BloodPressure',
  'SkinThickness',
  'Insulin',
  'BMI',
  'DiabetesPedigreeFunction',
  'Age',
  'Outcome'],
 'missing_values': Pregnancies                 0
 Glucose                     0
 BloodPressure               0
 SkinThickness               0
 Insulin                     0
 BMI                         0
 DiabetesPedigreeFunction    0
 Age                         0
 Outcome                     0
 dtype: int64,
 'dtypes': Pregnancies                   int64
 Glucose                     float64
 BloodPressure               float64
 SkinThickness               float64
 Insulin                     float64
 BMI                         float64
 DiabetesPedigreeFunction    float64
 Age                         float64
 Outcome                       int64
 dtype: object}

In [43]:
# Preview the first rows of the processed data.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,0.670968,0.489796,0.304348,0.133413,0.314928,0.627,0.483333,1
1,1,0.264516,0.428571,0.23913,0.133413,0.171779,0.351,0.166667,0
2,8,0.896774,0.408163,0.23913,0.133413,0.104294,0.672,0.183333,1
3,1,0.290323,0.428571,0.173913,0.096154,0.202454,0.167,0.0,0
4,0,0.6,0.163265,0.304348,0.185096,0.509202,2.288,0.2,1


In [None]:
# Save the processed data to diabetes_preprocessed.csv; index=False leaves the
# row index out of the file.
df.to_csv('diabetes_preprocessed.csv', index=False)