In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
class DataPreprocessor:
    def __init__(self):
        self.scaler = None
        self.label_encoders = {}
        self.imputer = None
        self.feature_names = None
        
    def load_data(self, file_path, file_type='csv', **kwargs):
        """Read a tabular file into a DataFrame.

        Args:
            file_path: Path or buffer handed to the pandas reader.
            file_type: One of 'csv', 'excel' or 'json' (case-insensitive).
            **kwargs: Forwarded unchanged to the pandas reader.

        Returns:
            The loaded DataFrame, or None when loading fails. Any error,
            including an unsupported ``file_type``, is printed instead of
            being raised.
        """
        try:
            kind = file_type.lower()
            if kind not in ('csv', 'excel', 'json'):
                raise ValueError(f"Format file {file_type} tidak didukung")

            # pandas names its readers read_csv / read_excel / read_json.
            reader = getattr(pd, f"read_{kind}")
            frame = reader(file_path, **kwargs)
            print(f"Data berhasil dimuat: {frame.shape}")
        except Exception as err:
            print(f"Error memuat data: {err}")
            return None
        return frame
            
    def explore_data(self, data):
        """Print a basic overview of a DataFrame and return the key facts.

        The report covers shape, column names, ``DataFrame.info()``,
        descriptive statistics, per-column missing-value counts (only
        columns with at least one missing value are printed) and dtypes.

        Args:
            data: The DataFrame to inspect.

        Returns:
            A dict with the keys 'shape', 'columns', 'missing_values'
            (missing count for every column) and 'dtypes'.
        """
        print("=== INFORMASI DATASET ===")
        print(f"Shape: {data.shape}")
        print(f"Columns: {list(data.columns)}")

        print("\n=== INFO DATA ===")
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would append a stray "None" line.
        data.info()

        print("\n=== STATISTIK DESKRIPTIF ===")
        print(data.describe())

        print("\n=== MISSING VALUES ===")
        missing = data.isnull().sum()
        print(missing[missing > 0])

        print("\n=== TIPE DATA ===")
        print(data.dtypes)

        return {
            'shape': data.shape,
            'columns': list(data.columns),
            'missing_values': missing,
            'dtypes': data.dtypes,
        }
        
    def handle_missing_values(self, data, strategy='mean', columns=None):
        """Menangani missing values"""
        if columns is None:
            columns = data.select_dtypes(include=[np.number]).columns
            
        if strategy in ['mean', 'median', 'most_frequent']:
            self.imputer = SimpleImputer(strategy=strategy)
            data[columns] = self.imputer.fit_transform(data[columns])
        elif strategy == 'drop':
            data = data.dropna(subset=columns)
        elif strategy == 'fill_zero':
            data[columns] = data[columns].fillna(0)
        else:
            raise ValueError(f"Strategy {strategy} tidak didukung")
            
        print(f"Missing values ditangani dengan strategy: {strategy}")
        return data
        
    def encode_categorical(self, data, columns=None, method='label'):
        """Encoding variabel kategorikal"""
        if columns is None:
            columns = data.select_dtypes(include=['object']).columns
            
        for col in columns:
            if method == 'label':
                if col not in self.label_encoders:
                    self.label_encoders[col] = LabelEncoder()
                data[col] = self.label_encoders[col].fit_transform(data[col].astype(str))
            elif method == 'onehot':
                dummies = pd.get_dummies(data[col], prefix=col)
                data = pd.concat([data, dummies], axis=1)
                data = data.drop(col, axis=1)
                
        print(f"Categorical encoding selesai dengan method: {method}")
        return data
        
    def scale_features(self, data, columns=None, method='standard'):
        """Scaling fitur numerik"""
        if columns is None:
            columns = data.select_dtypes(include=[np.number]).columns
            
        if method == 'standard':
            self.scaler = StandardScaler()
        elif method == 'minmax':
            self.scaler = MinMaxScaler()
        else:
            raise ValueError(f"Scaling method {method} tidak didukung")
            
        data[columns] = self.scaler.fit_transform(data[columns])
        print(f"Feature scaling selesai dengan method: {method}")
        return data
        
    def remove_outliers(self, data, columns=None, method='iqr', threshold=1.5):
        """Drop rows that contain an outlier in any of the checked columns.

        Columns are processed one after another, so the statistics of a
        later column are computed on the rows that survived the earlier ones.
        Rows whose value in a checked column is missing fail the comparison
        and are dropped as well.

        Args:
            data: DataFrame to filter; the input is not modified.
            columns: Columns to check. Defaults to every numeric column.
            method: 'iqr' keeps values within
                ``[Q1 - threshold * IQR, Q3 + threshold * IQR]``; 'zscore'
                keeps values whose absolute z-score is below ``threshold``.
            threshold: IQR multiplier or z-score cut-off. The default of 1.5
                is shared by both methods.

        Returns:
            The filtered DataFrame.

        Raises:
            ValueError: If ``method`` is neither 'iqr' nor 'zscore'.
        """
        # An unknown method used to be a silent no-op that still reported
        # "0 rows" removed.
        if method not in ('iqr', 'zscore'):
            raise ValueError(f"Outlier method {method} tidak didukung")

        if columns is None:
            columns = data.select_dtypes(include=[np.number]).columns

        initial_rows = data.shape[0]

        for col in columns:
            values = data[col]
            if method == 'iqr':
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                margin = threshold * (q3 - q1)
                keep = (values >= q1 - margin) & (values <= q3 + margin)
            else:  # 'zscore'
                std = values.std()
                # A constant (or single-row) column has a zero/NaN standard
                # deviation; every z-score would be NaN and the filter would
                # throw away all rows. Such a column has no outliers.
                if not std > 0:
                    continue
                keep = ((values - values.mean()) / std).abs() < threshold
            data = data[keep]

        print(f"Outliers dihapus: {initial_rows - data.shape[0]} rows")
        return data
        
    def split_data(self, data, target_column, test_size=0.2, random_state=42):
        """Split data menjadi train dan test"""
        if target_column not in data.columns:
            raise ValueError(f"Target column {target_column} tidak ditemukan")
            
        X = data.drop(target_column, axis=1)
        y = data[target_column]
        
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        
        self.feature_names = list(X.columns)
        
        print(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
        return X_train, X_test, y_train, y_test
        
    def create_fuzzy_sets(self, data, column, n_sets=3, set_names=None):
        """Build evenly spaced, overlapping triangular fuzzy sets for a column.

        The column's [min, max] range is divided into ``n_sets - 1`` equal
        steps. Every set peaks on one of the resulting grid points and reaches
        to the neighbouring grid points; the first and last set are
        "shoulders" whose peak sits on the range boundary.

        Args:
            data: DataFrame containing ``column``.
            column: Name of the column the sets are derived from.
            n_sets: Number of fuzzy sets to create; at least 2.
            set_names: Optional names, one per set. Defaults to
                ``set_1`` ... ``set_<n_sets>``.

        Returns:
            A dict mapping each set name to ``('triangular', [a, b, c])``,
            where ``a``/``c`` are the feet and ``b`` the peak of the triangle.

        Raises:
            ValueError: If ``column`` is missing, ``n_sets`` is below 2, or
                the number of names differs from ``n_sets``.
        """
        if column not in data.columns:
            raise ValueError(f"Column {column} tidak ditemukan")

        # The step size divides by (n_sets - 1); a single set would divide
        # by zero and produce inf/NaN breakpoints.
        if n_sets < 2:
            raise ValueError(f"n_sets harus minimal 2, diberikan {n_sets}")

        if set_names is None:
            set_names = [f"set_{i+1}" for i in range(n_sets)]

        if len(set_names) != n_sets:
            raise ValueError("Jumlah set_names harus sama dengan n_sets")

        col_data = data[column]
        min_val = col_data.min()
        max_val = col_data.max()
        step = (max_val - min_val) / (n_sets - 1)
        last = n_sets - 1

        sets = {}
        for i, name in enumerate(set_names):
            if i == 0:
                # Left shoulder: peak on the minimum.
                points = [min_val, min_val, min_val + step]
            elif i == last:
                # Right shoulder: peak on the maximum.
                points = [max_val - step, max_val, max_val]
            else:
                # Interior set centred on grid point i.
                points = [
                    min_val + (i - 1) * step,
                    min_val + i * step,
                    min_val + (i + 1) * step,
                ]
            sets[name] = ('triangular', points)

        return sets
        
    def plot_data_distribution(self, data, columns=None, figsize=(15, 10)):
        """Draw one histogram per column on a grid of at most four per row.

        Args:
            data: DataFrame holding the columns to plot.
            columns: Column name or list of column names. Defaults to every
                numeric column.
            figsize: Size of the whole figure, passed to ``plt.subplots``.

        Returns:
            None. The figure is shown with ``plt.show()``. When there is no
            column to plot a message is printed and nothing is drawn.
        """
        if columns is None:
            columns = data.select_dtypes(include=[np.number]).columns
        elif isinstance(columns, str):
            columns = [columns]
        columns = list(columns)

        # Without columns the grid size would be zero and the row
        # computation below would divide by zero.
        if not columns:
            print("Tidak ada kolom untuk diplot")
            return

        n_cols = min(4, len(columns))
        n_rows = (len(columns) + n_cols - 1) // n_cols

        # squeeze=False always yields a 2-D array of Axes, so a single plot
        # or a single row needs no special casing.
        _, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
        flat_axes = axes.ravel()

        for ax, col in zip(flat_axes, columns):
            data[col].hist(bins=30, ax=ax, alpha=0.7)
            ax.set_title(f'Distribution of {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')

        # Hide the unused cells of the last row.
        for ax in flat_axes[len(columns):]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()
        
    def correlation_matrix(self, data, figsize=(10, 8)):
        """Show a heatmap of the pairwise correlations of the numeric columns.

        Args:
            data: DataFrame to analyse; non-numeric columns are ignored.
            figsize: Size of the figure.

        Returns:
            The correlation DataFrame that was plotted.
        """
        numeric_only = data.select_dtypes(include=[np.number])

        plt.figure(figsize=figsize)
        corr = numeric_only.corr()

        heatmap_options = dict(
            annot=True,
            cmap='coolwarm',
            center=0,
            square=True,
            linewidths=0.5,
        )
        sns.heatmap(corr, **heatmap_options)
        plt.title('Correlation Matrix')
        plt.tight_layout()
        plt.show()

        return corr
    
    def save_preprocessor(self, filepath):
        """Simpan preprocessor untuk digunakan nanti"""
        import pickle
        
        preprocessor_data = {
            'scaler': self.scaler,
            'label_encoders': self.label_encoders,
            'imputer': self.imputer,
            'feature_names': self.feature_names
        }
        
        with open(filepath, 'wb') as f:
            pickle.dump(preprocessor_data, f)
        print(f"Preprocessor disimpan ke {filepath}")
    
    def load_preprocessor(self, filepath):
        """Muat preprocessor yang sudah disimpan"""
        import pickle
        
        with open(filepath, 'rb') as f:
            preprocessor_data = pickle.load(f)
        
        self.scaler = preprocessor_data['scaler']
        self.label_encoders = preprocessor_data['label_encoders']
        self.imputer = preprocessor_data['imputer']
        self.feature_names = preprocessor_data['feature_names']
        
        print(f"Preprocessor dimuat dari {filepath}")
    
    def transform_new_data(self, data):
        """Transform data baru menggunakan preprocessor yang sudah di-fit"""
        # Apply imputer jika ada
        if self.imputer:
            numeric_cols = data.select_dtypes(include=[np.number]).columns
            data[numeric_cols] = self.imputer.transform(data[numeric_cols])
        
        # Apply label encoders
        for col, encoder in self.label_encoders.items():
            if col in data.columns:
                data[col] = encoder.transform(data[col].astype(str))
        
        # Apply scaler
        if self.scaler:
            numeric_cols = data.select_dtypes(include=[np.number]).columns
            data[numeric_cols] = self.scaler.transform(data[numeric_cols])
        
        return data


In [24]:
# Load the diabetes dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv('diabetes.csv')
    
    

In [25]:
# Choose the features that will be used.
# NOTE(review): this list is not referenced by the cells shown here; the
# `fitur_medis` list defined in the next cell repeats the same names.
selected_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']


   

In [26]:
# Replace 0 with NaN in the medical measurement columns listed here
# (presumably a 0 in these columns marks a missing measurement - confirm).
fitur_medis = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[fitur_medis] = df[fitur_medis].replace(0, np.nan)


In [27]:
# Create the preprocessor instance used by the following cells
preprocessor = DataPreprocessor()

In [28]:
# Fill the missing values of the medical features with each column's median
df = preprocessor.handle_missing_values(df, strategy='median', columns=fitur_medis)

Missing values ditangani dengan strategy: median


In [31]:
# Min-max normalise the medical features together with Age.
# NOTE: this uses a standalone scaler, so it is not stored on `preprocessor`.
scaler = MinMaxScaler()
df[fitur_medis + ['Age']] = scaler.fit_transform(df[fitur_medis + ['Age']])

In [32]:
# Inspect the data after preprocessing
preprocessor.explore_data(df)

=== INFORMASI DATASET ===
Shape: (768, 9)
Columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

=== INFO DATA ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    float64
 8   Outcome                   768 non-null    int64  
dtypes: float64(7), int64(2)
memory usage: 54.1 KB
None

=== STATISTIK DESKRIPTIF ===
       Pre

{'shape': (768, 9),
 'columns': ['Pregnancies',
  'Glucose',
  'BloodPressure',
  'SkinThickness',
  'Insulin',
  'BMI',
  'DiabetesPedigreeFunction',
  'Age',
  'Outcome'],
 'missing_values': Pregnancies                 0
 Glucose                     0
 BloodPressure               0
 SkinThickness               0
 Insulin                     0
 BMI                         0
 DiabetesPedigreeFunction    0
 Age                         0
 Outcome                     0
 dtype: int64,
 'dtypes': Pregnancies                   int64
 Glucose                     float64
 BloodPressure               float64
 SkinThickness               float64
 Insulin                     float64
 BMI                         float64
 DiabetesPedigreeFunction    float64
 Age                         float64
 Outcome                       int64
 dtype: object}

In [33]:
# Show the first rows of the processed data
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,0.670968,0.489796,0.304348,0.133413,0.314928,0.627,0.483333,1
1,1,0.264516,0.428571,0.23913,0.133413,0.171779,0.351,0.166667,0
2,8,0.896774,0.408163,0.23913,0.133413,0.104294,0.672,0.183333,1
3,1,0.290323,0.428571,0.173913,0.096154,0.202454,0.167,0.0,0
4,0,0.6,0.163265,0.304348,0.185096,0.509202,2.288,0.2,1
