In [1]:
# Contoh Data Cleaning dengan Python
import pandas as pd
import numpy as np
# Membuat dataframe contoh
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Eve', np.nan],
'Age': [25, 30, 35, 25, np.nan, 50],
'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Miami', 'Los Angeles']
}
df = pd.DataFrame(data)
# Menampilkan data awal
print("Data Awal:")
print(df)
# Menghapus duplikat
df = df.drop_duplicates()
# Menangani missing values dengan mengisi nilai median untuk kolom numerik
df['Age'] = df['Age'].fillna(df['Age'].median())
# Menghapus baris yang mengandung missing values di kolom 'Name'
df = df.dropna(subset=['Name'])
# Menampilkan data setelah cleaning
print("\nData Setelah Cleaning:")
df

Data Awal:
      Name   Age         City
0    Alice  25.0     New York
1      Bob  30.0  Los Angeles
2  Charlie  35.0      Chicago
3    Alice  25.0     New York
4      Eve   NaN        Miami
5      NaN  50.0  Los Angeles

Data Setelah Cleaning:


Unnamed: 0,Name,Age,City
0,Alice,25.0,New York
1,Bob,30.0,Los Angeles
2,Charlie,35.0,Chicago
4,Eve,32.5,Miami


In [2]:
# Identifikasi Missing Values
import pandas as pd
df = pd.read_csv('data.csv')
print(df.isnull().sum())


Nama     1
Age      1
City     0
dtype: int64


In [3]:
# Mengisi Missing Values df['Age'].fillna(df['Age'].mean())
# Mean df['Age'].fillna(df['Age'].median(), inplace=True) 
# Median df['Age'].fillna(df['Age'].mode()[0], inplace=True)
# Mode


In [4]:
# Forward/Backward Fill
df['Age'].fillna(method='ffill', inplace=True) # Forward fill
df['Age'].fillna(method='bfill', inplace=True) # Backward fill


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(method='ffill', inplace=True) # Forward fill
  df['Age'].fillna(method='ffill', inplace=True) # Forward fill
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(method='bfill', inplace=True) # Backward fill
  df['Age'].fillna(method='bfill', inplace=

In [5]:
# Identifikasi duplikat
print(df.duplicated().sum())
# Menghapus duplikat
df.drop_duplicates(inplace=True)

1


In [7]:
# Langkah 1: Membuat DataFrame
import pandas as pd
import numpy as np

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Eve', np.nan],
    'Age': [25, 30, 35, 25, np.nan, 50],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Miami', 'Los Angeles']
}

df = pd.DataFrame(data)

# Langkah 2: Identifikasi Outliers dengan IQR
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1

# Menentukan batas bawah dan atas untuk outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Menandai outliers
outliers = df[(df['Age'] < lower_bound) | (df['Age'] > upper_bound)]
print("Outliers:\n", outliers)

# Langkah 3: Menangani Outliers

# Menghapus outliers
df_no_outliers = df[(df['Age'] >= lower_bound) & (df['Age'] <= upper_bound)]

# Mengganti outliers dengan nilai median
median_age = df['Age'].median()
df['Age'] = np.where((df['Age'] < lower_bound) | (df['Age'] > upper_bound), median_age, df['Age'])

# Mengisi missing values dengan median
df['Age'].fillna(median_age, inplace=True)
df['Name'].fillna('Unknown', inplace=True)

# Menampilkan hasil akhir
print("\nData setelah penanganan missing values dan outliers:\n", df)

Outliers:
 Empty DataFrame
Columns: [Name, Age, City]
Index: []

Data setelah penanganan missing values dan outliers:
       Name   Age         City
0    Alice  25.0     New York
1      Bob  30.0  Los Angeles
2  Charlie  35.0      Chicago
3    Alice  25.0     New York
4      Eve  30.0        Miami
5  Unknown  50.0  Los Angeles


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Name'].fillna('Unknown', inplace=True)


In [1]:
# Contoh: misalnya, kita memiliki dataset dengan kolom "City" yang berisi data kategorikal: 
import pandas as pd 
data = {'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Miami']} 
df = pd.DataFrame(data) 
# One-Hot Encoding 
df_one_hot = pd.get_dummies(df, columns=['City']) 
print(df_one_hot)

   City_Chicago  City_Los Angeles  City_Miami  City_New York
0         False             False       False           True
1         False              True       False          False
2          True             False       False          False
3         False             False       False           True
4         False             False        True          False


In [2]:
import pandas as pd

import numpy as np 
 
data = { 
'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Eve', np.nan], 
'Age': [25, 30, 35, 25, np.nan, 50], 'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Miami', 'Los Angeles'] 
} 
df = pd.DataFrame(data) 
 
# Menghapus Data Tidak Valid: Jika jumlah data tidak valid kecil dan tidak signifikan, menghapusnya bisa menjadi solusi yang cepat dan mudah. 
df = df.dropna(subset=['Age', 'Name']) # Menghapus baris dengan nilai 'Age' atau 'Name' yang tidak valid

In [3]:
import pandas as pd
import numpy as np

# Contoh data dengan nilai yang hilang (NaN)
data = {
    'Name': ['Alice', np.nan, 'Charlie', 'David', np.nan],
    'Age': [25, np.nan, 30, np.nan, 40]
}

# Membuat DataFrame
df = pd.DataFrame(data)

# Mengisi nilai yang hilang pada kolom 'Age' dengan median
df['Age'] = df['Age'].fillna(df['Age'].median())

# Mengisi nilai yang hilang pada kolom 'Name' dengan nilai default 'Unknown'
df['Name'] = df['Name'].fillna('Unknown')

# Menampilkan DataFrame setelah perbaikan
print(df)

      Name   Age
0    Alice  25.0
1  Unknown  30.0
2  Charlie  30.0
3    David  30.0
4  Unknown  40.0


In [4]:
# Transformasi Data: Mengubah data tidak valid menjadi format yang sesuai. 
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')  # Mengubah nilai 'Age' yang tidak valid menjadi NaN 

In [5]:
# Pembersihan dengan Logika Bisnis: Menggunakan aturan bisnis untuk memperbaiki data. Misalnya, jika usia tidak masuk akal (misalnya lebih dari 120 tahun), maka ubah atau hapus. 
df = df[(df['Age'] >= 0) & (df['Age'] <= 120)]

In [6]:
import pandas as pd 
import numpy as np 
 
data = { 
'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Eve', np.nan], 
'Age': [25, 30, 35, 25, np.nan, 150],  # 150 dianggap sebagai nilai tidak valid 
'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Miami', 'Los Angeles'] 
} 
df = pd.DataFrame(data) 
 
# Identifikasi nilai tidak valid
print("Data Awal:") 
print(df) 
 
# Mengubah nilai 'Age' yang tidak valid menjadi NaN 
df['Age'] = pd.to_numeric(df['Age'], errors='coerce') 
 
# Mengisi nilai 'Age' yang hilang atau tidak valid dengan median 
df['Age'] = df['Age'].fillna(df['Age'].median()) 
 
# Mengisi nilai 'Name' yang hilang dengan 'Unknown' 
df['Name'] = df['Name'].fillna('Unknown') 
 
print("\nData Setelah Menangani Nilai Tidak Valid:") 
print(df)

Data Awal:
      Name    Age         City
0    Alice   25.0     New York
1      Bob   30.0  Los Angeles
2  Charlie   35.0      Chicago
3    Alice   25.0     New York
4      Eve    NaN        Miami
5      NaN  150.0  Los Angeles

Data Setelah Menangani Nilai Tidak Valid:
      Name    Age         City
0    Alice   25.0     New York
1      Bob   30.0  Los Angeles
2  Charlie   35.0      Chicago
3    Alice   25.0     New York
4      Eve   30.0        Miami
5  Unknown  150.0  Los Angeles


In [7]:
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler, StandardScaler 

# Contoh dataset 
data = { 
    'Age': [25, 30, 35, 40, 45], 
    'Salary': [50000, 60000, 70000, 80000, 90000] 
} 
df = pd.DataFrame(data) 

# Min-Max Scaling 
min_max_scaler = MinMaxScaler() 
df_min_max_scaled = pd.DataFrame(min_max_scaler.fit_transform(df), columns=df.columns) 

# Z-Score Normalization 
standard_scaler = StandardScaler() 
df_standard_scaled = pd.DataFrame(standard_scaler.fit_transform(df), columns=df.columns)

# Menampilkan hasil
print("Data Asli:")
print(df)

print("\nMin-Max Scaled Data:")
print(df_min_max_scaled) 

print("\nStandardized Data:")
print(df_standard_scaled)

Data Asli:
   Age  Salary
0   25   50000
1   30   60000
2   35   70000
3   40   80000
4   45   90000

Min-Max Scaled Data:
    Age  Salary
0  0.00    0.00
1  0.25    0.25
2  0.50    0.50
3  0.75    0.75
4  1.00    1.00

Standardized Data:
        Age    Salary
0 -1.414214 -1.414214
1 -0.707107 -0.707107
2  0.000000  0.000000
3  0.707107  0.707107
4  1.414214  1.414214


In [8]:
from sklearn.model_selection import KFold, cross_val_score 
from sklearn.linear_model import LinearRegression 
import numpy as np 
# Contoh dataset 
X = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]]) 
y = np.array([2, 3, 4, 5, 6]) 
# Model 
model = LinearRegression() 
# K-Fold Cross-Validation 
kf = KFold(n_splits=5)
scores = cross_val_score(model, X, y, cv=kf) 
print("K-Fold Cross-Validation Scores:", scores) 
print("Mean Score:", np.mean(scores))

K-Fold Cross-Validation Scores: [nan nan nan nan nan]
Mean Score: nan




In [9]:
# Implementasi K-Fold Cross-Validation di Python 
from sklearn.model_selection import KFold, cross_val_score 
from sklearn.linear_model import LogisticRegression 
from sklearn.datasets import load_iris 
# Load dataset 
iris = load_iris() 
X = iris.data 
y = iris.target 
# Model 
model = LogisticRegression(max_iter=1000) 
# K-Fold Cross-Validation 
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # Misalnya, menggunakan 5 folds dengan shuffle dan seed random 42 
scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy') 
print("K-Fold Cross-Validation Scores:", scores) 
print("Mean Score:", scores.mean())

K-Fold Cross-Validation Scores: [1.         1.         0.93333333 0.96666667 0.96666667]
Mean Score: 0.9733333333333334


In [10]:
import numpy as np 
# Data 
data = np.array([10, 15, 8, 12, 14, 20, 18, 16, 11, 13]) 
# Bootstrap sampling 
n_samples = 1000 
bootstrap_means = np.zeros(n_samples) 
for i in range(n_samples): bootstrap_sample = np.random.choice(data, size=len(data), replace=True) 
bootstrap_means[i] = np.mean(bootstrap_sample) 
# Confidence interval (95%) 
ci_lower = np.percentile(bootstrap_means, 2.5) 
ci_upper = np.percentile(bootstrap_means, 97.5) 
print("Mean:", np.mean(data)) 
print("95% Confidence Interval:", ci_lower, "-", ci_upper)

Mean: 13.7
95% Confidence Interval: 0.0 - 0.0


In [11]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, confusion_matrix 
from sklearn.datasets import load_iris 
from sklearn.linear_model import LogisticRegression 
 
# Load dataset 
iris = load_iris() 
X = iris.data 
y = iris.target 
 
# Bagi dataset menjadi data pelatihan dan data pengujian 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

# Inisialisasi dan latih model 
model = LogisticRegression(max_iter=1000) 
model.fit(X_train, y_train) 
 
# Prediksi dengan data pengujian 
y_pred = model.predict(X_test) 

# Evaluasi kinerja model 
accuracy = accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 

print("Accuracy:", accuracy) 
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [12]:
# Contoh Confussion Matrix di NLP 
# Impor library yang diperlukan 
import nltk 
from nltk.corpus import movie_reviews 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split 
from sklearn.svm import LinearSVC 
from sklearn.metrics import confusion_matrix, classification_report 
 
# Unduh dataset sentimen ulasan film dari NLTK 
nltk.download('movie_reviews') 
 
# Ambil ulasan dan label dari dataset 
documents = [(list(movie_reviews.words(fileid)), category) 
for category in movie_reviews.categories() 
for fileid in movie_reviews.fileids(category)] 
 
# Pisahkan teks ulasan dan label 
texts = [' '.join(document) for document, category in documents]
labels = [category for document, category in documents] 
 
# Ubah teks menjadi vektor fitur TF-IDF 
vectorizer = TfidfVectorizer(max_features=5000) 
X = vectorizer.fit_transform(texts) 
 
# Bagi dataset menjadi data pelatihan dan data pengujian 
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, 
random_state=42) 
 
# Inisialisasi dan latih model klasifikasi (misalnya, Linear SVM) 
classifier = LinearSVC() 
classifier.fit(X_train, y_train) 
 
# Prediksi kelas pada data pengujian 
y_pred = classifier.predict(X_test) 
 
# Evaluasi model menggunakan confusion matrix 
conf_matrix = confusion_matrix(y_test, y_pred) 
print("Confusion Matrix:") 
print(conf_matrix) 
 
# Evaluasi model menggunakan classification report 
report = classification_report(y_test, y_pred) 
print("\nClassification Report:") 
print(report)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Confusion Matrix:
[[170  29]
 [ 37 164]]

Classification Report:
              precision    recall  f1-score   support

         neg       0.82      0.85      0.84       199
         pos       0.85      0.82      0.83       201

    accuracy                           0.83       400
   macro avg       0.84      0.84      0.83       400
weighted avg       0.84      0.83      0.83       400

