In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the Seattle weather CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
data = pd.read_csv("seattle-weather.csv")

# Preview the first five rows
display(data.head(5))

# Print the DataFrame summary (columns, dtypes, non-null counts)
data.info()

In [None]:
# Report the number of missing values in each column
print("=== CEK DATA NULL ===")
print(data.isna().sum())

# Count fully duplicated rows and report the total
n_duplicates = data.duplicated().sum()
print("\nJumlah duplikat:", n_duplicates)

# Drop duplicate rows, if any (a no-op when there are none, so the cell
# can be re-run safely)
data = data.drop_duplicates()


In [None]:
# Select the feature columns (X) and the target label column (y)
feature_columns = ['precipitation', 'temp_max', 'temp_min', 'wind']
X = data[feature_columns]
y = data['weather']

# Encode the string labels as integers; `le` is kept so the integer codes
# can be mapped back to label names later with inverse_transform
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Sanity check: show the first five (label, code) pairs
print("Contoh encoding:", list(zip(y.iloc[:5], y_encoded[:5])))


In [None]:
# Class distribution before balancing
print("=== Distribusi Sebelum Balancing ===")
print(pd.Series(y_encoded).value_counts())

# Oversample with SMOTE, using a fixed seed so the synthetic rows are reproducible.
# NOTE(review): SMOTE is fitted on the full dataset here. If a train/test split
# is performed later on X_resampled / y_resampled, synthetic samples derived
# from test rows can leak into training -- confirm against downstream cells.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Class distribution after SMOTE
print("\n=== Distribusi Setelah SMOTE ===")
print(pd.Series(y_resampled).value_counts())

# Rebuild a single DataFrame: resampled features plus the labels decoded
# back to their original names
balanced_data = pd.DataFrame(X_resampled, columns=X.columns).assign(
    weather=le.inverse_transform(y_resampled)
)

# Save the balanced dataset to Excel (without the index column)
output_path = "seattle_weather_balanced.xlsx"
balanced_data.to_excel(output_path, index=False)

