In [3]:
# Import library
from collections import Counter

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Function to find the most occurring Label
def get_label(neighbours, y):
    """Return the majority label among the given neighbours.

    Works for any set of hashable labels, not only 'Sehat' / 'Tidak Sehat'.

    Args:
        neighbours: Sequence of ``[distance, index]`` pairs. Only the index
            (position 1 of each pair) is read here. The pairs are expected
            in nearest-first order, which is how ``find_nearest`` passes
            them.
        y: Sequence of labels, indexable by the neighbour indices.

    Returns:
        The label that occurs most often among the neighbours. When several
        labels share the highest count, the first of those labels in
        neighbour order (i.e. the one carried by the closest neighbour)
        is returned.

    Raises:
        IndexError: If ``neighbours`` is empty.
    """
    labels = [y[element[1]] for element in neighbours]
    if not labels:
        raise IndexError("neighbours must contain at least one element")

    counts = Counter(labels)
    highest = max(counts.values())
    # Walk the labels in neighbour order so a tie is resolved in favour of
    # the closest neighbour instead of an arbitrary label ordering.
    return next(label for label in labels if counts[label] == highest)

# Function to calculate Euclidean Distance
def distance(element1, element2):
    """Return the Euclidean distance between two points in the plane.

    Only the coordinates at positions 0 and 1 of each point are used; any
    further coordinates are ignored.

    Args:
        element1: Indexable point; positions 0 and 1 are read.
        element2: Indexable point; positions 0 and 1 are read.

    Returns:
        The square root of the sum of the squared coordinate differences.
    """
    squared_deltas = [(element1[axis] - element2[axis]) ** 2 for axis in (0, 1)]
    return (squared_deltas[0] + squared_deltas[1]) ** 0.5

# Function to find the distance between input point and all points in the dataset
def find_nearest(x, y, input, k):
    """Classify ``input`` by majority vote among its ``k`` nearest points.

    Args:
        x: Iterable of points; each one is compared to ``input`` with
            ``distance``.
        y: Labels, indexable by a point's position in ``x``.
        input: The query point. The name shadows the builtin ``input``
            inside this function; it is kept so that existing keyword
            callers keep working.
        k: Number of nearest neighbours that take part in the vote. Must be
            at least 1.

    Returns:
        A tuple ``(predicted_label, nearest, rest)``. ``nearest`` holds the
        ``k`` closest ``[distance, index]`` pairs in ascending order and
        ``rest`` holds all remaining pairs, also in ascending order.

    Raises:
        ValueError: If ``k`` is smaller than 1. A negative ``k`` would
            otherwise be interpreted as a from-the-end slice bound and
            silently select the wrong neighbours.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Pairs are [distance, index]; sorting orders them by distance first and
    # falls back to the index when two distances are equal.
    distances = sorted(
        [distance(input, element), index] for index, element in enumerate(x)
    )
    nearest, rest = distances[:k], distances[k:]
    return get_label(nearest, y), nearest, rest

# Load the beverage dataset from the CSV file into a DataFrame.
df = pd.read_csv('Dataset - 8.csv')

# Print a quick overview: dimensions, column names, class distribution and
# the first rows.
print("Dataset Shape:", df.shape)
print("Kolom:", list(df.columns))
print("\nDistribusi Status Sehat:", df['Status Sehat'].value_counts(), sep="\n")
print("\nSample data:", df.head(), sep="\n")

# New beverage to classify.
n_gula, n_kalori = 20, 150  # sugar in grams, calories

# Scatter plot of the dataset, coloured by health status. Per the original
# note the chart is expected to open in the browser; where it actually
# appears depends on the active Plotly renderer.
fig = px.scatter(
    df,
    x='Gula (g)',
    y='Kalori',
    color='Status Sehat',
    hover_data=['Nama Minuman', 'Jenis'],
    width=800,
    height=400,
    title='Visualisasi Dataset Minuman',
)

# Overlay the new beverage as a large red star.
fig.add_trace(
    go.Scatter(
        x=[n_gula],
        y=[n_kalori],
        name="Point to Classify",
        marker={'size': 15, 'color': 'red', 'symbol': 'star'},
    )
)
fig.show()

# Find the k = 3 nearest neighbours of the new beverage.
x = df[['Gula (g)', 'Kalori']].to_numpy()
y = df['Status Sehat'].tolist()
k = 3
# Named query_point rather than `input` so the builtin input() stays usable
# after this code has run.
query_point = (n_gula, n_kalori)


def _neighbours_to_frame(neighbours, features, labels):
    """Build a DataFrame (id, features, distance, label) from [distance, index] pairs."""
    rows = [
        [index, features[index, 0], features[index, 1], dist, labels[index]]
        for dist, index in neighbours
    ]
    return pd.DataFrame(
        rows, columns=['id', 'Gula (g)', 'Kalori', 'Jarak', 'Status Sehat']
    )


predicted_label, nearest_neighbours, far_neighbours = find_nearest(x, y, query_point, k)
nearest_neighbours = _neighbours_to_frame(nearest_neighbours, x, y)

# Report the k nearest neighbours and the resulting prediction.
print(f"\n=== HASIL KNN dengan K={k} ===")
print("Nearest Neighbours:")
print(nearest_neighbours)
print("")
print('Prediksi Status Sehat untuk minuman baru dengan:')
print(f'- Gula: {n_gula} gram')
print(f'- Kalori: {n_kalori}')
print(f'Hasil prediksi: {predicted_label}')

# Visualise the prediction.

# All points that are not among the k nearest neighbours.
far_neighbours = _neighbours_to_frame(far_neighbours, x, y)

# New figure fig2, starting from the far neighbours.
fig2 = px.scatter(far_neighbours, x='Gula (g)', y='Kalori', color='Status Sehat',
                  width=800, height=400, title='Visualisasi KNN Prediction')

# Draw a dashed line from the new point to each of its nearest neighbours.
for _, neighbour in nearest_neighbours.iterrows():
    fig2.add_trace(
        go.Scatter(x=[query_point[0], neighbour['Gula (g)']],
                   y=[query_point[1], neighbour['Kalori']],
                   mode='lines+markers',
                   name=f'Neighbour {int(neighbour["id"])} - {neighbour["Status Sehat"]}',
                   line=dict(dash='dash'))
    )

# Add the point being classified, labelled with its predicted status.
fig2.add_trace(go.Scatter(x=[query_point[0]], y=[query_point[1]],
                          name=f"New Point - Predicted: {predicted_label}",
                          marker=dict(size=15, color='red', symbol='star')))

fig2.show()

# Additional analysis: mean sugar and mean calories per health status.
print("\n=== ANALISIS TAMBAHAN ===")
for status in ('Sehat', 'Tidak Sehat'):
    # Filter once per status instead of rebuilding the mask for every metric.
    subset = df[df['Status Sehat'] == status]
    # The report wording uses the lowercased status name.
    status_text = status.lower()
    print(f"Rata-rata gula minuman {status_text}: {subset['Gula (g)'].mean():.2f} gram")
    print(f"Rata-rata kalori minuman {status_text}: {subset['Kalori'].mean():.2f}")

Dataset Shape: (1000, 5)
Kolom: ['Nama Minuman', 'Jenis', 'Gula (g)', 'Kalori', 'Status Sehat']

Distribusi Status Sehat:
Status Sehat
Tidak Sehat    504
Sehat          496
Name: count, dtype: int64

Sample data:
        Nama Minuman Jenis  Gula (g)  Kalori Status Sehat
0           Kopi her   Jus        15     234  Tidak Sehat
1      Soda indicate   Teh        36       6  Tidak Sehat
2  Soda professional   Jus        23      27        Sehat
3          Jus occur   Teh        17     193        Sehat
4      Jus attention  Kopi        30     184  Tidak Sehat



=== HASIL KNN dengan K=3 ===
Nearest Neighbours:
    id  Gula (g)  Kalori     Jarak Status Sehat
0  968        19     150  1.000000        Sehat
1  106        21     152  2.236068  Tidak Sehat
2  823        22     152  2.828427  Tidak Sehat

Prediksi Status Sehat untuk minuman baru dengan:
- Gula: 20 gram
- Kalori: 150
Hasil prediksi: Tidak Sehat



=== ANALISIS TAMBAHAN ===
Rata-rata gula minuman sehat: 19.62 gram
Rata-rata kalori minuman sehat: 125.88
Rata-rata gula minuman tidak sehat: 19.85 gram
Rata-rata kalori minuman tidak sehat: 118.03
