In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
file = "https://docs.google.com/spreadsheets/d/1hUWzqJ8x8iXWB8hxMcficvav1LIG2mOKY4zx7VowT6g/export?format=csv"
data = pd.read_csv(file)

In [3]:
data

Unnamed: 0,no,nim,nama,alamat,nilai,tanggal_lahir
0,1,2020TI1,Lisa Watson,Kombeli,60.0,10/22/2002
1,2,2020TI2,Amy Weaver,Dongkala,70.0,12/12/1999
2,3,2020TI3,Julie Murray,Kombeli,59.0,4/7/2003
3,4,2020TI4,RUDI,Takimpo,95.0,2/16/2001
4,5,2020TI5,Kevin Hampton,Dongkala,87.0,3/24/2002
...,...,...,...,...,...,...
105,106,2020TI106,,,68.0,9/25/2001
106,107,2020TI107,Jenna Delacruz,Kombeli,77.0,8/30/1999
107,108,2020TI108,,Dongkala,87.0,9/8/2000
108,109,2020TI109,,Takimpo,60.0,3/24/1999


In [21]:
#1Menghapus baris dengan nilai null di kolom 'nama'
data= data.dropna(subset=['nama'])

In [26]:
data

Unnamed: 0,no,nim,nama,alamat,nilai,tanggal_lahir,nilai_normalized
0,1,2020TI1,Lisa Watson,Kombeli,60,10/22/2002,0.612245
1,2,2020TI2,Amy Weaver,Dongkala,70,12/12/1999,0.714286
2,3,2020TI3,Julie Murray,Kombeli,59,4/7/2003,0.602041
3,4,2020TI4,RUDI,Takimpo,95,2/16/2001,0.969388
4,5,2020TI5,Kevin Hampton,Dongkala,87,3/24/2002,0.887755
...,...,...,...,...,...,...,...
99,100,2020TI100,Shannon Guzman,Pasarwajo,70,5/28/2004,0.714286
100,101,2020TI101,Robert Rojas,Takimpo,98,12/28/2003,1.000000
101,102,2020TI102,Andrew Anderson,Takimpo,73,9/1/2001,0.744898
103,104,2020TI104,Daniel Smith,Takimpo,81,8/20/2003,0.826531


In [33]:
#2Mengubah tipe data kolom 'nilai' menjadi integer
data['nilai'] = pd.to_numeric(data['nilai'], errors='coerce').fillna(0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['nilai'] = pd.to_numeric(data['nilai'], errors='coerce').fillna(0).astype(int)


In [34]:
data.dtypes

Unnamed: 0,0
no,int64
nim,object
nama,object
alamat,object
nilai,int64
tanggal_lahir,object
nilai_normalized,float64


In [35]:
#3Mengganti data null di kolom 'alamat' dengan 'Pasarwajo'
data['alamat'] = data['alamat'].fillna('Pasarwajo')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['alamat'] = data['alamat'].fillna('Pasarwajo')


In [36]:
data['alamat']

Unnamed: 0,alamat
0,Kombeli
1,Dongkala
2,Kombeli
3,Takimpo
4,Dongkala
...,...
99,Pasarwajo
100,Takimpo
101,Takimpo
103,Takimpo


In [37]:
#Data Transformation: Normalisasi kolom "nilai"
datanilai = (data['nilai'] - data['nilai'].min()) / (data['nilai'].max() - data['nilai'].min())

In [38]:
datanilai

Unnamed: 0,nilai
0,0.612245
1,0.714286
2,0.602041
3,0.969388
4,0.887755
...,...
99,0.714286
100,1.000000
101,0.744898
103,0.826531


In [77]:
#Data Reduction: Memilih hanya kolom yang relevan
reduced_data = data[['nama', 'alamat', 'nilai_normalized']]

In [78]:
reduced_data

Unnamed: 0,nama,alamat,nilai_normalized
0,Lisa Watson,Kombeli,0.612245
1,Amy Weaver,Dongkala,0.714286
2,Julie Murray,Kombeli,0.602041
3,RUDI,Takimpo,0.969388
4,Kevin Hampton,Dongkala,0.887755
...,...,...,...
99,Shannon Guzman,Pasarwajo,0.714286
100,Robert Rojas,Takimpo,1.000000
101,Andrew Anderson,Takimpo,0.744898
103,Daniel Smith,Takimpo,0.826531


In [63]:
#Data Encoding: Mengubah data kategorikal menjadi numerik
encoder = LabelEncoder()
data.loc[:, 'alamat_encoded'] = encoder.fit_transform(data['alamat'])

In [64]:
data[['alamat', 'alamat_encoded']].head()

Unnamed: 0,alamat,alamat_encoded
0,Kombeli,1
1,Dongkala,0
2,Kombeli,1
3,Takimpo,3
4,Dongkala,0


In [69]:
#Feature Selection: Memilih fitur-fitur tertentu (contoh: 'nilai_normalized' dan 'alamat_encoded')
selected_features = data[['nilai_normalized', 'alamat_encoded']]

In [70]:
selected_features

Unnamed: 0,nilai_normalized,alamat_encoded
0,0.612245,1
1,0.714286,0
2,0.602041,1
3,0.969388,3
4,0.887755,0
...,...,...
99,0.714286,2
100,1.000000,3
101,0.744898,3
103,0.826531,3


In [73]:
#Splitting Data: Membagi data menjadi training dan testing set
X = selected_features  # Fitur
y = data['nama']  # Target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)


In [80]:
print("\nData training dan testing:")
print("X_train:")
print(X_train.head())
print("y_train:")
print(y_train.head())
print("X_test:")
print(X_test.head())
print("y_test:")
print(y_test.head())



Data training dan testing:
X_train:
    nilai_normalized  alamat_encoded
54          0.897959               1
23          0.551020               1
69          0.622449               1
45          0.795918               0
99          0.714286               2
y_train:
54         John Clay
23     Nicolas Hogan
69        Luke Cline
45        Megan Peck
99    Shannon Guzman
Name: nama, dtype: object
X_test:
    nilai_normalized  alamat_encoded
31          0.897959               0
68          0.642857               0
63          0.540816               3
48          0.887755               0
43          0.714286               0
y_test:
31    Michael Simmons II
68         Eric Mccarthy
63        Joseph Sellers
48       Jessica Schmidt
43        Gwendolyn Rice
Name: nama, dtype: object
