import library yang diperlukan

- pandas: Untuk membaca dan memanipulasi data.
- sklearn.model_selection: Untuk membagi dataset dan melakukan validasi silang.
- sklearn.tree: Untuk membangun dan melatih model pohon keputusan.
- sklearn.metrics: Untuk mengevaluasi performa model.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#### Langkah 1: Pilih dataset dari Kaggle

In [2]:
data = pd.read_csv('Student_performance_prediction.csv')

In [3]:
print(data)

     gender race/ethnicity parental level of education         lunch  \
0    female        group D                some college      standard   
1      male        group D          associate's degree      standard   
2    female        group D                some college  free/reduced   
3      male        group B                some college  free/reduced   
4    female        group D          associate's degree      standard   
..      ...            ...                         ...           ...   
995    male        group C                some college      standard   
996    male        group C                some college      standard   
997  female        group A                 high school      standard   
998    male        group E                 high school      standard   
999    male        group D                 high school      standard   

    test preparation course  math score  reading score  writing score  
0                 completed          59             70         

##### Langkah 1.1: Proses cleaning data
- Cek informasi awal dataset

In [4]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None


In [6]:
print(data.isnull().sum())

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [7]:
print(data[data.duplicated()])

Empty DataFrame
Columns: [gender, race/ethnicity, parental level of education, lunch, test preparation course, math score, reading score, writing score]
Index: []


In [8]:
data['math score'] = data['math score'].astype(int)
data['reading score'] = data['reading score'].astype(int)
data['writing score'] = data['writing score'].astype(int)

In [9]:
data_encoded = pd.get_dummies(data, columns=['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course'])

In [10]:
data_encoded.to_csv('cleaned_data.csv', index=False)

print("Pembersihan data selesai dan dataset telah disimpan sebagai 'cleaned_data.csv'.")

Pembersihan data selesai dan dataset telah disimpan sebagai 'cleaned_data.csv'.


In [11]:
print(data_encoded.head())

   math score  reading score  writing score  gender_female  gender_male  \
0          59             70             78           True        False   
1          96             93             87          False         True   
2          57             76             77           True        False   
3          70             70             63          False         True   
4          83             85             86           True        False   

   race/ethnicity_group A  race/ethnicity_group B  race/ethnicity_group C  \
0                   False                   False                   False   
1                   False                   False                   False   
2                   False                   False                   False   
3                   False                    True                   False   
4                   False                   False                   False   

   race/ethnicity_group D  race/ethnicity_group E  \
0                    True        

In [12]:
data_encoded = data_encoded.astype(int)

In [13]:
print(data_encoded)

     math score  reading score  writing score  gender_female  gender_male  \
0            59             70             78              1            0   
1            96             93             87              0            1   
2            57             76             77              1            0   
3            70             70             63              0            1   
4            83             85             86              1            0   
..          ...            ...            ...            ...          ...   
995          77             77             71              0            1   
996          80             66             66              0            1   
997          67             86             86              1            0   
998          80             72             62              0            1   
999          58             47             45              0            1   

     race/ethnicity_group A  race/ethnicity_group B  race/ethnicity_group C

In [14]:
data_encoded.to_csv('cleaned_data.csv', index=False)

print("Pembersihan data selesai dan dataset telah disimpan sebagai 'cleaned_data.csv'.")

Pembersihan data selesai dan dataset telah disimpan sebagai 'cleaned_data.csv'.


In [15]:
print(data_encoded.head())

   math score  reading score  writing score  gender_female  gender_male  \
0          59             70             78              1            0   
1          96             93             87              0            1   
2          57             76             77              1            0   
3          70             70             63              0            1   
4          83             85             86              1            0   

   race/ethnicity_group A  race/ethnicity_group B  race/ethnicity_group C  \
0                       0                       0                       0   
1                       0                       0                       0   
2                       0                       0                       0   
3                       0                       1                       0   
4                       0                       0                       0   

   race/ethnicity_group D  race/ethnicity_group E  \
0                       1        

In [16]:
print(data_encoded.dtypes)

math score                                        int32
reading score                                     int32
writing score                                     int32
gender_female                                     int32
gender_male                                       int32
race/ethnicity_group A                            int32
race/ethnicity_group B                            int32
race/ethnicity_group C                            int32
race/ethnicity_group D                            int32
race/ethnicity_group E                            int32
parental level of education_associate's degree    int32
parental level of education_bachelor's degree     int32
parental level of education_high school           int32
parental level of education_master's degree       int32
parental level of education_some college          int32
parental level of education_some high school      int32
lunch_free/reduced                                int32
lunch_standard                                  

In [17]:
data['average_score'] = data[['math score', 'reading score', 'writing score']].mean(axis=1)
data['performance'] = data['average_score'].apply(lambda x: 1 if x >= 60 else 0)

In [22]:
x = data.drop(columns=["average_score", "performance", "math score", "reading score", "writing score"])  # Atribut
y = data["performance"]  # Label

In [23]:
print(x)

     gender race/ethnicity parental level of education         lunch  \
0    female        group D                some college      standard   
1      male        group D          associate's degree      standard   
2    female        group D                some college  free/reduced   
3      male        group B                some college  free/reduced   
4    female        group D          associate's degree      standard   
..      ...            ...                         ...           ...   
995    male        group C                some college      standard   
996    male        group C                some college      standard   
997  female        group A                 high school      standard   
998    male        group E                 high school      standard   
999    male        group D                 high school      standard   

    test preparation course  
0                 completed  
1                      none  
2                      none  
3              

In [24]:
print(y)

0      1
1      1
2      1
3      1
4      1
      ..
995    1
996    1
997    1
998    1
999    0
Name: performance, Length: 1000, dtype: int64


#### Langkah 3: Membagi data menjadi data training dan data testing
- menggunakan 70% data untuk training dan 30% untuk testing untuk memastikan keseimbangan antara pelatihan dan pengujian.

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

In [31]:
len(x_test)

300

In [32]:
len(y_train)

700

In [35]:
x_encoded = pd.get_dummies(x)

In [37]:
x_train_encoded, x_test_encoded, y_train, y_test = train_test_split(x_encoded, y, test_size=0.3, random_state=1)

In [39]:
print(x_encoded)

     gender_female  gender_male  race/ethnicity_group A  \
0             True        False                   False   
1            False         True                   False   
2             True        False                   False   
3            False         True                   False   
4             True        False                   False   
..             ...          ...                     ...   
995          False         True                   False   
996          False         True                   False   
997           True        False                    True   
998          False         True                   False   
999          False         True                   False   

     race/ethnicity_group B  race/ethnicity_group C  race/ethnicity_group D  \
0                     False                   False                    True   
1                     False                   False                    True   
2                     False                   False   

- Membuat model Decision Tree

In [42]:
data_model = DecisionTreeClassifier(random_state=1)


- Melatih model

In [43]:
data_model.fit(x_train_encoded, y_train)

- Memprediksi dengan data uji

In [44]:
y_pred = data_model.predict(x_test_encoded)

- Evaluasi model

In [49]:
accuracy = accuracy_score(y_test, y_pred)
print("Ketepatan: ", accuracy)

Ketepatan:  0.66


In [50]:
scores = cross_val_score(data_model, x, y, cv=5)

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Ali\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ali\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Ali\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\tree\_classes.py", line 1009, in fit
    super()._fit(
  File "C:\Users\Ali\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\tree\_classes.py", line 252, in _fit
    X, y = self._validate_data(
  File "C:\Users\Ali\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 645, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "C:\Users\Ali\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 997, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "C:\Users\Ali\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_array_api.py", line 521, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "C:\Users\Ali\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: 'female'
