# Tugas Project ML 👨‍💻👩‍💻

## 🙆‍♂️🙆‍♀️ Anggota Kelompok:
1. Agnes Triana Cyntianesa	24060120120021
2. Arifa Alif Malicha Khairunnisa	24060120120033
3. Dimas Wahyu Ardiyanto	24060120140159
4. Muhammad Hafizh Roihan	24060120130123
5. Vito Ahmad Husein	24060120140111
6. Zara Zetira Puti	24060120120030

## 🕵️‍♂️ Link Deskripsi Dataset
https://archive.ics.uci.edu/ml/datasets/Lymphography
## 👩‍🏫 Link Data Dataset
https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data

## 📚 Import Library 


In [1]:
# Pandas is a Python library for data analysis
import pandas
# Pickle can transform a complex object into a byte stream and it can transform the byte stream into an object with the same internal structure
import pickle
# model_selection from sklearn is for split arrays or matrices into random train and test subsets
from sklearn import model_selection

## 📋 Menyiapkan Dataset
- Lymphography is an x-ray study of lymph nodes and lymphatic vessels made visible by the injection of a special dye.
- Domain berfokus pada pemeriksaan pembuluh dan kelenjar limfa secara radiologis.
- This lymphography domain was obtained from the University Medical Centre, Institute of Oncology, Ljubljana, Yugoslavia.
- Data Set Characteristics:  Multivariate
- Attribute Characteristics: Categorical
- Associated Tasks: Classification
- Number of Instances:148
- Number of Attributes:18
- Date Donated: November 1, 1988

In [2]:
# Mendefinisikan link dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/lymphography/lymphography.data"

# Mendefinisikan list berisi kolom - kolom dataset
names = ['class','lymphatic','bof','bolc','bols','bp','e','r','eu','lnd','lnu','cil','din','cin','cis','sf','do','eon','nodes']
dataset = pandas.read_csv(url, names=names)

## 🔎 Melihat Data dalam Dataset

In [3]:
dataset.head(10)

Unnamed: 0,class,lymphatic,bof,bolc,bols,bp,e,r,eu,lnd,lnu,cil,din,cin,cis,sf,do,eon,nodes
0,3,4,2,1,1,1,1,1,2,1,2,2,2,4,8,1,1,2,2
1,2,3,2,1,1,2,2,1,2,1,3,3,2,3,4,2,2,2,2
2,3,3,2,2,2,2,2,2,2,1,4,3,3,4,8,3,2,2,7
3,3,3,1,1,1,1,2,1,2,1,3,3,4,4,4,3,1,2,6
4,2,3,1,1,1,1,1,1,1,1,2,2,4,3,5,1,2,2,1
5,2,2,1,1,1,1,1,1,2,1,3,3,3,3,6,3,1,2,4
6,2,2,2,1,1,1,1,1,2,1,2,3,2,3,8,2,1,1,1
7,2,3,2,1,1,1,2,1,2,1,2,2,2,2,1,3,1,1,1
8,3,2,2,1,1,1,1,1,2,1,3,2,2,2,8,3,1,2,5
9,3,2,1,1,1,1,1,1,2,1,2,2,3,3,5,3,1,1,2


## 🌏 Melihat Distribusi Class Dataset

In [4]:
# Distribusi kelas data
dataset.groupby('class').size()

class
1     2
2    81
3    61
4     4
dtype: int64

## 🚧 Membagi Dataset 
- 80% digunakan untuk melatih model (Training datasets)
- 20% digunakan untuk data validasi dan menghitung keakuratan model (Validation datasets)


In [5]:
# menyimpan nilai-nilai dalam dataset ke dalam variabel array
array = dataset.values

# menyimpan data fitur- fitur pada dataset ke dalam variabel X
# Data fitur terdapat pada indeks 0-18 (termasuk label)
X = array[:,0:18]

# menyimpan data label pada dataset ke dalam variabel Y
# Data label terdapat pada indeks 0
Y = array[:,0]

# mendefinisikan ukuran testing data dan seed untuk random state
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

## 🔑 Melihat Label Data dalam Validation Dataset

In [6]:
print(Y_validation)

[2 3 3 3 2 3 2 3 2 3 2 2 2 2 2 2 2 2 2 3 3 2 2 3 3 3 2 3 2 2]


## 🔨 Membangun Model
#### Algoritma yang akan dicoba untuk menemukan model terbaik antara lain:
1. Nearest Neighbors - KNeighborsClassifier
2. Naive Bayes - Gaussian Naive Bayes
3. Support Vector Machine - SVC
4. Stochastic Gradient Descent - SGDClassifier
5. Decision Tree - DecisionTreeClassifier

In [7]:
# Import library berisi algoritma klasifikasi yang akan digunakan
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# mendefinisikan array kosong
models = []

# membuat array asosiatif berisi nama algoritma dan algoritma klasifikasinya
models.append(('KNN', KNeighborsClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('SGD', SGDClassifier()))
models.append(('Tree', DecisionTreeClassifier()))

## ⚡ K-Folds Cross Validation
- Random state = 7 
- Penilaian model berdasarkan akurasi

In [8]:
# Mendefinisikan seed dan scoring yang digunakan untuk menguji model dengan 
# validation data
seed = 7
scoring = 'accuracy'

# Menyimpan hasil pengujian
results = []
names = []

# membagi data menjadi training dan validation set menggunakan K-Fold cross
# validation dengan k = 10
for name, model in models:
  kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
  cv_results = model_selection.cross_val_score(model, X_train, Y_train,
                                               cv=kfold, scoring=scoring)
  
  # menguji akurasi dari masing-masing model menggunakan validation data
  results.append(cv_results)
  names.append(name)
  msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
  print(msg)

KNN: 0.805303 (0.158804)
NB: 0.958333 (0.076830)
SVM: 0.925000 (0.126106)
SGD: 0.890909 (0.098787)
Tree: 1.000000 (0.000000)


## 🏆 Menghitung Keakuratan Model Terbaik dengan Data Validasi

In [9]:
# import library untuk mengevaluasi algoritma
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# mendefinisikan KNN
bestModel = DecisionTreeClassifier()

# melatih data training dengan algoritma dari model terbaik
bestModel.fit(X_train, Y_train)

# menguji model klasifikasi yang dihasilkan dari proses pelatihan data dengan
# testing data
predictions = bestModel.predict(X_validation)

print("Akurasi: ", accuracy_score(Y_validation, predictions))
print("Confusion Matrix:\n", confusion_matrix(Y_validation, predictions), "\n")
print("Classification Report:\n", classification_report(Y_validation, predictions))

Akurasi:  1.0
Confusion Matrix:
 [[18  0]
 [ 0 12]] 

Classification Report:
               precision    recall  f1-score   support

           2       1.00      1.00      1.00        18
           3       1.00      1.00      1.00        12

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## 🚚 Export Model untuk Dipakai di Website

In [10]:
# Hasil dump akan tampil jika dilakukan lewat Jupyter Notebook
filename='best_model.pkl'
pickle.dump(bestModel, open(filename,'wb'))

## 🛸 Cara Alternatif Export Model (Khusus Colab)

In [None]:
# Jika dilakukan lewat Google Colab
# import gc
# import pickle
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

# Export Pickle File
# pick_insert = open('drive/My Drive/data.pickle','wb')
# pickle.dump(data, pick_insert)
# pick_insert.close()

# Import Pickle File (Tidak perlu dipakai)
# pick_read = open('drive/My Drive/data.pickle','rb')
# data = pickle.load(pick_read)
# pick_read.close()

## 🚨 Install Library Khusus Jupyter Notebook

In [3]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install -U scikit-learn scipy matplotlib

Collecting scikit-learnNote: you may need to restart the kernel to use updated packages.

  Downloading scikit_learn-1.1.3-cp311-cp311-win_amd64.whl (7.5 MB)
     ---------------------------------------- 7.5/7.5 MB 3.6 MB/s eta 0:00:00
Collecting scipy
  Downloading scipy-1.9.3-cp311-cp311-win_amd64.whl (39.9 MB)
     ---------------------------------------- 39.9/39.9 MB 3.6 MB/s eta 0:00:00
Collecting matplotlib
  Using cached matplotlib-3.6.2-cp311-cp311-win_amd64.whl (7.2 MB)
Collecting joblib>=1.0.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     -------------------------------------- 298.0/298.0 kB 2.6 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.0.6-cp311-cp311-win_amd64.whl (163 kB)
Collecting cycler>=0.10
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.38.0-py3-none-any.whl (965 kB)
Collect