# ANALISIS DATASET CATEGORICAL
### Mengimport Library dan Menampilkan Dataset

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("healthcare-dataset-stroke-data.csv")

#### Ketentuan 1 : Mendapatkan dan Memasukkan Informasi ke Dictionary
info_dataset merupakan dictionary {Key : Value} :
1. df.shape[0] merupakan fungsi yang mengembalikan total baris dan kolom, sehingga data pertamanya dapat diambil. 
2. df.columns.values merupakan fungsi yang dapat mengambil nama dari atribut.
3. df.select_dtypes() merupakan fungsi yang digunakan untuk mengambil baris dengan kolom yang yang memiliki data type yang diminta, kemudian ambil kolomnya saja.
4. df.loc merupakan fungsi untuk mengakses data berdasarkan posisi.
5. df.isnull.any() merupakan fungsi yang mengidentifikasi data pada baris kolom tertentu apakah kosong atau tidak

In [5]:
info_dataset = {
  'jumlah_record': df.shape[0],
  'nama_atribut': df.columns.values,
  'atribut_objek': df.select_dtypes(['object']).columns,
  'atribut_angka': df.select_dtypes(['float', 'int']).columns,
  'atribut_nilaiNull': df.loc[:, df.isnull().any()].columns,
}

#### Ketentuan 2 : Menampilkan Informasi 
Membuat memanggil semua key dalam dictionary kemudian menampilkannya didalam perulangan for

In [166]:

print("JUMLAH RECORD :", info_dataset['jumlah_record'])

print("\nNAMA ATRIBUT")
for i in range(12) :
    print("[", i + 1,"]", info_dataset['nama_atribut'][i])

print("\nJUMLAH ATRIBUT DENGAN TIPE OBJEK")
print("Jumlah Atribut Tipe Objek :", len(info_dataset['atribut_objek']))
for i in range(5) :
    print("[", i + 1,"]", info_dataset['atribut_objek'][i])

print("\nJUMLAH ATRIBUT DENGAN TIPE ANGKA")
print("Jumlah Atribut Tipe Angka :", len(info_dataset['atribut_angka']))
for i in range(6) :
    print("[", i + 1,"]", info_dataset['atribut_angka'][i])

print("\nJUMLAH ATRIBUT DENGAN NILAI NULL")
print("Jumlah Atribut Nilai Null :", len(info_dataset['atribut_nilaiNull']))
for i in range(1) :
    print("[", i + 1,"]", info_dataset['atribut_nilaiNull'][i])
    

JUMLAH RECORD : 5110

NAMA ATRIBUT
[ 1 ] id
[ 2 ] gender
[ 3 ] age
[ 4 ] hypertension
[ 5 ] heart_disease
[ 6 ] ever_married
[ 7 ] work_type
[ 8 ] Residence_type
[ 9 ] avg_glucose_level
[ 10 ] bmi
[ 11 ] smoking_status
[ 12 ] stroke

JUMLAH ATRIBUT DENGAN TIPE OBJEK
Jumlah Atribut Tipe Objek : 5
[ 1 ] gender
[ 2 ] ever_married
[ 3 ] work_type
[ 4 ] Residence_type
[ 5 ] smoking_status

JUMLAH ATRIBUT DENGAN TIPE ANGKA
Jumlah Atribut Tipe Angka : 7
[ 1 ] id
[ 2 ] age
[ 3 ] hypertension
[ 4 ] heart_disease
[ 5 ] avg_glucose_level
[ 6 ] bmi

JUMLAH ATRIBUT DENGAN NILAI NULL
Jumlah Atribut Nilai Null : 1
[ 1 ] bmi


#### Ketentuan 3 : Menampilkan  jumlah record, nama attribute, jumlah attribute dengan tipe objek, jumlah attribute dengan tipe angka, jumlah attribute yang memiliki nilai null
df.describe() merupakan fungsi untuk mengembalikan deskripsi data dalam DataFrame. Fungsi ini juga yang menampilkan ketentuan 3 dari analisis dataset.

In [167]:
df.describe() 

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


#### Ketentuan 4 : Menampilkan Nilai Korelasi Atribut bertipe Angka
dropna() merupkan perintah yang menghapus baris yang mana ada data NaN.

corr() digunakan untuk menemukan korelasi berpasangan dari semua kolom dalam Pandas Dataframe dengan Python. Nilai NaN apa pun secara otomatis dikecualikan. Setiap tipe data atau kolom non-numerik dalam Dataframe, diabaikan.


In [168]:
df.dropna().corr()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
id,1.0,0.008984,0.001147,0.004016,0.006409,0.003084,0.004839
age,0.008984,1.0,0.274425,0.257123,0.235838,0.333398,0.232331
hypertension,0.001147,0.274425,1.0,0.115991,0.180543,0.167811,0.142515
heart_disease,0.004016,0.257123,0.115991,1.0,0.154525,0.041357,0.137938
avg_glucose_level,0.006409,0.235838,0.180543,0.154525,1.0,0.175502,0.138936
bmi,0.003084,0.333398,0.167811,0.041357,0.175502,1.0,0.042374
stroke,0.004839,0.232331,0.142515,0.137938,0.138936,0.042374,1.0


#### Ketentuan 5 : Mengubah Nilai Atribut Desimal ke Bilangan Bulat
df.head() menampilkan beberapa data pertama.
data ini sebagai perbandingan sebelum salah satu atributnya diubah.


In [169]:
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


astype(int) digunakan untuk mentransmisikan tipe data kolom (dtype) di objek pandas. Ini berguna ketika ingin mentransmisikan kolom DataFrame dari satu tipe data ke tipe data lainnya.

In [170]:
df[['avg_glucose_level']] = df[['avg_glucose_level']].astype(int)
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174,24.0,never smoked,1


#### Ketentuan 6 : Menampilkan 10 Record Pertama dari Atribut bertipe Angka.
df.select_dtypes() merupakan fungsi yang digunakan untuk mengambil baris dengan kolom yang yang memiliki data type yang diminta, kemudian ambil kolomnya saja

df.head() menampilkan beberapa data pertama.

In [171]:
data_angka = df.select_dtypes(include=np.number)
data_angka.head(10)

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,9046,67.0,0,1,228,36.6,1
1,51676,61.0,0,0,202,,1
2,31112,80.0,0,1,105,32.5,1
3,60182,49.0,0,0,171,34.4,1
4,1665,79.0,1,0,174,24.0,1
5,56669,81.0,0,0,186,29.0,1
6,53882,74.0,1,1,70,27.4,1
7,10434,69.0,0,0,94,22.8,1
8,27419,59.0,0,0,76,,1
9,60491,78.0,0,0,58,24.2,1


#### Ketentuan 7 : Menyimpan Dataset bertipe Angka ke File CSV
df(atribut).to_csv(file) digunakan untuk menyimpan beberapa atribut ke file CSV baru.

In [172]:
df[{'id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke'}].to_csv("healthcare_dataset_number.csv")

  df[{'id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke'}].to_csv("healthcare_dataset_number.csv")
