## Import All Necessary Packages/Libraries

In [1]:
import pandas as pd

## Data Wrangling

### Gathering Data

In [24]:
# Load the raw study-program dataset. Forward slashes keep the path portable
# across operating systems and avoid the invalid "\p" escape sequences that a
# backslash-separated string literal would contain.
prodi_df = pd.read_csv("data/prodi/program-studi.csv")
prodi_df.head()

Unnamed: 0,No,Nama Prodi,Nama PT,Jenjang,LLDikti,Unnamed: 5
0,1,ADMINISTRASI RUMAH SAKIT,AKADEMI ADMINISTRASI RUMAH SAKIT MATARAM,D-III,8.0,
1,2,Akuntansi,Akademi Akuntansi (AKTAN) Boekittinggi,D-III,10.0,
2,3,Akuntansi,Akademi Akuntansi Bandung,D-III,4.0,
3,4,Akuntansi,Akademi Akuntansi Bina Insani,D-III,4.0,
4,5,Akuntansi,Akademi Akuntansi Borobudur,D-III,3.0,


### Assessing Data 

In [25]:
# Show the (rows, columns) shape of the freshly loaded frame.
print(prodi_df.shape)

(25538, 6)


In [26]:
# Summarize each column's dtype and non-null count.
prodi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25538 entries, 0 to 25537
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   No          25538 non-null  int64  
 1   Nama Prodi  25538 non-null  object 
 2   Nama PT     25408 non-null  object 
 3   Jenjang     22222 non-null  object 
 4   LLDikti     11893 non-null  float64
 5   Unnamed: 5  1 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 1.2+ MB


In [27]:
# Count missing values per column.
prodi_df.isna().sum()

No                0
Nama Prodi        0
Nama PT         130
Jenjang        3316
LLDikti       13645
Unnamed: 5    25537
dtype: int64

There are missing values in the 'Nama PT', 'Jenjang', 'LLDikti', and 'Unnamed: 5' columns.

In [28]:
# 'No' appears to be a running row number (TODO confirm). If so, it makes every
# row unique and the duplicate check vacuous, so it is excluded from the comparison.
duplicate_count = prodi_df.drop(columns=['No'], errors='ignore').duplicated().sum()
print("Jumlah duplikasi data = ", duplicate_count)

Jumlah duplikasi data =  0


There is no duplicate data in this dataset.

### Cleaning Data

First, we drop the 'LLDikti' and 'Unnamed: 5' columns because the data in these columns is not used.

In [29]:
# Columns that are not used in the rest of the notebook. errors='ignore' makes
# the drop a no-op for any column that is already absent, so this cell can be
# re-run safely without pre-filtering the list.
columns_to_drop = ['LLDikti', 'Unnamed: 5']
prodi_df = prodi_df.drop(columns=columns_to_drop, errors='ignore')

prodi_df.head()

Unnamed: 0,No,Nama Prodi,Nama PT,Jenjang
0,1,ADMINISTRASI RUMAH SAKIT,AKADEMI ADMINISTRASI RUMAH SAKIT MATARAM,D-III
1,2,Akuntansi,Akademi Akuntansi (AKTAN) Boekittinggi,D-III
2,3,Akuntansi,Akademi Akuntansi Bandung,D-III
3,4,Akuntansi,Akademi Akuntansi Bina Insani,D-III
4,5,Akuntansi,Akademi Akuntansi Borobudur,D-III


In [30]:
# Re-check the column list, dtypes and non-null counts.
prodi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25538 entries, 0 to 25537
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   No          25538 non-null  int64 
 1   Nama Prodi  25538 non-null  object
 2   Nama PT     25408 non-null  object
 3   Jenjang     22222 non-null  object
dtypes: int64(1), object(3)
memory usage: 798.2+ KB


Drop the rows whose 'Jenjang' is 'S2', 'S2 Terapan', or 'S3' because these levels are not used.

In [31]:
# Count missing values per column before filtering on 'Jenjang'.
prodi_df.isna().sum()

No               0
Nama Prodi       0
Nama PT        130
Jenjang       3316
dtype: int64

In [32]:
# Education levels to exclude, written in lowercase because the comparison
# below is case-insensitive.
jenjang_to_remove = ['s2', 's2 terapan', 's3']
# Normalize whitespace and case before matching so padded values such as "S2 "
# are removed as well. Rows with a missing 'Jenjang' never match and are kept.
jenjang_normalized = prodi_df['Jenjang'].str.strip().str.lower()
prodi_df = prodi_df[~jenjang_normalized.isin(jenjang_to_remove)]

In [33]:
# Show the (rows, columns) shape after the 'Jenjang' filter.
print(prodi_df.shape)

(22107, 4)


In [34]:
# Count the missing values that remain in each column.
prodi_df.isna().sum()

No               0
Nama Prodi       0
Nama PT        130
Jenjang       3316
dtype: int64

Handle missing values by removing every row that contains at least one missing value.

In [35]:
# Discard every row that has a missing value in any column.
prodi_df = prodi_df.dropna(axis=0, how='any')

In [36]:
# Show the (rows, columns) shape after removing incomplete rows.
print(prodi_df.shape)

(18791, 4)


In [37]:
# Confirm that no missing values remain in any column.
prodi_df.isna().sum()

No            0
Nama Prodi    0
Nama PT       0
Jenjang       0
dtype: int64

Clean the S1 subset: select the rows whose 'Jenjang' is 'S1' and strip surrounding whitespace from the text columns.

In [38]:
# Select the rows whose 'Jenjang' is 'S1', ignoring case and surrounding
# whitespace. The explicit copy makes prodi_df_s1 an independent frame, so
# later column assignments on it do not trigger SettingWithCopyWarning.
is_s1 = prodi_df['Jenjang'].str.strip().str.upper() == 'S1'
prodi_df_s1 = prodi_df[is_s1].copy()

In [39]:
# Strip surrounding whitespace from each text column. assign() builds a new
# frame instead of writing into a selection of prodi_df, which avoids
# SettingWithCopyWarning; column order is unchanged.
text_columns = ['Nama Prodi', 'Nama PT', 'Jenjang']
prodi_df_s1 = prodi_df_s1.assign(
    **{col: prodi_df_s1[col].str.strip() for col in text_columns}
)

In [40]:
# Print a heading followed by the first rows of the S1 subset as plain text.
print("\nData setelah pembersihan (khusus S1):", prodi_df_s1.head(), sep="\n")


Data setelah pembersihan (khusus S1):
      No                                   Nama Prodi  \
907  908          Pendidikan Guru Madrasah Ibtidaiyah   
908  909                              Ekonomi Syariah   
909  910  Hukum Keluarga Islam (Ahwal Al Syakhshiyah)   
910  911             Hukum Ekonomi Syariah (Muamalah)   
911  912                           Pendidikan Biologi   

                          Nama PT Jenjang  
907  IAI Agus Salim Metro Lampung      S1  
908  IAI Agus Salim Metro Lampung      S1  
909  IAI Agus Salim Metro Lampung      S1  
910                    IAIN Ambon      S1  
911                    IAIN Ambon      S1  


In [41]:
# Show the (rows, columns) shape of prodi_df.
# NOTE(review): this inspects prodi_df, not the prodi_df_s1 subset created
# above — confirm which frame was meant to be checked.
print(prodi_df.shape)

(18791, 4)


In [42]:
# Count missing values per column in prodi_df.
# NOTE(review): this inspects prodi_df, not the prodi_df_s1 subset created
# above — confirm which frame was meant to be checked.
prodi_df.isna().sum()

No            0
Nama Prodi    0
Nama PT       0
Jenjang       0
dtype: int64

The missing values in the data have been removed.

## Saving Cleaned Dataset

In [43]:
# Keep the columns from 'Nama Prodi' through 'Jenjang', which leaves out 'No'.
# copy() detaches the result from prodi_df so it can be modified afterwards
# without SettingWithCopyWarning.
# NOTE(review): this is built from prodi_df, not from the whitespace-stripped
# prodi_df_s1 subset — confirm that is intended.
df = prodi_df.loc[:, 'Nama Prodi':'Jenjang'].copy()
df.head()

Unnamed: 0,Nama Prodi,Nama PT,Jenjang
0,ADMINISTRASI RUMAH SAKIT,AKADEMI ADMINISTRASI RUMAH SAKIT MATARAM,D-III
1,Akuntansi,Akademi Akuntansi (AKTAN) Boekittinggi,D-III
2,Akuntansi,Akademi Akuntansi Bandung,D-III
3,Akuntansi,Akademi Akuntansi Bina Insani,D-III
4,Akuntansi,Akademi Akuntansi Borobudur,D-III


In [53]:
# Title-case the program names. assign() returns a new frame, so nothing is
# written into a slice of prodi_df and no SettingWithCopyWarning is raised.
df = df.assign(**{'Nama Prodi': df['Nama Prodi'].str.title()})
df.head(10)

Unnamed: 0,Nama Prodi,Nama PT,Jenjang
0,Administrasi Rumah Sakit,AKADEMI ADMINISTRASI RUMAH SAKIT MATARAM,D-III
1,Akuntansi,Akademi Akuntansi (AKTAN) Boekittinggi,D-III
2,Akuntansi,Akademi Akuntansi Bandung,D-III
3,Akuntansi,Akademi Akuntansi Bina Insani,D-III
4,Akuntansi,Akademi Akuntansi Borobudur,D-III
5,Manajemen Informatika,Akademi Akuntansi Dan Komputer Stephen Jambi,D-III
6,Komputerisasi Akuntansi,Akademi Akuntansi Dan Komputer Stephen Jambi,D-III
7,Akuntansi,Akademi Akuntansi Dan Manajemen Mitra Lampung,D-III
8,Akuntansi,Akademi Akuntansi Dan Manajemen Pembangunan,D-III
9,Akuntansi,Akademi Akuntansi Denpasar,D-III


In [51]:
# Write the cleaned frame to disk without the index column, then confirm.
output_path = 'data/prodi/cleaned-data.csv'
df.to_csv(path_or_buf=output_path, index=False)
print("Saved as 'cleaned-data.csv'")

Saved as 'cleaned-data.csv'
