In [3]:
import pandas as pd

# Read the hepatitis CSV (path is relative to the notebook's working directory)
# into `df`, the frame every later cell starts from.
df = pd.read_csv(filepath_or_buffer='hepatitis_csv.csv')

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200.0,4.0,,False,live


In [4]:
# --- a. Create Data Subsets ---
# Split `df` into one frame per value of the `sex` column. Each subset keeps
# the row labels of the original frame.
df_male, df_female = (
    df.loc[df['sex'].eq(label)] for label in ('male', 'female')
)

In [5]:
# --- b. Merge Two Subsets ---
# Stack the male rows on top of the female rows (row-wise concatenation).
# The index is not reset, so each row keeps the label it had in `df`.
df_merged = pd.concat(objs=(df_male, df_female), axis=0)
# df_merged.to_csv("merged_subset.csv", index=False)
df_merged.head(n=5)

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
20,22,male,True,True,True,False,False,True,False,False,False,False,False,0.9,48.0,20.0,4.2,64.0,False,live
24,25,male,False,True,False,False,False,True,False,False,False,False,False,0.4,45.0,18.0,4.3,70.0,False,live
27,58,male,True,False,True,False,False,True,True,False,True,False,False,1.4,175.0,55.0,2.7,36.0,False,live
32,41,male,True,True,True,True,True,True,False,False,False,False,False,0.7,81.0,53.0,5.0,74.0,False,live


In [6]:
# --- c. Sort Data ---
# Order rows by age first; ties are broken by sgot, then by protime.
# All keys sort ascending and `df` itself is left unmodified.
df_sorted = df.sort_values(
    ['age', 'sgot', 'protime'],
    axis=0,
    ascending=True,
)
# df_sorted.to_csv("sorted_data.csv", index=False)
df_sorted.head(n=5)

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
110,7,female,True,False,False,False,False,True,True,True,False,False,False,0.7,256.0,25.0,4.2,,True,live
37,20,male,False,False,True,True,True,False,True,True,True,False,False,2.3,150.0,68.0,3.9,,False,live
121,20,female,False,False,True,True,True,True,False,False,True,True,False,1.0,160.0,118.0,2.9,23.0,True,live
148,20,female,False,False,False,False,False,True,,False,False,False,False,0.9,89.0,152.0,4.0,,True,live
20,22,male,True,True,True,False,False,True,False,False,False,False,False,0.9,48.0,20.0,4.2,64.0,False,live


In [7]:
# --- d. Transpose Data ---
# Swap axes: column names become the row labels and the original row labels
# become the column headers.
df_transposed = df.T
# df_transposed.to_csv("transposed_data.csv")
df_transposed.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,145,146,147,148,149,150,151,152,153,154
age,30,50,78,31,34,34,51,23,39,30,...,31,41,70,20,36,46,44,61,53,43
sex,male,female,female,female,female,female,female,female,female,female,...,female,female,female,female,female,female,female,female,male,female
steroid,False,False,True,,True,True,False,True,True,True,...,False,True,False,False,True,True,True,False,False,True
antivirals,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
fatigue,False,True,True,False,False,False,True,False,True,False,...,True,True,True,False,False,True,True,True,True,True
malaise,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,False,True,False,False
anorexia,False,False,False,False,False,False,True,False,False,False,...,False,False,True,False,False,True,False,False,False,False
liver_big,False,False,True,True,True,True,True,True,True,True,...,True,True,,True,True,True,True,False,True,True
liver_firm,False,False,False,False,False,False,False,False,True,False,...,False,True,,,False,False,True,True,False,False
spleen_palpable,False,False,False,False,False,False,True,False,False,False,...,False,True,,False,False,False,False,False,True,True


In [8]:
# --- e. Melting Data to Long Format ---
# Unpivot the five lab-measurement columns into (Feature, Value) pairs while
# repeating age / sex / class on every resulting row. Columns not listed in
# either argument are dropped from the long frame.
melted_df = df.melt(
    id_vars=['age', 'sex', 'class'],
    value_vars=['bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime'],
    var_name='Feature',
    value_name='Value',
)
melted_df.tail(n=5)

Unnamed: 0,age,sex,class,Feature,Value
770,46,female,die,protime,50.0
771,44,female,live,protime,
772,61,female,live,protime,
773,53,male,live,protime,48.0
774,43,female,die,protime,42.0


In [9]:
# --- f. Casting Data to Wide Format ---
# Objective: spread the long (Feature, Value) pairs back out so that each
# numerical feature gets its own column again.
# NOTE: the row key is (age, sex, class), and rows of `melted_df` that share
# the same key are combined by averaging their values. The result is therefore
# an aggregated table, not a row-for-row copy of the frame that was melted.
wide_df = pd.pivot_table(
    melted_df,
    values='Value',
    index=['age', 'sex', 'class'],
    columns='Feature',
    aggfunc='mean',
).reset_index()
wide_df.head(n=5)

Feature,age,sex,class,albumin,alk_phosphate,bilirubin,protime,sgot
0,7,female,live,4.2,256.0,0.7,,25.0
1,20,female,live,3.45,124.5,0.95,23.0,135.0
2,20,male,live,3.9,150.0,2.3,,68.0
3,22,female,live,,,0.7,,24.0
4,22,male,live,4.2,48.0,0.9,64.0,20.0
