In [1]:
from ucimlrepo import fetch_ucirepo

# Download dataset 890 from the UCI repository
# (its metadata names it "AIDS Clinical Trials Group Study 175").
aids_clinical_trials_group_study_175 = fetch_ucirepo(id=890)

# Feature table and target table (as pandas dataframes)
X, y = (
    aids_clinical_trials_group_study_175.data.features,
    aids_clinical_trials_group_study_175.data.targets,
)

# Show the dataset-level metadata first, then the per-variable information
print(
    aids_clinical_trials_group_study_175.metadata,
    aids_clinical_trials_group_study_175.variables,
    sep='\n',
)


{'uci_id': 890, 'name': 'AIDS Clinical Trials Group Study 175', 'repository_url': 'https://archive.ics.uci.edu/dataset/890/aids+clinical+trials+group+study+175', 'data_url': 'https://archive.ics.uci.edu/static/public/890/data.csv', 'abstract': 'The AIDS Clinical Trials Group Study 175 Dataset contains healthcare statistics and categorical information about patients who have been diagnosed with AIDS. This dataset was initially published in 1996. The prediction task is to predict whether or not each patient died within a certain window of time or not. ', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 2139, 'num_features': 23, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Sexual Orientation', 'Race', 'Gender'], 'target_col': ['cid'], 'index_col': ['pidnum'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1996, 'last_updated': 'Fri Nov 

In [None]:
import pandas as pd

# Build the feature and target frames from the fetched dataset.
# The feature table already carries its own column names, so no `columns=`
# override is passed. The previous `metadata.features` lookup supplied none:
# NOTE(review): the printed metadata shows no `features` entry - confirm.
X_df = pd.DataFrame(aids_clinical_trials_group_study_175.data.features)
# Keep only the 'cid' target column
y_df = pd.DataFrame(aids_clinical_trials_group_study_175.data.targets, columns=['cid'])

# Place features and target side by side (column-wise), target last
df = pd.concat([X_df, y_df], axis=1)

# Persist the combined table without the row index so later cells can reload it
df.to_csv('act_dataset.csv', index=False)

In [1]:
# Reload the dataset from the CSV file
import pandas as pd

df = pd.read_csv('act_dataset.csv')

# Preview: first 5 rows of the dataset, printed as plain text
print(df.head(n=5))

# Other quick checks that can be enabled when needed:
# print(df.shape)   # (rows, columns)
# print(df.tail())  # last 5 rows

   time  trt  age     wtkg  hemo  homo  drugs  karnof  oprior  z30  ...  str2  \
0   948    2   48  89.8128     0     0      0     100       0    0  ...     0   
1  1002    3   61  49.4424     0     0      0      90       0    1  ...     1   
2   961    3   45  88.4520     0     1      1      90       0    1  ...     1   
3  1166    3   47  85.2768     0     1      0     100       0    1  ...     1   
4  1090    0   43  66.6792     0     1      0     100       0    1  ...     1   

   strat  symptom  treat  offtrt  cd40  cd420  cd80  cd820  cid  
0      1        0      1       0   422    477   566    324    0  
1      3        0      1       0   162    218   392    564    1  
2      3        0      1       1   326    274  2063   1893    0  
3      3        0      1       0   287    394  1590    966    0  
4      3        0      0       0   504    353   870    782    0  

[5 rows x 24 columns]


In [2]:
# Normalize the feature columns so that they share a common range.
# Only the features are scaled: the class label 'cid' is left untouched
# instead of being passed through the scaler and converted to float.
from sklearn.preprocessing import MinMaxScaler

# Every column except the target is treated as a feature
feature_columns = [column for column in df.columns if column != 'cid']

scaler = MinMaxScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,  # keep row alignment with the original frame
)

# Re-attach the unscaled label and restore the original column order
data = scaled_features.join(df[['cid']])[df.columns]
print(data.head())  # print the first 5 rows of the normalized dataset


       time       trt       age      wtkg  hemo  homo  drugs    karnof  \
0  0.767461  0.666667  0.620690  0.456128   0.0   0.0    0.0  1.000000   
1  0.811832  1.000000  0.844828  0.143032   0.0   0.0    0.0  0.666667   
2  0.778143  1.000000  0.568966  0.445574   0.0   1.0    1.0  0.666667   
3  0.946590  1.000000  0.603448  0.420948   0.0   1.0    0.0  1.000000   
4  0.884141  0.000000  0.534483  0.276713   0.0   1.0    0.0  1.000000   

   oprior  z30  ...  str2  strat  symptom  treat  offtrt      cd40     cd420  \
0     0.0  0.0  ...   0.0    0.0      0.0    1.0     0.0  0.351960  0.400000   
1     0.0  1.0  ...   1.0    1.0      0.0    1.0     0.0  0.135113  0.157944   
2     0.0  1.0  ...   1.0    1.0      0.0    1.0     1.0  0.271893  0.210280   
3     0.0  1.0  ...   1.0    1.0      0.0    1.0     0.0  0.239366  0.322430   
4     0.0  1.0  ...   1.0    1.0      0.0    0.0     0.0  0.420350  0.284112   

       cd80     cd820  cid  
0  0.105814  0.033835  0.0  
1  0.070811  0.0

In [8]:
# Count rows that are exact repeats of an earlier row
duplicate_mask = df.duplicated()
print(duplicate_mask.sum())


0


In [10]:
# Data Encoding
from sklearn.preprocessing import LabelEncoder

# Create the encoder object; it is only instantiated here, not fitted
le = LabelEncoder()


time       float64
trt        float64
age        float64
wtkg       float64
hemo       float64
homo       float64
drugs      float64
karnof     float64
oprior     float64
z30        float64
zprior     float64
preanti    float64
race       float64
gender     float64
str2       float64
strat      float64
symptom    float64
treat      float64
offtrt     float64
cd40       float64
cd420      float64
cd80       float64
cd820      float64
cid          int64
dtype: object
       time       trt       age      wtkg      hemo      homo     drugs  \
0  0.235799  0.424960  1.464542  1.107649 -0.303123 -1.396547 -0.388893   
1  0.420600  1.311779  2.957595 -1.936862 -0.303123 -1.396547 -0.388893   
2  0.280288  1.311779  1.119991  1.005025 -0.303123  0.716052  2.571400   
3  0.981848  1.311779  1.349692  0.765569 -0.303123  0.716052 -0.388893   
4  0.721757 -1.348678  0.890291 -0.636959 -0.303123  0.716052 -0.388893   

     karnof    oprior       z30  ...      str2     strat   symptom     treat  \

In [12]:
# Show the last 5 rows of the current dataframe as plain text
print(df.tail(n=5))



          time       trt       age      wtkg     hemo      homo     drugs  \
2134  0.725180  1.311779 -1.636415 -1.646094  3.29899 -1.396547 -0.388893   
2135 -1.656702 -1.348678 -2.095816  2.099680  3.29899 -1.396547 -0.388893   
2136  0.769669  0.424960  2.038793 -0.397503  3.29899  0.716052 -0.388893   
2137 -1.417145 -1.348678 -2.440366 -1.140667  3.29899 -1.396547 -0.388893   
2138  0.567756  1.311779  1.119991  0.164003  3.29899 -1.396547 -0.388893   

        karnof    oprior       z30  ...      str2     strat   symptom  \
2134  0.771836 -0.149888  0.904064  ...  0.840894  1.134907 -0.457338   
2135  0.771836 -0.149888  0.904064  ...  0.840894  1.134907 -0.457338   
2136 -0.923192 -0.149888  0.904064  ...  0.840894  1.134907 -0.457338   
2137  0.771836 -0.149888 -1.106116  ... -1.189210 -1.090177 -0.457338   
2138  0.771836 -0.149888 -1.106116  ... -1.189210 -1.090177 -0.457338   

         treat    offtrt      cd40     cd420      cd80     cd820  cid  
2134  0.575371  1.325309 -