In [7]:
# In this assignment you will experiment on your own. Using a health dataset of your choice (check with us if you are not sure), write code to demonstrate the following Pandas functions:
# Melt, Pivot, Aggregation, Iteration, Groupby

!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer = fetch_ucirepo(id=14) 
  
# data (as pandas dataframes) 
X = breast_cancer.data.features 
y = breast_cancer.data.targets 
  
# metadata 
print(breast_cancer.metadata) 
  
# variable information 
print(breast_cancer.variables) 


{'uci_id': 14, 'name': 'Breast Cancer', 'repository_url': 'https://archive.ics.uci.edu/dataset/14/breast+cancer', 'data_url': 'https://archive.ics.uci.edu/static/public/14/data.csv', 'abstract': 'This breast cancer domain was obtained from the University Medical Centre, Institute of Oncology, Ljubljana, Yugoslavia. This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature. (See also lymphography and primary-tumor.)', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 286, 'num_features': 9, 'feature_types': ['Categorical'], 'demographics': ['Age'], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Mar 07 2024', 'dataset_doi': '10.24432/C51P4M', 'creators': ['Matjaz Zwitter', 'Milan Soklic'], 'intro_paper': None, 'additional_info': {'summary': 'Thi

In [8]:
import pandas as pd
import numpy as np

In [11]:
#1 (MELT) combine the data first before melting 

df = pd.concat([X, y], axis=1)  #pd.concat merges data 

melted_df = pd.melt(df, id_vars=y.columns.tolist(), var_name='Feature', value_name='Value')

print(melted_df.head())

                  Class Feature  Value
0  no-recurrence-events     age  30-39
1  no-recurrence-events     age  40-49
2  no-recurrence-events     age  40-49
3  no-recurrence-events     age  60-69
4  no-recurrence-events     age  40-49


In [12]:
# pd.melt reshapes the data under class, feature and value using numerical intergers

In [16]:
#2 (PIVOT) 
# Data has already been merged 
# I grouped the Diagnosis column and then listed the mean

df = X.join(y)

if isinstance(y, pd.DataFrame):  # To get the column name whether DataFrame or Series 
    target_col = y.columns[0]
else:                                
    target_col = y.name or 'target'
    df[target_col] = y.values

mean_pivot = df.groupby(target_col, dropna=False).mean(numeric_only=True)  # I want the mean
print(mean_pivot)



                      deg-malig
Class                          
no-recurrence-events   1.905473
recurrence-events      2.388235


In [29]:
print(y.head())
print(y.columns)  # I had to check the column head names before moving on. Kept getting errors. 

                  Class
0  no-recurrence-events
1  no-recurrence-events
2  no-recurrence-events
3  no-recurrence-events
4  no-recurrence-events
Index(['Class'], dtype='object')


In [30]:
#3 (Aggregate) 
df = X.join(y)
agg_simple = df.groupby('Class').mean(numeric_only=True)  # this skips any columns that contain strings 

print(agg_simple.head())

                      deg-malig
Class                          
no-recurrence-events   1.905473
recurrence-events      2.388235


In [40]:
#4 (ITERATION) 

print(X.columns.tolist()) # needed to see how to spell exact column headings 

['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']


In [41]:
df = X.join(y)  # first 5 rows of data only

for index, row in df.head(5).iterrows():
    print(f"Row {index}: Diagnosis = {row[y.columns[0]]}, Tumor size = {row['tumor-size']}, Degree of malignancy = {row['deg-malig']}")


Row 0: Diagnosis = no-recurrence-events, Tumor size = 30-34, Degree of malignancy = 3
Row 1: Diagnosis = no-recurrence-events, Tumor size = 20-24, Degree of malignancy = 2
Row 2: Diagnosis = no-recurrence-events, Tumor size = 20-24, Degree of malignancy = 2
Row 3: Diagnosis = no-recurrence-events, Tumor size = 15-19, Degree of malignancy = 2
Row 4: Diagnosis = no-recurrence-events, Tumor size = 0-4, Degree of malignancy = 2


In [47]:
#5: Groupby -- Im a little confused on this concept. (Is this just grouping?)

# degree of malignancy and tumor size

df = X.join(y)

# Group by degree of malignancy, calculate average tumor size
grouped = df.groupby('deg-malig')['tumor-size'].value_counts().head(10)
print(grouped)

deg-malig  tumor-size
1          14-Oct        14
           30-34         13
           25-29         11
           20-24         10
           15-19          9
           40-44          5
           0-4            3
           35-39          2
           9-May          2
           45-49          1
Name: count, dtype: int64


In [46]:
# this shows how often each tumor size appears for each malignancy level