In [1]:
import numpy as np
import pandas as pd

In [8]:
# Labels applied positionally to the columns of the data file when it is read.
column_names = [
    'class',
    'age',
    'menopause',
    'tumor_size',
    'inv_nodes',
    'node_caps',
    'deg_malig',
    'breast',
    'breast_quad',
    'irradiat',
]

In [9]:
# Read the raw file; every line is treated as a record and the column labels
# are supplied by column_names rather than taken from the file.
# NOTE(review): if the file marks missing entries with a sentinel such as '?',
# pass na_values here — confirm against the dataset description.
data = pd.read_csv(
    'breast+cancer/breast-cancer.data',
    header=None,
    names=column_names,
)

In [10]:
# Show the loaded frame to check that the columns were parsed as expected.
data

Unnamed: 0,class,age,menopause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no


In [22]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(286, 10)

In [21]:
# melt
# Wide -> long: 'class', 'age' and 'menopause' are repeated on every output
# row, while each of the three melted columns contributes one row per input
# row. The melted column's name lands in 'attribute', its cell in 'value'.
data_melted = data.melt(
    id_vars=['class', 'age', 'menopause'],
    value_vars=['tumor_size', 'inv_nodes', 'deg_malig'],
    var_name='attribute',
    value_name='value',
)

In [17]:
# Dimensions of the melted (long-format) frame as (rows, columns).
data_melted.shape

(858, 5)

In [25]:
# Preview the first rows of the long-format frame.
data_melted.head()

Unnamed: 0,class,age,menopause,attribute,value
0,no-recurrence-events,30-39,premeno,tumor_size,30-34
1,no-recurrence-events,40-49,premeno,tumor_size,20-24
2,no-recurrence-events,40-49,premeno,tumor_size,20-24
3,no-recurrence-events,60-69,ge40,tumor_size,15-19
4,no-recurrence-events,40-49,premeno,tumor_size,0-4


In [38]:
# pivot and aggregate
# Long -> wide: every distinct 'attribute' becomes its own column, indexed by
# the (class, age, menopause) combination. Rows that share a combination are
# collapsed by aggfunc; 'first' keeps just the first value per group (other
# reducers such as 'mean' or 'sum' could be substituted).
data_pivoted = pd.pivot_table(
    data_melted,
    values='value',
    index=['class', 'age', 'menopause'],
    columns='attribute',
    aggfunc='first',
)


In [39]:
# Show the pivoted frame: one row per (class, age, menopause) combination.
data_pivoted

Unnamed: 0_level_0,Unnamed: 1_level_0,attribute,deg_malig,inv_nodes,tumor_size
class,age,menopause,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no-recurrence-events,20-29,premeno,2,0-2,35-39
no-recurrence-events,30-39,lt40,3,0-2,15-19
no-recurrence-events,30-39,premeno,3,0-2,30-34
no-recurrence-events,40-49,ge40,3,0-2,20-24
no-recurrence-events,40-49,premeno,2,0-2,20-24
no-recurrence-events,50-59,ge40,2,0-2,25-29
no-recurrence-events,50-59,lt40,2,0-2,15-19
no-recurrence-events,50-59,premeno,2,0-2,25-29
no-recurrence-events,60-69,ge40,2,0-2,15-19
no-recurrence-events,60-69,lt40,1,0-2,10-14


In [40]:
# aggregate
# Reduce the 'deg_malig' column to its mean over all rows; the result is a
# Series with one entry keyed by the column name.
df_aggregated = data.agg({'deg_malig': 'mean'})

In [41]:
# Show the aggregated result (mean of 'deg_malig').
df_aggregated

deg_malig    2.048951
dtype: float64

In [42]:
# Iterate
def _range_midpoint(range_text):
    """Return the midpoint of a 'low-high' range string.

    Parameters
    ----------
    range_text : str
        Two integers separated by '-', e.g. '30-34'.

    Returns
    -------
    float
        (low + high) / 2, e.g. 32.0 for '30-34'.

    Raises
    ------
    ValueError
        If either end is not an integer literal.
    IndexError
        If the string contains no '-' separator.
    """
    bounds = range_text.split('-')  # split once and reuse both ends
    return (int(bounds[0]) + int(bounds[1])) / 2


# Walk the column's values directly: iterrows() would build a whole Series for
# every row just to read one field, and its throwaway `_` loop target would
# linger in the notebook namespace.
ls_tumor_size_middle_value = [
    _range_midpoint(size_range) for size_range in data['tumor_size']
]

# Attach the midpoints as a new column (this mutates `data` in place, so later
# cells see the extra column).
data['tumor_size_middle_value'] = ls_tumor_size_middle_value

In [43]:
data.head()

Unnamed: 0,class,age,menopause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,tumor_size_middle_value
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no,32.0
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no,22.0
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no,22.0
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no,17.0
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no,2.0


In [44]:
# Groupby
# Count the rows in each 'age' bracket; the result is a Series indexed by the
# bracket label.
data.groupby('age').size()

age
20-29     1
30-39    36
40-49    90
50-59    96
60-69    57
70-79     6
dtype: int64