In [1]:
import pandas as pd

In [2]:
# Load the 2015-2024 global cancer patient records from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer="global_cancer_patients_2015_2024.csv")

In [3]:
# Preview the first five records as a quick sanity check of the load
df.head(5)

Unnamed: 0,Patient_ID,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score
0,PT0000000,71,Male,UK,2021,6.4,2.8,9.5,0.9,8.7,Lung,Stage III,62913.44,5.9,4.92
1,PT0000001,34,Male,China,2021,1.3,4.5,3.7,3.9,6.3,Leukemia,Stage 0,12573.41,4.7,4.65
2,PT0000002,80,Male,Pakistan,2023,7.4,7.9,2.4,4.7,0.1,Breast,Stage II,6984.33,7.1,5.84
3,PT0000003,40,Male,UK,2015,1.7,2.9,4.8,3.5,2.7,Colon,Stage I,67446.25,1.6,3.12
4,PT0000004,43,Female,Brazil,2017,5.1,2.8,2.3,6.7,0.5,Skin,Stage III,77977.12,2.9,3.62


In [4]:
# Dataset overview.
# A notebook cell only echoes its *last* expression, so bare expressions in the
# middle of a cell are evaluated and then thrown away. Each diagnostic is
# therefore printed explicitly so that all three appear in the output.

# null values per column
print(df.isna().sum())

# shape as (rows, columns)
print(df.shape)

# column info (DataFrame.info() prints its report itself)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Patient_ID             50000 non-null  object 
 1   Age                    50000 non-null  int64  
 2   Gender                 50000 non-null  object 
 3   Country_Region         50000 non-null  object 
 4   Year                   50000 non-null  int64  
 5   Genetic_Risk           50000 non-null  float64
 6   Air_Pollution          50000 non-null  float64
 7   Alcohol_Use            50000 non-null  float64
 8   Smoking                50000 non-null  float64
 9   Obesity_Level          50000 non-null  float64
 10  Cancer_Type            50000 non-null  object 
 11  Cancer_Stage           50000 non-null  object 
 12  Treatment_Cost_USD     50000 non-null  float64
 13  Survival_Years         50000 non-null  float64
 14  Target_Severity_Score  50000 non-null  float64
dtypes:

In [5]:
def categorize_age(age):
    """Map an age in years to a coarse age-group label.

    The brackets are contiguous half-open intervals, so every non-negative
    age (including fractional ones such as 50.5) lands in exactly one bracket:

        [0, 30)    -> 'Under 30'
        [30, 51)   -> '30 - 50'
        [51, 71)   -> '51 - 70'
        [71, inf)  -> 'Above 70'

    For whole-number ages this yields the same labels as before.

    Parameters
    ----------
    age : int or float
        Age in years; must be a non-negative number.

    Returns
    -------
    str
        One of 'Under 30', '30 - 50', '51 - 70', 'Above 70'.

    Raises
    ------
    ValueError
        If ``age`` is negative or NaN, instead of silently labelling such
        values 'Above 70'.
    """
    # `not (age >= 0)` is True for negative numbers and for NaN alike,
    # because every comparison involving NaN evaluates to False.
    if not (age >= 0):
        raise ValueError(f"age must be a non-negative number, got {age!r}")
    if age < 30:
        return 'Under 30'
    if age < 51:
        return '30 - 50'
    if age < 71:
        return '51 - 70'
    return 'Above 70'

# Label every patient with an age bracket and store it as a new column
df['Age_Group'] = df['Age'].map(categorize_age)
# Show each age next to its assigned bracket to eyeball the mapping
df.loc[:, ["Age", "Age_Group"]]

Unnamed: 0,Age,Age_Group
0,71,Above 70
1,34,30 - 50
2,80,Above 70
3,40,30 - 50
4,43,30 - 50
...,...,...
49995,80,Above 70
49996,40,30 - 50
49997,74,Above 70
49998,21,Under 30


In [6]:
# Number of patient records per country, as a two-column frame
# (Country_Region, Count)
case_per_country = (
    df.groupby('Country_Region')["Patient_ID"]
    .count()
    .reset_index(name="Count")
)

case_per_country

Unnamed: 0,Country_Region,Count
0,Australia,5092
1,Brazil,5004
2,Canada,4864
3,China,4913
4,Germany,5024
5,India,5040
6,Pakistan,4926
7,Russia,5017
8,UK,5060
9,USA,5060


In [7]:
# Peek at the first five rows of the working frame again
df.head(5)

Unnamed: 0,Patient_ID,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score,Age_Group
0,PT0000000,71,Male,UK,2021,6.4,2.8,9.5,0.9,8.7,Lung,Stage III,62913.44,5.9,4.92,Above 70
1,PT0000001,34,Male,China,2021,1.3,4.5,3.7,3.9,6.3,Leukemia,Stage 0,12573.41,4.7,4.65,30 - 50
2,PT0000002,80,Male,Pakistan,2023,7.4,7.9,2.4,4.7,0.1,Breast,Stage II,6984.33,7.1,5.84,Above 70
3,PT0000003,40,Male,UK,2015,1.7,2.9,4.8,3.5,2.7,Colon,Stage I,67446.25,1.6,3.12,30 - 50
4,PT0000004,43,Female,Brazil,2017,5.1,2.8,2.3,6.7,0.5,Skin,Stage III,77977.12,2.9,3.62,30 - 50


In [8]:
# Case counts per country, one column per gender
# (country/gender pairs with no cases show as 0)
country_case_by_gender = (
    df.groupby(['Country_Region', 'Gender'])
    .size()
    .unstack(fill_value=0)
)
country_case_by_gender

Gender,Female,Male,Other
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,1713,1741,1638
Brazil,1645,1708,1651
Canada,1667,1585,1612
China,1607,1698,1608
Germany,1666,1675,1683
India,1693,1664,1683
Pakistan,1652,1658,1616
Russia,1688,1657,1672
UK,1684,1727,1649
USA,1694,1683,1683


In [9]:
# Case counts per country, one column per age group
# (country/age-group pairs with no cases show as 0)
country_case_by_ag = (
    df.groupby(['Country_Region', 'Age_Group'])
    .size()
    .unstack(fill_value=0)
)
country_case_by_ag

Age_Group,30 - 50,51 - 70,Above 70,Under 30
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,1569,1343,1387,793
Brazil,1507,1423,1347,727
Canada,1508,1400,1283,673
China,1468,1412,1301,732
Germany,1453,1394,1440,737
India,1487,1454,1396,703
Pakistan,1450,1432,1334,710
Russia,1517,1426,1361,713
UK,1559,1422,1383,696
USA,1583,1410,1360,707


In [10]:
# Case counts per country, one column per cancer type;
# Country_Region is moved back into a regular column
cancer_type_by_country = (
    df.groupby('Country_Region')['Cancer_Type']
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)
cancer_type_by_country

Cancer_Type,Country_Region,Breast,Cervical,Colon,Leukemia,Liver,Lung,Prostate,Skin
0,Australia,653,632,632,609,670,645,614,637
1,Brazil,597,622,623,619,627,626,619,671
2,Canada,581,636,629,636,562,625,581,614
3,China,615,643,620,590,593,609,619,624
4,Germany,625,596,626,679,624,657,627,590
5,India,640,637,633,577,671,607,639,636
6,Pakistan,604,608,697,612,611,574,608,612
7,Russia,628,593,645,662,642,584,663,600
8,UK,606,658,621,640,621,624,685,605
9,USA,640,597,650,642,628,608,653,642


In [11]:
# Case counts per country, one column per year
cases_per_year_by_country = (
    df.groupby(['Country_Region', 'Year'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
cases_per_year_by_country

Year,Country_Region,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Australia,515,516,484,492,525,535,472,513,554,486
1,Brazil,499,525,508,504,496,485,492,506,491,498
2,Canada,511,478,481,489,490,502,501,487,437,488
3,China,529,533,535,474,477,514,444,460,496,451
4,Germany,473,503,512,517,519,478,484,507,503,528
5,India,515,508,507,477,504,532,519,470,505,503
6,Pakistan,460,506,499,451,500,501,517,481,490,521
7,Russia,512,493,524,501,515,507,504,492,473,496
8,UK,490,499,493,524,522,525,513,497,488,509
9,USA,508,533,494,507,517,500,521,468,498,514


In [12]:
# Mean of each risk-factor score per country, rounded to two decimals
avg_risk_factors_by_country = (
    df[['Country_Region', 'Genetic_Risk', 'Air_Pollution', 'Alcohol_Use', 'Smoking', 'Obesity_Level']]
    .groupby('Country_Region')
    .mean()
    .round(2)
    .reset_index()
)
avg_risk_factors_by_country

Unnamed: 0,Country_Region,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level
0,Australia,5.01,5.03,5.01,4.97,4.97
1,Brazil,4.95,5.0,5.02,4.99,4.94
2,Canada,5.09,4.98,4.98,4.99,5.02
3,China,4.99,5.06,4.99,4.95,5.01
4,Germany,4.97,4.98,5.03,5.02,4.99
5,India,4.97,5.04,4.98,4.97,5.01
6,Pakistan,5.01,5.02,4.92,4.94,5.06
7,Russia,4.98,4.99,5.1,5.02,4.96
8,UK,5.0,5.05,5.05,4.95,4.96
9,USA,5.05,4.95,5.03,5.1,4.98


In [13]:
# Case counts per country, one column per cancer stage
cancer_stage_by_country = (
    df.groupby('Country_Region')['Cancer_Stage']
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)
cancer_stage_by_country


Cancer_Stage,Country_Region,Stage 0,Stage I,Stage II,Stage III,Stage IV
0,Australia,991,991,1054,1050,1006
1,Brazil,1000,1029,1029,962,984
2,Canada,978,977,978,943,988
3,China,941,1014,990,1023,945
4,Germany,1002,1027,987,1023,985
5,India,981,1044,991,985,1039
6,Pakistan,1013,940,990,1017,966
7,Russia,992,1002,1005,990,1028
8,UK,1007,1015,1071,988,979
9,USA,984,1007,1029,1027,1013


In [14]:
# Summed treatment cost (USD) per country, one column per year;
# a country/year pair with no records contributes a total of 0
cost_per_year_by_country = (
    df.groupby(['Country_Region', 'Year'])['Treatment_Cost_USD']
    .agg('sum')
    .unstack(level='Year', fill_value=0)
    .reset_index()
)
cost_per_year_by_country

Year,Country_Region,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Australia,27803903.0,26917397.36,25225186.3,25272526.91,27675539.22,28253874.77,24997826.46,27299768.33,29662082.88,24841270.99
1,Brazil,25939542.66,27316347.9,26855840.94,27139031.55,26884780.99,24403854.74,26360413.02,27107667.5,25614382.83,25292762.71
2,Canada,26067056.3,24726481.53,25917466.73,26776215.97,25346558.81,25798760.24,26430103.14,25743188.95,23030568.41,25933272.95
3,China,27326627.84,28972898.34,28350862.62,24681000.76,24953201.05,28064438.84,22854904.65,24228491.42,25836003.29,24626391.26
4,Germany,24885831.07,26347995.03,27868782.44,27029659.27,26988293.49,25553696.12,26146961.58,25766490.75,25866499.33,28658510.15
5,India,26913599.0,26938692.45,26660277.95,24328738.14,25833955.96,28166258.57,27318644.27,24233676.88,26434166.62,26688012.83
6,Pakistan,23958463.03,24969666.83,25115933.18,22686713.74,26502619.84,25036414.86,27375107.68,25278541.81,25204561.67,27895745.31
7,Russia,26887652.37,26386884.2,27644427.18,27133864.2,27089566.98,26630223.62,24986682.94,26237353.37,23298722.18,26191323.58
8,UK,26039738.44,26070609.26,25995187.03,28478179.27,27140137.16,27005514.68,26278275.79,26223210.95,25062560.19,25837834.85
9,USA,26402674.81,28963569.88,24683996.44,26955981.89,27287515.81,26868477.66,27623493.11,23900086.69,27179356.02,27700807.37


In [15]:
# average severity score per year by country
# No fill_value on unstack: a country/year pair with no records has no mean,
# so it must stay NaN. Filling it with 0 would read as "average severity 0".
sev_score_per_year_by_country = (
    df.groupby(['Country_Region', 'Year'])['Target_Severity_Score'].mean()
    .unstack().reset_index()
)
sev_score_per_year_by_country

Year,Country_Region,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Australia,4.984485,4.99312,4.943843,5.056138,4.98941,4.886879,4.812458,4.899045,4.941625,4.953272
1,Brazil,4.971723,4.95019,4.909252,4.86506,4.828165,4.961938,4.910691,4.996285,4.90057,5.047972
2,Canada,4.984364,4.90251,5.042225,4.883006,4.915,4.995996,4.952216,5.00846,4.970641,4.968135
3,China,4.972231,4.880563,4.994,4.979873,4.89717,4.91858,4.985541,4.955717,4.947984,4.840067
4,Germany,4.908414,4.984513,4.883125,4.932224,5.021002,4.90523,4.937707,4.999467,4.957435,4.907557
5,India,5.003573,4.887441,5.008619,4.981048,4.955635,4.93203,4.917071,4.968596,4.907307,4.900437
6,Pakistan,4.907413,4.9067,4.991884,4.910466,5.03314,5.014711,4.965822,4.914262,4.914735,4.988042
7,Russia,4.995,4.931907,4.949771,4.977285,4.96833,4.90357,4.90119,4.953476,5.098076,4.940121
8,UK,4.829408,4.907615,5.012738,4.949962,4.941762,4.972914,4.944444,4.970282,4.994631,5.056071
9,USA,4.944547,4.907542,5.001255,5.026864,4.958414,5.01304,4.940211,4.951368,4.946084,4.978268


In [16]:
# Share of each cancer type across all records, as a percentage (2 decimals)
cancer_type = (
    df["Cancer_Type"]
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
)
cancer_type

Cancer_Type
Colon       12.75
Prostate    12.62
Leukemia    12.53
Liver       12.50
Skin        12.46
Cervical    12.44
Breast      12.38
Lung        12.32
Name: proportion, dtype: float64

In [17]:
# Share of each cancer stage across all records, as a percentage (2 decimals)
cancer_stage = (
    df["Cancer_Stage"]
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
)
cancer_stage

Cancer_Stage
Stage II     20.25
Stage I      20.09
Stage III    20.02
Stage IV     19.87
Stage 0      19.78
Name: proportion, dtype: float64

In [18]:
# Case counts per gender, one column per cancer type
cancer_type_by_gender = (
    df.groupby(['Gender', 'Cancer_Type'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
cancer_type_by_gender

Cancer_Type,Gender,Breast,Cervical,Colon,Leukemia,Liver,Lung,Prostate,Skin
0,Female,2051,2160,2108,2081,2064,2067,2113,2065
1,Male,2080,2028,2147,2092,2091,2103,2138,2117
2,Other,2058,2034,2121,2093,2094,1989,2057,2049


In [19]:
# Case counts per gender, one column per cancer stage
cancer_stage_by_gender = (
    df.groupby(['Gender', 'Cancer_Stage'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
cancer_stage_by_gender

Cancer_Stage,Gender,Stage 0,Stage I,Stage II,Stage III,Stage IV
0,Female,3354,3353,3375,3289,3338
1,Male,3266,3384,3421,3355,3370
2,Other,3269,3309,3328,3364,3225


In [20]:
# Mean of each risk-factor score per gender, rounded to two decimals
avg_risk_factors_by_gender = (
    df[['Gender', 'Genetic_Risk', 'Air_Pollution', 'Alcohol_Use', 'Smoking', 'Obesity_Level']]
    .groupby('Gender')
    .mean()
    .round(2)
    .reset_index()
)
avg_risk_factors_by_gender

Unnamed: 0,Gender,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level
0,Female,5.01,5.02,4.99,5.0,5.0
1,Male,4.99,5.02,5.01,4.98,4.98
2,Other,5.01,4.99,5.04,4.99,4.99


In [21]:
# Case counts per age group, one column per cancer stage
cancer_stage_by_ag = (
    df.groupby(['Age_Group', 'Cancer_Stage'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
cancer_stage_by_ag

Cancer_Stage,Age_Group,Stage 0,Stage I,Stage II,Stage III,Stage IV
0,30 - 50,3003,3086,2999,3005,3008
1,51 - 70,2766,2839,2920,2793,2798
2,Above 70,2739,2679,2736,2748,2690
3,Under 30,1381,1442,1469,1462,1437


In [22]:
# cancer type by age group
# Stored under its own name: this table counts cancer *types*, so it must not
# reuse (and overwrite) the `cancer_stage_by_ag` name, which holds the
# stage-by-age-group table.
cancer_type_by_ag = (
    df.groupby('Age_Group')['Cancer_Type']
    .value_counts().unstack(fill_value=0)
    .reset_index()
)
cancer_type_by_ag

Cancer_Type,Age_Group,Breast,Cervical,Colon,Leukemia,Liver,Lung,Prostate,Skin
0,30 - 50,1894,1900,1937,1880,1858,1897,1884,1851
1,51 - 70,1689,1710,1794,1756,1799,1754,1814,1800
2,Above 70,1697,1741,1757,1695,1665,1668,1708,1661
3,Under 30,909,871,888,935,927,840,902,919


In [23]:
# Mean of each risk-factor score per age group, rounded to two decimals
avg_risk_factors_by_ag = (
    df[['Age_Group', 'Genetic_Risk', 'Air_Pollution', 'Alcohol_Use', 'Smoking', 'Obesity_Level']]
    .groupby('Age_Group')
    .mean()
    .round(2)
    .reset_index()
)
avg_risk_factors_by_ag

Unnamed: 0,Age_Group,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level
0,30 - 50,5.02,5.02,5.0,5.01,5.0
1,51 - 70,4.99,5.01,4.99,5.01,4.97
2,Above 70,5.02,5.02,5.01,4.98,4.99
3,Under 30,4.97,4.98,5.07,4.93,5.02


In [24]:
# number of cases and average (mean) treatment cost by cancer type
# (the aggregation is count + mean, not a total; costs rounded to 2 decimals)
cost_by_cancer_type = (
    df.groupby('Cancer_Type')
    [['Treatment_Cost_USD']]
    .agg(['count', 'mean']).round(2).reset_index()
)
cost_by_cancer_type

Unnamed: 0_level_0,Cancer_Type,Treatment_Cost_USD,Treatment_Cost_USD
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean
0,Breast,6189,52484.26
1,Cervical,6222,52361.54
2,Colon,6376,52167.46
3,Leukemia,6266,52528.47
4,Liver,6249,52460.61
5,Lung,6159,53130.62
6,Prostate,6308,52620.3
7,Skin,6231,51997.51


In [25]:
# Number of cases and mean treatment cost (USD, 2 decimals) per cancer stage
cost_by_cancer_stage = (
    df[['Cancer_Stage', 'Treatment_Cost_USD']]
    .groupby('Cancer_Stage')
    .agg(['count', 'mean'])
    .round(2)
    .reset_index()
)
cost_by_cancer_stage

Unnamed: 0_level_0,Cancer_Stage,Treatment_Cost_USD,Treatment_Cost_USD
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean
0,Stage 0,9889,52572.59
1,Stage I,10046,52674.08
2,Stage II,10124,52082.84
3,Stage III,10008,52708.2
4,Stage IV,9933,52302.47


In [28]:
# number of cases and total (summed) treatment cost by cancer type and country
# Stored under its own name so it does not overwrite `cost_by_cancer_type`,
# the per-type count/mean table, with a differently shaped result.
cost_by_cancer_type_and_country = (
    df.groupby(['Cancer_Type', "Country_Region"])
    [['Treatment_Cost_USD']]
    .agg(['count', 'sum']).round(2).reset_index()
)
cost_by_cancer_type_and_country

Unnamed: 0_level_0,Cancer_Type,Country_Region,Treatment_Cost_USD,Treatment_Cost_USD
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,sum
0,Breast,Australia,653,34925015.08
1,Breast,Brazil,597,31758977.75
2,Breast,Canada,581,30945483.09
3,Breast,China,615,32550420.43
4,Breast,Germany,625,32017961.69
...,...,...,...,...
75,Skin,India,636,33412589.85
76,Skin,Pakistan,612,31261482.99
77,Skin,Russia,600,31432169.78
78,Skin,UK,605,31386817.66


In [27]:
# number of cases and average (mean) treatment cost by cancer stage and country
# (the aggregation is count + mean, not a total)
# Stored under its own name so it does not overwrite `cost_by_cancer_stage`,
# the per-stage table, with a differently shaped result.
cost_by_cancer_stage_and_country = (
    df.groupby(['Cancer_Stage', "Country_Region"])
    [['Treatment_Cost_USD']]
    .agg(['count', 'mean']).round(2).reset_index()
)
cost_by_cancer_stage_and_country

Unnamed: 0_level_0,Cancer_Stage,Country_Region,Treatment_Cost_USD,Treatment_Cost_USD
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean
0,Stage 0,Australia,991,51797.26
1,Stage 0,Brazil,1000,52664.3
2,Stage 0,Canada,978,52805.39
3,Stage 0,China,941,53896.76
4,Stage 0,Germany,1002,52445.3
5,Stage 0,India,981,52278.71
6,Stage 0,Pakistan,1013,52034.69
7,Stage 0,Russia,992,52938.11
8,Stage 0,UK,1007,52096.83
9,Stage 0,USA,984,52857.29


In [33]:
# Peek at the first five rows of the working frame before drilling down
df.head(5)

Unnamed: 0,Patient_ID,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score,Age_Group
0,PT0000000,71,Male,UK,2021,6.4,2.8,9.5,0.9,8.7,Lung,Stage III,62913.44,5.9,4.92,Above 70
1,PT0000001,34,Male,China,2021,1.3,4.5,3.7,3.9,6.3,Leukemia,Stage 0,12573.41,4.7,4.65,30 - 50
2,PT0000002,80,Male,Pakistan,2023,7.4,7.9,2.4,4.7,0.1,Breast,Stage II,6984.33,7.1,5.84,Above 70
3,PT0000003,40,Male,UK,2015,1.7,2.9,4.8,3.5,2.7,Colon,Stage I,67446.25,1.6,3.12,30 - 50
4,PT0000004,43,Female,Brazil,2017,5.1,2.8,2.3,6.7,0.5,Skin,Stage III,77977.12,2.9,3.62,30 - 50


In [None]:
# Lung-cancer patients in Canada versus Pakistan.
# Author's note: lung cancer is the most expensive cancer type; Canada had the
# least cases but Pakistan spent the least.

# Canada: rows that are both in Canada and diagnosed with lung cancer
mask_canada = df['Country_Region'].eq('Canada') & df['Cancer_Type'].eq('Lung')
df_canada = df.loc[mask_canada]
print(f"{df_canada.shape[0]} have lung cancer in Canada")

# Pakistan: same filter for the second country
mask_pakistan = df['Country_Region'].eq('Pakistan') & df['Cancer_Type'].eq('Lung')
df_pakistan = df.loc[mask_pakistan]
print(f"{df_pakistan.shape[0]} have lung cancer in Pakistan")

625 have lung cancer in Canada
574 have lung cancer in Pakistan


In [39]:
# Stage III cases in Canada and Pakistan, across ALL cancer types.
# The filter below is on Cancer_Stage only; it does not restrict to lung
# cancer, so these counts are not lung-specific.
# NOTE: mask_canada / df_canada / mask_pakistan / df_pakistan are reassigned
# here and replace the lung-cancer subsets built in the preceding cell.
mask_canada = (df['Country_Region'] == 'Canada') & (df['Cancer_Stage'] == 'Stage III')
df_canada = df[mask_canada]
print(f"{df_canada.shape[0]} have Stage III cancer in Canada")

mask_pakistan = (df['Country_Region'] == 'Pakistan') & (df['Cancer_Stage'] == 'Stage III')
df_pakistan = df[mask_pakistan]
print(f"{df_pakistan.shape[0]} have Stage III cancer in Pakistan")

943 have Stage III cancer in Canada
1017 have Stage III cancer in Pakistan
