In [40]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import pandas as pd
pd.set_option('display.max_columns', None) # show all columns
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sweetviz as sv

# Load Data
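Since the `.csv` file is quite large, set `low_memory` to `False` so that pandas reads each column in one pass and infers a single, consistent dtype for it (at the cost of more memory while parsing).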

In [41]:
# Read the whole export in one go; low_memory=False turns off pandas'
# chunked parsing of the file.
csv_path = './data/export.csv'
df = pd.read_csv(csv_path, low_memory=False)

# Preprocessing Data
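[1] replace spaces in column names with `'_'` <br>
[2] lowercase the column names <br>
[3] replace the placeholder strings used for missing values (`"Blank(s)"`, `"Recode not available"`, `"Unknown"`) with `np.nan` and remove columns with more than `50%` NaN values <br>
[4] lowercase the data <br>
[5] convert columns to categorical codes by using unique values, and then follow the `Data Distribution` guidelines below (columns whose codes have a skewness outside `-0.5` to `0.5` are dropped) <br>
[6] remove duplicated rows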
### Data Distribution
#### Skewness
Skewness is usually described as a measure of a dataset’s symmetry – or lack of symmetry. A perfectly symmetrical data set will have a skewness of 0. The normal distribution has a skewness of 0. <br>
The rule of thumb for skewness is:<br>
[-] If the skewness is between `-0.5` and `0.5`, the data are fairly symmetrical<br>
[-] If the skewness is between `-1` and `-0.5` or between `0.5` and `1`, the data are moderately skewed<br>
[-] If the skewness is less than `-1` or greater than `1`, the data are highly skewed<br>

In [42]:
# Placeholder strings that the export uses where a value is missing.
MISSING_MARKERS = ["Blank(s)", "Recode not available", "Unknown"]
# A column is kept only if strictly more than this share of its rows is non-missing.
MIN_PRESENT_FRACTION = 0.5
# Columns whose category codes have |skewness| above this are dropped
# (i.e. both "moderately" and "highly" skewed columns go).
MAX_ABS_SKEW = 0.5

# Normalise column names: spaces -> '_' and lowercase.
df.columns = [col.replace(' ', '_').lower() for col in df.columns]

# Turn the placeholder strings into real NaN, then drop the columns that are
# mostly missing (`thresh` is the minimum number of non-NaN values required).
df = df.replace(to_replace=MISSING_MARKERS, value=np.nan)
df = df.dropna(axis=1, thresh=int(MIN_PRESENT_FRACTION * df.shape[0] + 1))

# Lowercase all data. Every column becomes a string column here.
# NOTE(review): astype(str) also turns the remaining NaN into the literal
# string 'nan', so missing values are encoded as their own category below -
# confirm this is intended.
df = df.apply(lambda values: values.astype(str).str.lower())

# Encode each column as integer category codes and use `.skew()` on the codes
# to decide which columns to remove. Iterate over a snapshot of the column
# labels because the frame is modified inside the loop.
for column in list(df.columns):
    # A constant column carries no information.
    if df[column].nunique(dropna=False) == 1:
        df = df.drop(columns=column)
        continue

    cat_column = column + "_cat"
    df[cat_column] = df[column].astype('category').cat.codes

    # A NaN skewness compares False, so such columns are kept.
    if abs(df[cat_column].skew()) > MAX_ABS_SKEW:
        # Use the `columns=` keyword: passing the axis positionally
        # (`df.drop(labels, 1)`) is not accepted by current pandas.
        df = df.drop(columns=[column, cat_column])

# Remove duplicated rows, keeping the first occurrence.
df = df.drop_duplicates(keep='first')

In [43]:
# Show the preprocessed frame as this cell's output.
df

Unnamed: 0,age_recode_with_<1_year_olds,sex,year_of_diagnosis,prcda_2017,laterality,diagnostic_confirmation,seer_historic_stage_a_(1973-2015),site_specific_surgery_(1973-1997_varying_detail_by_year_and_site),regional_nodes_examined_(1988+),cod_to_site_recode,cod_to_site_rec_km,age_recode_with_single_ages_and_85+,ss_seq_#_1992+_-_mal+ins_(most_detail),ss_seq_#_2000+_-_mal+ins_(most_detail),ss_seq_#_1992+_-_mal_(most_detail),ss_seq_#_2000+_-_mal_(most_detail),ss_seq_#_1992+_-_mal+ins_(mid_detail),ss_seq_#_2000+_-_mal+ins_(mid_detail),ss_seq_#_1992+_-_mal_(mid_detail),ss_seq_#_2000+_-_mal_(mid_detail),ss_seq_#_1992+_-_mal+ins_(least_detail),ss_seq_#_2000+_-_mal+ins_(least_detail),ss_seq_#_1992+_-_mal_(least_detail),ss_seq_#_2000+_-_mal_(least_detail),patient_id,rural-urban_continuum_code,age_recode_with_<1_year_olds_cat,sex_cat,year_of_diagnosis_cat,prcda_2017_cat,laterality_cat,diagnostic_confirmation_cat,seer_historic_stage_a_(1973-2015)_cat,site_specific_surgery_(1973-1997_varying_detail_by_year_and_site)_cat,regional_nodes_examined_(1988+)_cat,cod_to_site_recode_cat,cod_to_site_rec_km_cat,age_recode_with_single_ages_and_85+_cat,ss_seq_#_1992+_-_mal+ins_(most_detail)_cat,ss_seq_#_2000+_-_mal+ins_(most_detail)_cat,ss_seq_#_1992+_-_mal_(most_detail)_cat,ss_seq_#_2000+_-_mal_(most_detail)_cat,ss_seq_#_1992+_-_mal+ins_(mid_detail)_cat,ss_seq_#_2000+_-_mal+ins_(mid_detail)_cat,ss_seq_#_1992+_-_mal_(mid_detail)_cat,ss_seq_#_2000+_-_mal_(mid_detail)_cat,ss_seq_#_1992+_-_mal+ins_(least_detail)_cat,ss_seq_#_2000+_-_mal+ins_(least_detail)_cat,ss_seq_#_1992+_-_mal_(least_detail)_cat,ss_seq_#_2000+_-_mal_(least_detail)_cat,patient_id_cat,rural-urban_continuum_code_cat
0,45-49 years,female,1980,not prcda,left - origin of primary,positive histology,,09,,miscellaneous malignant cancer,miscellaneous malignant cancer,45 years,not applicable (1973-1991 diagnosis),not applicable (1973-1999 diagnosis),not applicable (1973-1991 diagnosis),not applicable (1973-1999 diagnosis),not applicable (1973-1991 diagnosis),not applicable (1973-1999 diagnosis),not applicable (1973-1991 diagnosis),not applicable (1973-1999 diagnosis),not applicable (1973-1991 diagnosis),not applicable (1973-1999 diagnosis),not applicable (1973-1991 diagnosis),not applicable (1973-1999 diagnosis),105,unknown/missing/no match/not 1990-2018,10,0,5,0,0,5,2,8,53,42,43,38,4,4,4,4,4,4,4,4,4,4,4,4,7004,5
1,80-84 years,female,1997,not prcda,left - origin of primary,clinical diagnosis only,unstaged,00,99,chronic obstructive pulmonary disease and alli...,chronic obstructive pulmonary disease and alli...,81 years,01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),188,counties in metropolitan areas ge 1 million pop,17,0,22,0,0,0,4,0,52,19,19,74,0,4,0,4,0,4,0,4,0,4,0,4,31408,0
2,70-74 years,female,1994,not prcda,right - origin of primary,positive histology,regional,02,00,lung and bronchus,lung and bronchus,70 years,01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),415,counties in metropolitan areas ge 1 million pop,15,0,19,0,1,5,3,2,0,40,40,63,0,4,0,4,0,4,0,4,0,4,0,4,62923,0
3,50-54 years,male,1993,not prcda,left - origin of primary,positive histology,distant,01,00,brain and other nervous system,brain and other nervous system,52 years,01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),676,counties in metropolitan areas ge 1 million pop,11,1,18,0,0,5,0,1,0,11,11,45,0,4,0,4,0,4,0,4,0,4,0,4,77181,0
4,70-74 years,female,1998,not prcda,right - origin of primary,positive histology,distant,,00,chronic lymphocytic leukemia,chronic lymphocytic leukemia,71 years,01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),01,not applicable (1973-1999 diagnosis),1383,counties in metropolitan areas ge 1 million pop,15,0,23,0,1,5,0,19,0,17,17,64,0,4,0,4,0,4,0,4,0,4,0,4,17221,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79969,85+ years,female,2018,not prcda,right - origin of primary,positive histology,,,00,alive,alive,85+ years,01,01,01,01,01,01,01,01,01,01,01,01,30636583,counties in metropolitan areas ge 1 million pop,18,0,43,0,1,5,2,19,0,5,5,78,0,0,0,0,0,0,0,0,0,0,0,0,61372,0
79970,70-74 years,male,2018,not prcda,right - origin of primary,radiography without microscopic confirm,,,00,lung and bronchus,lung and bronchus,70 years,01,01,01,01,01,01,01,01,01,01,01,01,30637083,counties in metropolitan areas ge 1 million pop,15,1,43,0,1,8,2,19,0,40,40,63,0,0,0,0,0,0,0,0,0,0,0,0,61373,0
79971,70-74 years,male,2018,not prcda,left - origin of primary,radiography without microscopic confirm,,,00,lung and bronchus,lung and bronchus,70 years,02,02,02,02,02,02,02,02,02,02,02,02,30637083,counties in metropolitan areas ge 1 million pop,15,1,43,0,0,8,2,19,0,40,40,63,1,1,1,1,1,1,1,1,1,1,1,1,61373,0
79972,65-69 years,female,2018,not prcda,right - origin of primary,positive histology,,,00,lung and bronchus,lung and bronchus,67 years,01,01,01,01,01,01,01,01,01,01,01,01,30637123,counties in metropolitan areas ge 1 million pop,14,0,43,0,1,5,2,19,0,40,40,60,0,0,0,0,0,0,0,0,0,0,0,0,61374,0


# Data Visualization

In [39]:
# Build a sweetviz report for the leading columns of the frame and save it as HTML.
# NOTE(review): 28 is a hard-coded column count - confirm it still selects
# the columns this report is meant to cover.
leading_columns = df.iloc[:, :28]
analyze_report = sv.analyze(leading_columns)
analyze_report.show_html('analyze.html')  # open_browser is left at its default

                                             |          | [  0%]   00:00 -> (? left)

Report analyze.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [None]:
def plot_laterality_by_sex(frame, side_col='laterality', sex_col='sex'):
    """Draw the laterality breakdown per sex as one three-panel figure.

    Panels, left to right: pie chart of `side_col` counts for rows where
    `sex_col` is 'male', the same for 'female', and a stacked bar chart of
    counts per `side_col` value split by `sex_col`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains `side_col` and `sex_col`.
    side_col : str
        Column whose values are counted.
    sex_col : str
        Column used to split the rows; the pies select the lowercase
        values 'male' and 'female'.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the three panels.
    """
    fig, (ax_male, ax_female, ax_bar) = plt.subplots(1, 3, figsize=(20, 10))

    pie_panels = (
        ('male', ax_male, 'Male Cancer Lung Side'),
        ('female', ax_female, 'Female Cancer Lung Side'),
    )
    for sex_value, ax, title in pie_panels:
        counts = frame.loc[frame[sex_col] == sex_value, side_col].value_counts()
        # Name the column explicitly: what `value_counts().to_frame()` calls
        # it differs between pandas versions, so `y=` must not rely on it.
        counts.to_frame(name=side_col).plot(kind='pie',
                                            y=side_col,
                                            shadow=True,
                                            autopct='%1.1f%%',
                                            startangle=90,
                                            ax=ax)
        ax.set_title(title, size=16)

    # Rows: side_col values, columns: sex_col values; missing combinations
    # count as 0 so the bars stack cleanly.
    per_side = frame.groupby([side_col, sex_col]).size().unstack(sex_col, fill_value=0)
    per_side.plot(kind='bar', stacked=True, ax=ax_bar)
    ax_bar.set_title('Gender cases per laterality', size=16)

    fig.tight_layout()
    return fig


# The same figure used to be produced twice by two pasted copies of the
# plotting code; it is now drawn once through the helper.
plot_laterality_by_sex(df)
plt.show()


# for column in df:
#     df[column].value_counts().plot(kind='bar');
    

#     plt.show()


In [None]:
# Sweetviz report over the whole frame with pairwise analysis switched off.
# The report is written to 'analyze.html' and opened in the browser.
report_options = {'pairwise_analysis': 'off'}
analyze_report = sv.analyze(df, **report_options)
analyze_report.show_html('analyze.html', open_browser=True)

In [None]:
# Show the current state of the frame.
df

In [None]:
# The preprocessed frame shown earlier in the notebook has no 'gender' or
# 'lung_side' columns; the matching columns are 'sex' and 'laterality'.
# NOTE(review): if a rename to 'gender' / 'lung_side' is intended instead,
# change these two names back after adding that rename.
sex_col, side_col = 'sex', 'laterality'

# One figure, three panels: pie per sex, then a stacked bar of all cases.
fig, (ax_male, ax_female, ax_bar) = plt.subplots(1, 3, figsize=(20, 10))

pie_panels = (
    ('male', ax_male, 'Male Cancer Lung Side'),
    ('female', ax_female, 'Female Cancer Lung Side'),
)
for sex_value, ax, title in pie_panels:
    side_counts = df.loc[df[sex_col] == sex_value, side_col].value_counts()
    # Name the column explicitly: what `value_counts().to_frame()` calls it
    # differs between pandas versions, so `y=` must not rely on it.
    side_counts.to_frame(name=side_col).plot(kind='pie',
                                             y=side_col,
                                             shadow=True,
                                             autopct='%1.1f%%',
                                             startangle=90,
                                             ax=ax)
    ax.set_title(title, size=16)

# Rows: lung side, columns: sex; missing combinations count as 0.
cases_per_side = df.groupby([side_col, sex_col]).size().unstack(sex_col, fill_value=0)
cases_per_side.plot(kind='bar', stacked=True, ax=ax_bar)
ax_bar.set_title('Gender cases per lung side', size=16)

fig.tight_layout()
plt.show()

In [None]:
# Show the current state of the frame.
df