In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

In [2]:
# Load the raw survey extract from a path relative to the working directory.
# NOTE(review): the preview of `data` lists an 'Unnamed: 0' column, which looks
# like a row index that was saved into the CSV - confirm whether the file
# should be read with index_col=0 instead.
csv_path = r'../data/2022.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the raw frame to sanity-check the load.
data.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,_FRTRES1,_VEGRES1,_FRUTSU1,_VEGESU1,_FRTLT1A,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1
0,1.0,1.0,1192021,1,19,2021,1100.0,2021000001,2021000000.0,1.0,...,1.0,1.0,100.0,214.0,1.0,1.0,1.0,1.0,0.0,0.0
1,1.0,1.0,1212021,1,21,2021,1100.0,2021000002,2021000000.0,1.0,...,1.0,1.0,100.0,128.0,1.0,1.0,1.0,1.0,0.0,0.0
2,1.0,1.0,1212021,1,21,2021,1100.0,2021000003,2021000000.0,1.0,...,1.0,1.0,100.0,71.0,1.0,2.0,1.0,1.0,0.0,0.0
3,1.0,1.0,1172021,1,17,2021,1100.0,2021000004,2021000000.0,1.0,...,1.0,1.0,114.0,165.0,1.0,1.0,1.0,1.0,0.0,0.0
4,1.0,1.0,1152021,1,15,2021,1100.0,2021000005,2021000000.0,1.0,...,1.0,1.0,100.0,258.0,1.0,1.0,1.0,1.0,0.0,0.0


In [4]:
# (rows, columns) of the raw frame.
data.shape

(438693, 303)

## Data Preprocessing

In [5]:
# Work on a separate deep copy so the raw `data` frame stays unchanged.
df = data.copy(deep=True)

In [None]:
# Number of missing (NaN) values in each column of df.
x = df.isna().sum()

In [None]:
# Display the per-column missing-value counts.
x

In [None]:
# Count the columns in which more than 90% of the values are missing.
# The percentage is taken against the current number of rows in df instead of
# a hard-coded 438693, so the cell stays correct if the input data changes.
missing_pct = x / len(df) * 100
count = int((missing_pct > 90).sum())
print(count)

In [None]:
# Keep only the columns whose missing-value count is below 90% of the rows;
# columns that are empty in 90% or more of the rows are dropped.
missing_per_column = df.isna().sum()
max_missing = 0.90 * df.shape[0]
df = df.loc[:, missing_per_column < max_missing]

In [None]:
# Shape after removing the mostly-empty columns.
df.shape

In [None]:
# CHCOCNCR (other types of cancer) and CHCSCNCR (skin cancer) are
# chronic-condition fields with the following codes:
#   1 - Yes
#   2 - No
#   7 - Don't Know
#   9 - Refused
# Keep a row when at least one of the two fields holds a definite answer (1 or 2).
# NOTE(review): because the two tests are OR-ed, a row is kept even when the
# other field is 7, 9 or missing - confirm that this is intended.
definite_answers = [1, 2]
skin_answered = df['CHCSCNCR'].isin(definite_answers)
other_answered = df['CHCOCNCR'].isin(definite_answers)
df = df.loc[skin_answered | other_answered]


In [None]:
# Shape after keeping only the rows with a definite cancer answer.
df.shape

In [None]:
# Build 0/1 label arrays from the cancer answers (code 1 means "Yes").
has_skin_cancer = df['CHCSCNCR'] == 1
has_other_cancer = df['CHCOCNCR'] == 1
skin_cancer_count = np.where(has_skin_cancer, 1, 0)    # skin cancer
other_cancer_count = np.where(has_other_cancer, 1, 0)  # other cancer
# Combined label: 1 when either kind of cancer was reported, otherwise 0.
skin_other_count = np.where(has_skin_cancer | has_other_cancer, 1, 0)


In [None]:
# Remove both cancer answer columns so they cannot be used as features;
# their information is already captured in the label arrays.
df = df.drop(columns=['CHCSCNCR', 'CHCOCNCR'])

In [None]:
# Shape after removing the two cancer answer columns.
df.shape

### According to the codebook, some columns are not related to health, so it is better to remove them

In [None]:
# Columns judged unrelated to health: FMONTH, DISPCODE, SEQNO, PVTRESD1,
# HHADULT, STATERE1 and CELPHON1.
non_health_cols = ['FMONTH', 'DISPCODE', 'SEQNO', 'PVTRESD1', 'HHADULT', 'STATERE1', 'CELPHON1']
# Interview-date columns: IMONTH, IDAY, IDATE and IYEAR.
interview_date_cols = ['IMONTH', 'IDAY', 'IDATE', 'IYEAR']
df = df.drop(columns=non_health_cols + interview_date_cols)

## Heatmaps are a good way to find the correlation between columns. This way we can remove the highly correlated columns.

In [None]:
# Replace NaN values with the median of each column (the median, not the mode).
# numeric_only=True keeps this working on pandas >= 2.0, where median() raises
# if a non-numeric column is present; such columns are simply left unfilled.
column_medians = df.median(numeric_only=True)
df2 = df.fillna(column_medians)
# Absolute pairwise correlation matrix, computed over the numeric columns only
# so that a stray non-numeric column cannot make corr() fail.
corr_mat = df2.select_dtypes(include=['number', 'bool']).corr().abs()

In [None]:
# Display the absolute correlation matrix.
corr_mat

In [None]:
# Flag one column out of every pair whose absolute correlation exceeds the
# threshold. Only the strict lower triangle is inspected, so each pair is seen
# once and the diagonal (self-correlation) is ignored; for a pair (i, j) with
# j < i, the earlier column j is the one flagged.
# Compared with a nested loop over range(1, n - 1), this also covers the last
# row of the matrix, lists every flagged column only once, and avoids
# positional Series[int] indexing.
CORR_THRESHOLD = 0.85
lower_triangle = np.tril(corr_mat.to_numpy(), k=-1)
is_high = lower_triangle > CORR_THRESHOLD  # NaN correlations compare as False
high_corrcol = corr_mat.columns[is_high.any(axis=0)].tolist()
high_corrcol

In [None]:
# Remove the columns flagged as highly correlated with another column.
df2 = df2.drop(columns=high_corrcol)

In [None]:
# Shape after dropping the highly correlated columns.
df2.shape

In [None]:
# Re-attach the combined cancer label as the last column of the frame.
# `assign` returns a new frame and overwrites an existing 'Cancerlabels'
# column, so re-running this cell does not raise the "already exists" error
# that an in-place `insert` of the same column would.
df2 = df2.assign(Cancerlabels=skin_other_count)

In [None]:
# Shape after adding the label column.
df2.shape

In [None]:
# Snapshot the preprocessed frame (features plus label) as an independent copy.
final_data = df2.copy(deep=True)

# Data Visualization

## Relationship between health condition and cancer
The data includes a chronic health condition field, which helps us understand how chronic health conditions are related to cancer.

Field name for the chronic health condition: `HAVARTH3`


In [None]:
# Separate deep copy for plotting so that `final_data` is left unchanged.
data_visual = final_data.copy(deep=True)