In [1]:
from pathlib import Path

import pandas as pd

# Part 1: Datasets Combination

## Load the datasets and keep only the columns of interest from each dataset

In [2]:
# Columns kept from every dataset: the SEQN identifier plus three MCQ items.
# NOTE(review): the meaning of MCQ220 / MCQ010 / MCQ053 is not shown in this
# notebook — confirm against the survey codebook.
columns_to_retain = [
    "SEQN",
    "MCQ220",
    "MCQ010",
    "MCQ053",
]

In [3]:
# Read the raw file for this release.
cancer2015 = pd.read_sas("MCQ_H (1).XPT October 2015.XPT")

# Keep only the retained columns, then replace every missing entry with its
# column's median and round all values to zero decimal places.
# NOTE(review): the displayed answers look like category codes (1.0 / 2.0);
# confirm that median imputation is appropriate for them.
cancer_2015 = cancer2015.loc[:, columns_to_retain].pipe(
    lambda subset: subset.fillna(subset.median()).round()
)

In [4]:
# Read the raw file for this release.
cancer2017 = pd.read_sas("MCQ_I.XPT September 2017.XPT")

# Keep only the retained columns, then replace every missing entry with its
# column's median and round all values to zero decimal places.
cancer_2017 = cancer2017.loc[:, columns_to_retain].pipe(
    lambda subset: subset.fillna(subset.median()).round()
)

In [5]:
# Read the raw file for this release.
cancer2020 = pd.read_sas("MCQ_J.XPT February 2020.XPT")

# Keep only the retained columns, then replace every missing entry with its
# column's median and round all values to zero decimal places.
cancer_2020 = cancer2020.loc[:, columns_to_retain].pipe(
    lambda subset: subset.fillna(subset.median()).round()
)

In [6]:
# Read the raw file for this release.
cancer2021 = pd.read_sas("P_MCQ.XPT August 2021.XPT")

# Keep only the retained columns, then replace every missing entry with its
# column's median and round all values to zero decimal places.
cancer_2021 = cancer2021.loc[:, columns_to_retain].pipe(
    lambda subset: subset.fillna(subset.median()).round()
)

## Since the SEQN column is the identifier among the datasets, let's check that its values are continuous from one dataset to the next

In [7]:
# Print, one per line, the SEQN at each hand-over point between consecutive
# datasets: the last SEQN of one frame followed by the first SEQN of the next.
print(
    cancer_2015["SEQN"].tail(1).tolist()[0],
    cancer_2017["SEQN"].head(1).tolist()[0],
    cancer_2017["SEQN"].tail(1).tolist()[0],
    cancer_2020["SEQN"].head(1).tolist()[0],
    cancer_2020["SEQN"].tail(1).tolist()[0],
    cancer_2021["SEQN"].head(1).tolist()[0],
    sep="\n",
)

83731.0
83732.0
93702.0
93703.0
102956.0
109263.0


In [8]:
# Merge the datasets and clean the missing values

In [9]:
# Stack the four per-release frames into one table with a fresh 0..n-1 index.
source_frames = [cancer_2015, cancer_2017, cancer_2020, cancer_2021]
df_cancer = pd.concat(source_frames, ignore_index=True)

# Row count the merged table is expected to have: the sum of its inputs.
expected_rows = sum(len(frame) for frame in source_frames)
print("Number of rows after joining the datasets should be:", expected_rows)

# The message above only states the expectation; verify it against the merged
# frame so a mismatch stops the notebook instead of going unnoticed.
assert len(df_cancer) == expected_rows, (
    f"merged frame has {len(df_cancer)} rows, expected {expected_rows}"
)

Number of rows after joining the datasets should be: 43228


In [10]:
# Display the merged frame to inspect the result of the concatenation.
df_cancer

Unnamed: 0,SEQN,MCQ220,MCQ010,MCQ053
0,73557.0,2.0,2.0,2.0
1,73558.0,2.0,1.0,2.0
2,73559.0,1.0,2.0,2.0
3,73560.0,2.0,2.0,2.0
4,73561.0,2.0,2.0,2.0
...,...,...,...,...
43223,124818.0,2.0,2.0,2.0
43224,124819.0,2.0,2.0,2.0
43225,124820.0,2.0,2.0,2.0
43226,124821.0,2.0,1.0,2.0


In [11]:
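# Mean imputation on the merged frame, left disabled (each dataset is already
# median-filled when it is loaded):
# df_cancer = df_cancer.fillna(df_cancer.mean()).round()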

In [12]:
# Count the missing entries in each column of the merged frame.
df_cancer.isna().sum(axis=0)

SEQN      0
MCQ220    0
MCQ010    0
MCQ053    0
dtype: int64

In [13]:
# Row dropping is left disabled (the null count above reports zero missing values):
# df_cancer.dropna(inplace=True)

# Display the frame again.
df_cancer

Unnamed: 0,SEQN,MCQ220,MCQ010,MCQ053
0,73557.0,2.0,2.0,2.0
1,73558.0,2.0,1.0,2.0
2,73559.0,1.0,2.0,2.0
3,73560.0,2.0,2.0,2.0
4,73561.0,2.0,2.0,2.0
...,...,...,...,...
43223,124818.0,2.0,2.0,2.0
43224,124819.0,2.0,2.0,2.0
43225,124820.0,2.0,2.0,2.0
43226,124821.0,2.0,1.0,2.0


In [14]:
# Final check before saving: count the missing entries in each column.
df_cancer.isna().sum(axis=0)

SEQN      0
MCQ220    0
MCQ010    0
MCQ053    0
dtype: int64

# Part 3: Save the cleaned dataset

In [15]:
# Write the merged table to CSV without the row index.
output_path = Path("cleaned_data/cleaned_cancer_data.csv")

# to_csv does not create missing folders, so make sure the target directory
# exists first; this keeps the cell working on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

df_cancer.to_csv(output_path, index=False)