Final project

In [124]:
import numpy as np
import pandas as pd
import pymysql as mysql
import matplotlib.pyplot as plt

### Cancer set

In [125]:
# Load the cancer dataset (Excel workbook) to be used in the target database.
# The path is relative, so the notebook must run from the folder holding cancer.xlsx.
cancer = pd.read_excel('cancer.xlsx')

In [126]:
# Preview the first 25 rows of the raw cancer data.
cancer.head(25)

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,Smoking,Passive Smoker,Chest Pain,Coughing of Blood,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1,2,3,4,Low
1,P10,17,1,3,1,5,3,4,2,2,2,2,4,2,3,1,3,7,8,6,2,1,7,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4,6,7,2,High
3,P1000,37,1,7,7,7,7,6,7,7,7,7,7,7,8,4,2,3,1,4,5,6,7,5,High
4,P101,46,1,6,8,7,7,7,6,7,7,8,7,7,9,3,2,4,1,4,2,4,2,3,High
5,P102,35,1,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4,6,7,2,High
6,P103,52,2,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1,2,3,4,Low
7,P104,28,2,3,1,4,3,2,3,4,3,1,4,3,1,3,2,2,4,2,2,3,4,3,Low
8,P105,35,2,4,5,6,5,6,5,5,5,6,6,6,5,1,4,3,2,4,6,2,4,1,Medium
9,P106,46,1,2,3,4,2,4,3,3,3,2,3,4,4,1,2,4,6,5,4,2,1,5,Medium


In [127]:
# Frequency of each distinct value in the raw "Passive Smoker" column.
cancer['Passive Smoker'].value_counts()

2    284
7    187
4    161
3    140
8    108
1     60
6     30
5     30
Name: Passive Smoker, dtype: int64

In [128]:
# Remove the patient identifier column; every other column is carried over.
cancer_drop = cancer.drop(columns='Patient Id')

In [129]:
# Swap every space in the column labels for an underscore (literal match, no
# regex), then preview the first two rows.
cancer_drop.columns = cancer_drop.columns.str.replace(' ', '_', regex=False)
cancer_drop.head(2)

Unnamed: 0,Age,Gender,Air_Pollution,Alcohol_use,Dust_Allergy,OccuPational_Hazards,Genetic_Risk,chronic_Lung_Disease,Balanced_Diet,Obesity,Smoking,Passive_Smoker,Chest_Pain,Coughing_of_Blood,Fatigue,Weight_Loss,Shortness_of_Breath,Wheezing,Swallowing_Difficulty,Clubbing_of_Finger_Nails,Frequent_Cold,Dry_Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,2,4,2,3,1,3,7,8,6,2,1,7,2,Medium


### Insurance set

In [130]:
# Load the insurance dataset (CSV) to be used in the target database.
# The path is relative, so the notebook must run from the folder holding insurance.csv.
insurance = pd.read_csv('insurance.csv')

In [131]:
# Preview the first rows of the raw insurance data.
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [132]:
# Convert bmi to a level of Low, Normal, or High, treating 18.5 up to (but not
# including) 25 as the normal range.
def bmi_category(bmi):
    """Bucket a single BMI value into "Low", "Normal" or "High".

    Parameters
    ----------
    bmi : float
        Body-mass index of one person.

    Returns
    -------
    str or float
        "Low" below 18.5, "Normal" from 18.5 up to but excluding 25, and
        "High" from 25 upward. A missing value (NaN/None) is returned as NaN
        instead of being given a label.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # BMI would fall through to the last branch and be reported as "High".
    if pd.isna(bmi):
        return np.nan
    if bmi < 18.5:
        return "Low"
    # The upper bound is an exclusive 25 rather than an inclusive 24.9, so
    # values such as 24.95 are not pushed into "High".
    if bmi < 25:
        return "Normal"
    return "High"
# Label each row's bmi with its category in a new "Bmi_cat" column, then preview.
insurance["Bmi_cat"] = insurance["bmi"].map(bmi_category)

insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Bmi_cat
0,19,female,27.9,0,yes,southwest,16884.924,High
1,18,male,33.77,1,no,southeast,1725.5523,High
2,28,male,33.0,3,no,southeast,4449.462,High
3,33,male,22.705,0,no,northwest,21984.47061,Normal
4,32,male,28.88,0,no,northwest,3866.8552,High


In [133]:
# Frequency of each distinct value in the insurance "smoker" column.
insurance['smoker'].value_counts()


no     1064
yes     274
Name: smoker, dtype: int64

In [134]:
# Numeric codes for the "sex" values: male -> 1, female -> 2.
# NOTE(review): assumes the cancer set's Gender column uses the same coding
# (1 = male, 2 = female) — confirm against that dataset's documentation.
encoding = dict(male=1, female=2)

# Look up each "sex" value in the table and store the code in a new "Gender"
# column; values absent from the table become NaN.
insurance["Gender"] = insurance["sex"].map(encoding)

In [135]:
# Rename "age" to "Age" so the column label matches the cancer set.
insurance = insurance.rename({"age": "Age"}, axis="columns")

In [136]:
# Preview the insurance data after adding Gender and renaming age to Age.
insurance.head()

Unnamed: 0,Age,sex,bmi,children,smoker,region,charges,Bmi_cat,Gender
0,19,female,27.9,0,yes,southwest,16884.924,High,2
1,18,male,33.77,1,no,southeast,1725.5523,High,1
2,28,male,33.0,3,no,southeast,4449.462,High,1
3,33,male,22.705,0,no,northwest,21984.47061,Normal,1
4,32,male,28.88,0,no,northwest,3866.8552,High,1


In [137]:
# Convert charges to a level of Low, Medium, or High: below 10000 is Low,
# 10000 to 25000 (both inclusive) is Medium, and above 25000 is High.
def charge_cat(charges):
    """Bucket a single insurance charge into "Low", "Medium" or "High".

    Parameters
    ----------
    charges : float
        Charge amount for one person.

    Returns
    -------
    str or float
        "Low" below 10000, "Medium" from 10000 through 25000 inclusive, and
        "High" above 25000. A missing value (NaN/None) is returned as NaN
        instead of being given a label.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # charge would fall through to the last branch and be reported as "High".
    if pd.isna(charges):
        return np.nan
    if charges < 10000:
        return "Low"
    if charges <= 25000:
        return "Medium"
    return "High"
# Label each row's charges with its category in a new "Charge_cat" column, then preview.
insurance["Charge_cat"] = insurance["charges"].map(charge_cat)

insurance.head()

Unnamed: 0,Age,sex,bmi,children,smoker,region,charges,Bmi_cat,Gender,Charge_cat
0,19,female,27.9,0,yes,southwest,16884.924,High,2,Medium
1,18,male,33.77,1,no,southeast,1725.5523,High,1,Low
2,28,male,33.0,3,no,southeast,4449.462,High,1,Low
3,33,male,22.705,0,no,northwest,21984.47061,Normal,1,Medium
4,32,male,28.88,0,no,northwest,3866.8552,High,1,Low


In [138]:
# Drop the raw sex, bmi and charges columns; the Gender, Bmi_cat and
# Charge_cat columns derived from them stay in the frame.
in_drop = insurance.drop(["sex", "bmi", "charges"], axis="columns")

### Merging the datasets into one combined table (split into normalized tables below)

In [139]:
# Lift the column display limit so wide frames show every column in the output.
pd.options.display.max_columns = None

In [140]:
# Inner-join the prepared cancer and insurance frames on the shared Age and
# Gender columns, then show the last 20 rows.
# NOTE(review): neither frame is shown to be unique on (Age, Gender), so this
# looks like a many-to-many join that pairs each cancer row with every
# insurance row of the same age and gender. Confirm that this cross-pairing is
# intended before treating a merged row as one real individual.
merged_df = cancer_drop.merge(in_drop, how="inner", on=["Age", "Gender"])
merged_df.tail(20)

Unnamed: 0,Age,Gender,Air_Pollution,Alcohol_use,Dust_Allergy,OccuPational_Hazards,Genetic_Risk,chronic_Lung_Disease,Balanced_Diet,Obesity,Smoking,Passive_Smoker,Chest_Pain,Coughing_of_Blood,Fatigue,Weight_Loss,Shortness_of_Breath,Wheezing,Swallowing_Difficulty,Clubbing_of_Finger_Nails,Frequent_Cold,Dry_Cough,Snoring,Level,children,smoker,region,Bmi_cat,Charge_cat
13437,25,1,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4,6,7,2,High,1,no,northeast,High,Low
13438,25,1,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4,6,7,2,High,0,yes,northwest,Normal,Medium
13439,25,1,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4,6,7,2,High,4,no,northwest,High,Low
13440,25,1,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4,6,7,2,High,3,yes,southwest,High,Medium
13441,25,1,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4,6,7,2,High,2,no,northeast,High,Medium
13442,25,1,4,5,6,5,5,4,6,7,2,3,4,8,8,7,9,2,1,4,6,7,2,High,2,yes,southeast,High,High
13443,29,1,6,7,7,7,7,6,7,7,7,7,7,7,2,7,6,7,6,7,2,3,1,High,0,no,southeast,High,Low
13444,29,1,6,7,7,7,7,6,7,7,7,7,7,7,2,7,6,7,6,7,2,3,1,High,2,no,northwest,High,Medium
13445,29,1,6,7,7,7,7,6,7,7,7,7,7,7,2,7,6,7,6,7,2,3,1,High,1,no,northeast,High,Low
13446,29,1,6,7,7,7,7,6,7,7,7,7,7,7,2,7,6,7,6,7,2,3,1,High,1,no,northeast,High,Medium


In [141]:
# Summarise the merged frame: row count, columns, non-null counts and dtypes.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13457 entries, 0 to 13456
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       13457 non-null  int64 
 1   Gender                    13457 non-null  int64 
 2   Air_Pollution             13457 non-null  int64 
 3   Alcohol_use               13457 non-null  int64 
 4   Dust_Allergy              13457 non-null  int64 
 5   OccuPational_Hazards      13457 non-null  int64 
 6   Genetic_Risk              13457 non-null  int64 
 7   chronic_Lung_Disease      13457 non-null  int64 
 8   Balanced_Diet             13457 non-null  int64 
 9   Obesity                   13457 non-null  int64 
 10  Smoking                   13457 non-null  int64 
 11  Passive_Smoker            13457 non-null  int64 
 12  Chest_Pain                13457 non-null  int64 
 13  Coughing_of_Blood         13457 non-null  int64 
 14  Fatigue               

In [142]:
# Collect every row of merged_df that exactly repeats an earlier row
# (the first occurrence of each row is not flagged).
duplicate_rows = merged_df.loc[merged_df.duplicated(keep="first")]

# Print the duplicate rows
print(duplicate_rows)

       Age  Gender  Air_Pollution  Alcohol_use  Dust_Allergy  \
26      33       1              2            4             5   
27      33       1              2            4             5   
28      33       1              2            4             5   
29      33       1              2            4             5   
30      33       1              2            4             5   
...    ...     ...            ...          ...           ...   
13435   25       1              4            5             6   
13436   25       1              4            5             6   
13442   25       1              4            5             6   
13447   29       1              6            7             7   
13455   29       1              6            7             7   

       OccuPational_Hazards  Genetic_Risk  chronic_Lung_Disease  \
26                        4             3                     2   
27                        4             3                     2   
28                        4   

In [143]:
# Drop the exact duplicate rows, renumber the index from 0, and summarise the result.
merged_df_drop = merged_df.drop_duplicates().reset_index(drop=True)
merged_df_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1730 entries, 0 to 1729
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1730 non-null   int64 
 1   Gender                    1730 non-null   int64 
 2   Air_Pollution             1730 non-null   int64 
 3   Alcohol_use               1730 non-null   int64 
 4   Dust_Allergy              1730 non-null   int64 
 5   OccuPational_Hazards      1730 non-null   int64 
 6   Genetic_Risk              1730 non-null   int64 
 7   chronic_Lung_Disease      1730 non-null   int64 
 8   Balanced_Diet             1730 non-null   int64 
 9   Obesity                   1730 non-null   int64 
 10  Smoking                   1730 non-null   int64 
 11  Passive_Smoker            1730 non-null   int64 
 12  Chest_Pain                1730 non-null   int64 
 13  Coughing_of_Blood         1730 non-null   int64 
 14  Fatigue                 

In [144]:
# Preview the first rows of the de-duplicated merged frame.
merged_df_drop.head()

Unnamed: 0,Age,Gender,Air_Pollution,Alcohol_use,Dust_Allergy,OccuPational_Hazards,Genetic_Risk,chronic_Lung_Disease,Balanced_Diet,Obesity,Smoking,Passive_Smoker,Chest_Pain,Coughing_of_Blood,Fatigue,Weight_Loss,Shortness_of_Breath,Wheezing,Swallowing_Difficulty,Clubbing_of_Finger_Nails,Frequent_Cold,Dry_Cough,Snoring,Level,children,smoker,region,Bmi_cat,Charge_cat
0,33,1,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1,2,3,4,Low,0,no,northwest,Normal,Medium
1,33,1,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1,2,3,4,Low,2,no,southeast,High,Low
2,33,1,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1,2,3,4,Low,0,no,northeast,High,Medium
3,33,1,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1,2,3,4,Low,1,no,southeast,High,Medium
4,33,1,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1,2,3,4,Low,5,no,southwest,High,Low


In [145]:
# Save the de-duplicated merged frame; index=False keeps the row index out of the file.
merged_df_drop.to_csv("combined.csv", index=False)

In [146]:
# Reload the combined table from combined.csv and summarise its structure.
df = pd.read_csv("combined.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1730 entries, 0 to 1729
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1730 non-null   int64 
 1   Gender                    1730 non-null   int64 
 2   Air_Pollution             1730 non-null   int64 
 3   Alcohol_use               1730 non-null   int64 
 4   Dust_Allergy              1730 non-null   int64 
 5   OccuPational_Hazards      1730 non-null   int64 
 6   Genetic_Risk              1730 non-null   int64 
 7   chronic_Lung_Disease      1730 non-null   int64 
 8   Balanced_Diet             1730 non-null   int64 
 9   Obesity                   1730 non-null   int64 
 10  Smoking                   1730 non-null   int64 
 11  Passive_Smoker            1730 non-null   int64 
 12  Chest_Pain                1730 non-null   int64 
 13  Coughing_of_Blood         1730 non-null   int64 
 14  Fatigue                 

### Split the dataset into multiple files matching target relational tables

### Person

In [147]:
# Person table: the age, gender, BMI level, children, smoker and charge level
# columns taken from the combined data.
# NOTE(review): no identifier column is kept, so these rows cannot be linked
# back to rows of the other split tables — confirm the target schema adds keys.
person_df = df.loc[:, ["Age", "Gender", "Bmi_cat", "children", "smoker", "Charge_cat"]]
person_df.head()

Unnamed: 0,Age,Gender,Bmi_cat,children,smoker,Charge_cat
0,33,1,Normal,0,no,Medium
1,33,1,High,2,no,Low
2,33,1,High,0,no,Medium
3,33,1,High,1,no,Medium
4,33,1,High,5,no,Low


In [148]:
# Collect every row of person_df that exactly repeats an earlier row
# (the first occurrence of each row is not flagged).
duplicate_rows = person_df.loc[person_df.duplicated(keep="first")]

# Print the duplicate rows
print(duplicate_rows)

      Age  Gender Bmi_cat  children smoker Charge_cat
7      33       1    High         5     no        Low
12     33       1    High         2     no        Low
13     33       1  Normal         0     no     Medium
14     33       1    High         2     no        Low
15     33       1    High         0     no     Medium
...   ...     ...     ...       ...    ...        ...
1709   25       1    High         0     no        Low
1715   25       1    High         4     no        Low
1722   29       1    High         0     no        Low
1728   29       1    High         1     no        Low
1729   29       1    High         2     no        Low

[1214 rows x 6 columns]


In [149]:
# Keep one copy of each distinct person row, renumber the index from 0, and summarise.
person_df = person_df.drop_duplicates().reset_index(drop=True)
person_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Age         516 non-null    int64 
 1   Gender      516 non-null    int64 
 2   Bmi_cat     516 non-null    object
 3   children    516 non-null    int64 
 4   smoker      516 non-null    object
 5   Charge_cat  516 non-null    object
dtypes: int64(3), object(3)
memory usage: 24.3+ KB


In [150]:
# Save the person table; index=False keeps the row index out of the file.
person_df.to_csv("person.csv", index=False)

### Factor

In [151]:
# Factor table: the nine risk-factor columns taken from the combined data.
factor_df = df.loc[
    :,
    [
        "Air_Pollution", "Alcohol_use", "Dust_Allergy", "OccuPational_Hazards",
        "Genetic_Risk", "Balanced_Diet", "Obesity", "Smoking", "Passive_Smoker",
    ],
]
factor_df.head()

Unnamed: 0,Air_Pollution,Alcohol_use,Dust_Allergy,OccuPational_Hazards,Genetic_Risk,Balanced_Diet,Obesity,Smoking,Passive_Smoker
0,2,4,5,4,3,2,4,3,2
1,2,4,5,4,3,2,4,3,2
2,2,4,5,4,3,2,4,3,2
3,2,4,5,4,3,2,4,3,2
4,2,4,5,4,3,2,4,3,2


In [152]:
# Keep one copy of each distinct factor row, renumber the index from 0, and summarise.
factor_df = factor_df.drop_duplicates().reset_index(drop=True)
factor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Air_Pollution         50 non-null     int64
 1   Alcohol_use           50 non-null     int64
 2   Dust_Allergy          50 non-null     int64
 3   OccuPational_Hazards  50 non-null     int64
 4   Genetic_Risk          50 non-null     int64
 5   Balanced_Diet         50 non-null     int64
 6   Obesity               50 non-null     int64
 7   Smoking               50 non-null     int64
 8   Passive_Smoker        50 non-null     int64
dtypes: int64(9)
memory usage: 3.6 KB


In [120]:
# Save the factor table; index=False keeps the row index out of the file.
# NOTE(review): this cell's recorded execution count (120) is lower than that
# of every cell before it (124 onward), so factor.csv may have been written
# from an earlier state of factor_df. Re-run it after Restart & Run All.
factor_df.to_csv("factor.csv", index=False)

### Symptom

In [153]:
# Symptom table: the twelve symptom columns taken from the combined data.
symptom_df = df.loc[
    :,
    [
        "chronic_Lung_Disease", "Chest_Pain", "Coughing_of_Blood", "Fatigue",
        "Weight_Loss", "Shortness_of_Breath", "Wheezing", "Swallowing_Difficulty",
        "Clubbing_of_Finger_Nails", "Frequent_Cold", "Dry_Cough", "Snoring",
    ],
]
symptom_df.head()

Unnamed: 0,chronic_Lung_Disease,Chest_Pain,Coughing_of_Blood,Fatigue,Weight_Loss,Shortness_of_Breath,Wheezing,Swallowing_Difficulty,Clubbing_of_Finger_Nails,Frequent_Cold,Dry_Cough,Snoring
0,2,2,4,3,4,2,2,3,1,2,3,4
1,2,2,4,3,4,2,2,3,1,2,3,4
2,2,2,4,3,4,2,2,3,1,2,3,4
3,2,2,4,3,4,2,2,3,1,2,3,4
4,2,2,4,3,4,2,2,3,1,2,3,4


In [165]:
# Keep one copy of each distinct symptom row, renumber the index from 0, and summarise.
symptom_df = symptom_df.drop_duplicates().reset_index(drop=True)
symptom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   chronic_Lung_Disease      46 non-null     int64
 1   Chest_Pain                46 non-null     int64
 2   Coughing_of_Blood         46 non-null     int64
 3   Fatigue                   46 non-null     int64
 4   Weight_Loss               46 non-null     int64
 5   Shortness_of_Breath       46 non-null     int64
 6   Wheezing                  46 non-null     int64
 7   Swallowing_Difficulty     46 non-null     int64
 8   Clubbing_of_Finger_Nails  46 non-null     int64
 9   Frequent_Cold             46 non-null     int64
 10  Dry_Cough                 46 non-null     int64
 11  Snoring                   46 non-null     int64
dtypes: int64(12)
memory usage: 4.4 KB


In [166]:
# Save the symptom table; index=False keeps the row index out of the file.
symptom_df.to_csv("symptom.csv", index=False)

### Region

In [161]:
# Region table: each region paired with the charge level seen alongside it.
region_df = df.loc[:, ["region", "Charge_cat"]]
region_df.head()

Unnamed: 0,region,Charge_cat
0,northwest,Medium
1,southeast,Low
2,northeast,Medium
3,southeast,Medium
4,southwest,Low


In [162]:
# Keep one copy of each distinct (region, Charge_cat) pair, renumber the index
# from 0, and summarise.
region_df = region_df.drop_duplicates().reset_index(drop=True)
region_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   region      12 non-null     object
 1   Charge_cat  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [164]:
# Save the region table; index=False keeps the row index out of the file.
region_df.to_csv("region.csv", index=False)

### Cancer Level

In [169]:
# Cancer-level table: each cancer Level paired with the charge level seen alongside it.
level_df = df.loc[:, ["Level", "Charge_cat"]]
level_df.head()

Unnamed: 0,Level,Charge_cat
0,Low,Medium
1,Low,Low
2,Low,Medium
3,Low,Medium
4,Low,Low


In [170]:
# Keep one copy of each distinct (Level, Charge_cat) pair, renumber the index
# from 0, and summarise.
level_df = level_df.drop_duplicates().reset_index(drop=True)
level_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Level       9 non-null      object
 1   Charge_cat  9 non-null      object
dtypes: object(2)
memory usage: 272.0+ bytes


In [171]:
# Save the cancer-level table; index=False keeps the row index out of the file.
level_df.to_csv("level.csv", index=False)