In [1]:
import json
import pandas as pd

with open('tobacco-problem-0001-of-0001.json') as f:
    data = json.load(f)

# Tobacco Problem Reports dataset

Reports of tobacco users experiencing health problems from using tobacco.

In [2]:
df = pd.DataFrame(data['results'])[:1000]
df.head()

Unnamed: 0,date_submitted,nonuser_affected,reported_health_problems,number_tobacco_products,report_id,number_health_problems,reported_product_problems,tobacco_products,number_product_problems
0,08/09/2019,No,"[Lip injury, Mouth injury, Nose injury, Bleedi...",1,944,9,[No information provided],"[Electronic cigarette, electronic nicotine or ...",0
1,08/09/2019,No,"[Tonic-clonic seizures, Seizure cluster]",1,1199,2,[No information provided],"[Electronic cigarette, electronic nicotine or ...",0
2,08/08/2019,No,[Seizure],1,923,1,[No information provided],"[Electronic cigarette, electronic nicotine or ...",0
3,07/25/2019,Yes,"[Diarrhea, Burning eyes, Chest cold, Daydreami...",1,908,11,[No information provided],"[Electronic cigarette, electronic nicotine or ...",0
4,01/22/2017,No,[Chemical burn],1,247,1,"[Exploded, caught on fire, or burned abnormally]","[Electronic cigarette, electronic nicotine or ...",1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   date_submitted             1000 non-null   object
 1   nonuser_affected           1000 non-null   object
 2   reported_health_problems   1000 non-null   object
 3   number_tobacco_products    1000 non-null   int64 
 4   report_id                  1000 non-null   int64 
 5   number_health_problems     1000 non-null   int64 
 6   reported_product_problems  1000 non-null   object
 7   tobacco_products           1000 non-null   object
 8   number_product_problems    1000 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 70.4+ KB


In [4]:
df.describe()

Unnamed: 0,number_tobacco_products,report_id,number_health_problems,number_product_problems
count,1000.0,1000.0,1000.0,1000.0
mean,1.043,956248.2,1.882,0.899
std,0.281479,1050673.0,2.54923,1.713399
min,1.0,225.0,0.0,0.0
25%,1.0,780.75,1.0,0.0
50%,1.0,1185.5,1.0,0.0
75%,1.0,2089348.0,2.0,1.0
max,5.0,2179397.0,40.0,22.0


In [5]:
df.isna().sum()

date_submitted               0
nonuser_affected             0
reported_health_problems     0
number_tobacco_products      0
report_id                    0
number_health_problems       0
reported_product_problems    0
tobacco_products             0
number_product_problems      0
dtype: int64

# Normalization (lists, lower-case, null values)

In [6]:
df['nonuser_affected'] = df['nonuser_affected'].apply(lambda x: x.lower() if x != 'No information provided' else None)

In [7]:
df['reported_health_problems'] = df['reported_health_problems'].apply(lambda x: None if x[0]=='No information provided' else [label.lower() for label in x])

In [8]:
df['reported_product_problems'] = df['reported_product_problems'].apply(lambda x: None if x[0]=='No information provided' else [label.lower() for label in x])

In [9]:
df['tobacco_products'] = df['tobacco_products'].apply(lambda x: [label.lower() for label in x])

In [10]:
df.isna().sum()

date_submitted                 0
nonuser_affected              99
reported_health_problems     181
number_tobacco_products        0
report_id                      0
number_health_problems         0
reported_product_problems    588
tobacco_products               0
number_product_problems        0
dtype: int64

# One-hot encoding

In [11]:
def encoder(row, uniques):
    return {label: int(label in row) for label in uniques}

## Nonuser_affected [String]

- Creating a copy of the dataframe where the missing values of the chosen column are dropped since None type data cannot be iterated

In [15]:
df_affected_filtered = df.dropna(subset='nonuser_affected')

In [16]:
affected = set()

for i in df_affected_filtered['nonuser_affected']:
    if i == None:
        continue
    affected.add(i)
affected

{'neither', 'no', 'unknown', 'yes'}

In [17]:
nonuser_affected_df = pd.DataFrame(df_affected_filtered['nonuser_affected'].apply(lambda x: encoder(x, affected)).to_list())

nonuser_affected_df.insert(loc=0, column='report_id', value=df_affected_filtered['report_id'])

In [18]:
nonuser_affected_df.head()

Unnamed: 0,report_id,unknown,neither,yes,no
0,944.0,0,0,0,1
1,1199.0,0,0,0,1
2,923.0,0,0,0,1
3,908.0,0,0,1,0
4,247.0,0,0,0,1


## Reported health problems [List of strings]

- Creating a copy of the dataframe where the missing values of the chosen column are dropped since None type data cannot be iterated

In [19]:
df_health_problems_filtered = df.dropna(subset='reported_health_problems')
len(df_health_problems_filtered)

819

In [20]:
health_problems = set()

for i in df_health_problems_filtered['reported_health_problems']:
    for j in i:
        health_problems.add(j)

In [21]:
health_problems_df = pd.DataFrame(df_health_problems_filtered['reported_health_problems'].apply(lambda x: encoder(x, health_problems)).to_list())

health_problems_df.insert(loc=0, column='report_id', value=df_health_problems_filtered['report_id'])

In [22]:
health_problems_df.head()

Unnamed: 0,report_id,complex partial seizures,painful respiration,dizziness,neurologist consultation,ear feels clogged,status epilepticus,sores mouth,bnp increased,migraine,...,hallucination,pain throat,sleep apnea,bronchitis,disorganized speech,whipworm infection,blisters with epidermal loss due to burn (second degree) of lower limb(s),confusion,numb mouth,weight loss
0,944.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1199.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,923.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,908.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,247.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Reported product problems [List of strings]

- Creating a copy of the dataframe where the missing values of the chosen column are dropped since None type data cannot be iterated

In [23]:
df_product_problems_filtered = df.dropna(subset='reported_product_problems')
len(df_product_problems_filtered)

412

In [24]:
product_problems = set()

for i in df_product_problems_filtered['reported_product_problems']:
    for j in i:
        product_problems.add(j)

In [25]:
product_problems_df = pd.DataFrame(df_product_problems_filtered['reported_product_problems'].apply(lambda x: encoder(x, product_problems)).to_list())
product_problems_df.insert(loc=0, column='report_id', value=df_product_problems_filtered['report_id'])

In [26]:
product_problems_df

Unnamed: 0,report_id,appearance or look issue,caught on fire when it wasn’t supposed to,overheated,"appearance, look, smell, or taste issue",product failed or did not work correctly,smell issue,"exploded, caught on fire when it wasn't supposed to","exploded, caught on fire, or burned abnormally","product failed or did not work correctly (not involving overheating, fire, explosion or abnormal burning)",...,"damaged, broken, or defective part",wrong product in package,foreign material (something in the product that does not belong),hard to open,leaked,taste issue,label issue,hard to use,caught on fire when it wasn't supposed to,hard to open or to use
0,,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,1,1,1,0,0
4,247.0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,751.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
408,,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,1,0,0,0
409,,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
410,,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


## Tobacco products [List of strings]

- Creating a copy of the dataframe where the missing values of the chosen column are dropped since None type data cannot be iterated

In [27]:
df_tobacco_products_filtered = df.dropna(subset='tobacco_products')

In [28]:
tobacco_products = set()

for i in df_tobacco_products_filtered['tobacco_products']:
    for j in i:
        tobacco_products.add(j)

In [29]:
tobacco_products_df = pd.DataFrame(df_tobacco_products_filtered['tobacco_products'].apply(lambda x: encoder(x, product_problems)).to_list())
tobacco_products_df.insert(loc=0, column='report_id', value=df_tobacco_products_filtered['report_id'])

In [30]:
tobacco_products_df

Unnamed: 0,report_id,appearance or look issue,caught on fire when it wasn’t supposed to,overheated,"appearance, look, smell, or taste issue",product failed or did not work correctly,smell issue,"exploded, caught on fire when it wasn't supposed to","exploded, caught on fire, or burned abnormally","product failed or did not work correctly (not involving overheating, fire, explosion or abnormal burning)",...,"damaged, broken, or defective part",wrong product in package,foreign material (something in the product that does not belong),hard to open,leaked,taste issue,label issue,hard to use,caught on fire when it wasn't supposed to,hard to open or to use
0,944,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1199,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,923,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,908,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,247,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2076566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,321,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,2076369,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,322,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
trans_df = df.drop(columns=['nonuser_affected','reported_health_problems','reported_product_problems','tobacco_products'], axis=1)
trans_df

Unnamed: 0,date_submitted,number_tobacco_products,report_id,number_health_problems,number_product_problems
0,08/09/2019,1,944,9,0
1,08/09/2019,1,1199,2,0
2,08/08/2019,1,923,1,0
3,07/25/2019,1,908,11,0
4,01/22/2017,1,247,1,1
...,...,...,...,...,...
995,11/06/2019,2,2076566,10,0
996,04/20/2017,1,321,0,2
997,11/02/2019,1,2076369,1,0
998,04/25/2017,1,322,0,2


# Cleaning

**Duplicates**

In [32]:
trans_df.duplicated().sum()

0

**Missing Data**

In [33]:
trans_df.isna().sum()

date_submitted             0
number_tobacco_products    0
report_id                  0
number_health_problems     0
number_product_problems    0
dtype: int64

In [34]:
nonuser_affected_df.isna().sum()

report_id    86
unknown       0
neither       0
yes           0
no            0
dtype: int64

In [35]:
health_problems_df.isna().sum()

report_id                                                                    147
complex partial seizures                                                       0
painful respiration                                                            0
dizziness                                                                      0
neurologist consultation                                                       0
                                                                            ... 
whipworm infection                                                             0
blisters with epidermal loss due to burn (second degree) of lower limb(s)      0
confusion                                                                      0
numb mouth                                                                     0
weight loss                                                                    0
Length: 833, dtype: int64

In [36]:
product_problems_df.isna().sum()

report_id                                                                                                    245
appearance or look issue                                                                                       0
caught on fire when it wasn’t supposed to                                                                      0
overheated                                                                                                     0
appearance, look, smell, or taste issue                                                                        0
product failed or did not work correctly                                                                       0
smell issue                                                                                                    0
exploded, caught on fire when it wasn't supposed to                                                            0
exploded, caught on fire, or burned abnormally                                                  

In [37]:
tobacco_products_df.isna().sum()

report_id                                                                                                    0
appearance or look issue                                                                                     0
caught on fire when it wasn’t supposed to                                                                    0
overheated                                                                                                   0
appearance, look, smell, or taste issue                                                                      0
product failed or did not work correctly                                                                     0
smell issue                                                                                                  0
exploded, caught on fire when it wasn't supposed to                                                          0
exploded, caught on fire, or burned abnormally                                                               0
p

In [38]:
trans_df.describe()

Unnamed: 0,number_tobacco_products,report_id,number_health_problems,number_product_problems
count,1000.0,1000.0,1000.0,1000.0
mean,1.043,956248.2,1.882,0.899
std,0.281479,1050673.0,2.54923,1.713399
min,1.0,225.0,0.0,0.0
25%,1.0,780.75,1.0,0.0
50%,1.0,1185.5,1.0,0.0
75%,1.0,2089348.0,2.0,1.0
max,5.0,2179397.0,40.0,22.0


In [39]:
health_problems_df.describe()

Unnamed: 0,report_id,complex partial seizures,painful respiration,dizziness,neurologist consultation,ear feels clogged,status epilepticus,sores mouth,bnp increased,migraine,...,hallucination,pain throat,sleep apnea,bronchitis,disorganized speech,whipworm infection,blisters with epidermal loss due to burn (second degree) of lower limb(s),confusion,numb mouth,weight loss
count,672.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0,...,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0
mean,904125.0,0.002442,0.002442,0.013431,0.001221,0.001221,0.001221,0.001221,0.001221,0.006105,...,0.006105,0.001221,0.001221,0.006105,0.001221,0.001221,0.001221,0.006105,0.001221,0.007326
std,1043980.0,0.049386,0.049386,0.115182,0.034943,0.034943,0.034943,0.034943,0.034943,0.077943,...,0.077943,0.034943,0.034943,0.077943,0.034943,0.034943,0.034943,0.077943,0.034943,0.08533
min,240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,849.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1172.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2084573.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2179141.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Analysis

- Most of the missing values in the transformed data are in report_id column. This means that report_id column should not be the primary column of each dataset.