Import Required Libraries

In [3]:
# Basic libraries
import pandas as pd
import numpy as np

# Visualization (for later phases)
import matplotlib.pyplot as plt
import seaborn as sns

# Settings
# Lift pandas' column-display cap (None = no limit) for printed/displayed frames.
pd.options.display.max_columns = None


Load All 5 Datasets

In [4]:
# Load datasets
# All five CSVs are read from the same folder. Keeping that folder in a single
# constant means the notebook can be pointed at another copy of the data by
# editing one line instead of five.
DATA_DIR = r"D:\Unified Mentor Intership\Data Science projects\Tobacco Use and Mortality, 2004-2015\Datasets"

# Each frame keeps its original variable name; the resolved file paths are
# exactly the ones used before the directory was factored out.
admissions = pd.read_csv(DATA_DIR + r"\admissions.csv")
fatalities = pd.read_csv(DATA_DIR + r"\fatalities.csv")
metrics = pd.read_csv(DATA_DIR + r"\metrics.csv")
prescriptions = pd.read_csv(DATA_DIR + r"\prescriptions.csv")
smokers = pd.read_csv(DATA_DIR + r"\smokers.csv")


Initial Inspection

In [5]:
# Quick overview
# The five frames in the order they were loaded; both loops below walk this
# tuple so the printed output keeps that order.
loaded_frames = (admissions, fatalities, metrics, prescriptions, smokers)

# (rows, columns) of every frame
for frame in loaded_frames:
    print(frame.shape)

# Check null values: per-column count of missing entries in every frame
for frame in loaded_frames:
    print(frame.isna().sum())

# Check dtypes of admissions, then list the metrics column labels
print(admissions.dtypes)
print(metrics.columns)


(1386, 7)
(1166, 7)
(31, 9)
(9, 9)
(56, 9)
Year               0
ICD10 Code         0
ICD10 Diagnosis    0
Diagnosis Type     0
Metric             0
Sex                0
Value              0
dtype: int64
Year               0
ICD10 Code         0
ICD10 Diagnosis    0
Diagnosis Type     0
Metric             0
Sex                0
Value              0
dtype: int64
Year                                                     0
Tobacco Price\nIndex                                     0
Retail Prices\nIndex                                     0
Tobacco Price Index Relative to Retail Price Index       0
Real Households' Disposable Income                       0
Affordability of Tobacco Index                           0
Household Expenditure on Tobacco                         0
Household Expenditure Total                              0
Expenditure on Tobacco as a Percentage of Expenditure    0
dtype: int64
Year                                                           0
All Pharmacotherapy Prescrip

Clean Year Columns (e.g., "2014/15" → 2014)

In [6]:
# Function to extract first year
def extract_year(x):
    """Return the leading year of ``x`` as an ``int``.

    A string containing "/" (e.g. "2014/15") is cut at the first slash and the
    part before it is converted. Any other value is handed to ``int()``
    unchanged, so whatever ``int()`` raises for an unconvertible value
    propagates to the caller.
    """
    # Guard clause: nothing to split unless this is a slash-separated string.
    if not isinstance(x, str) or "/" not in x:
        return int(x)

    first_part, _, _ = x.partition("/")
    return int(first_part)

# Apply to datasets
# Rewrite the "Year" column of each of these three frames with the value
# extract_year returns for every entry.
for frame in (admissions, fatalities, prescriptions):
    frame["Year"] = frame["Year"].apply(extract_year)


Convert Value Columns to Numeric

In [7]:
# Convert Value column to numeric
# errors="coerce" makes pandas substitute NaN for any entry it cannot parse
# as a number instead of raising.
for frame in (admissions, fatalities):
    frame["Value"] = pd.to_numeric(frame["Value"], errors="coerce")


Clean Column Names in metrics & prescriptions

In [8]:
# Remove \n and extra spaces
# For both frames: trim leading/trailing whitespace from every column label,
# then turn each embedded newline into a single space.
for frame in (metrics, prescriptions):
    frame.columns = [label.strip().replace("\n", " ") for label in frame.columns]


Final Sanity Check

In [9]:
# View cleaned data samples
# Printed in order: first rows of admissions, the metrics column labels,
# and the distinct values of prescriptions["Year"].
cleaned_previews = (
    admissions.head(),
    metrics.columns,
    prescriptions["Year"].unique(),
)
for preview in cleaned_previews:
    print(preview)


   Year                                         ICD10 Code  \
0  2014                                          All codes   
1  2014  C33-C34 & C00-C14 & C15 & C32 & C53 & C67 & C6...   
2  2014                                            C00-D48   
3  2014                                            J00-J99   
4  2014                                            I00-I99   

                               ICD10 Diagnosis  \
0                               All admissions   
1  All diseases which can be caused by smoking   
2                                  All cancers   
3                     All respiratory diseases   
4                     All circulatory diseases   

                                Diagnosis Type                Metric   Sex  \
0                               All admissions  Number of admissions  Male   
1  All diseases which can be caused by smoking  Number of admissions  Male   
2                                  All cancers  Number of admissions  Male   
3             

Save Cleaned Files

In [10]:
# Save cleaned versions
# Output filename -> frame to write. Files are written to the current working
# directory, without the index column, in the order listed here.
cleaned_outputs = {
    "admissions_cleaned.csv": admissions,
    "fatalities_cleaned.csv": fatalities,
    "metrics_cleaned.csv": metrics,
    "prescriptions_cleaned.csv": prescriptions,
    "smokers_cleaned.csv": smokers,
}
for filename, frame in cleaned_outputs.items():
    frame.to_csv(filename, index=False)
