# PreProcess

This notebook is used to analyze the data and prepare it to be uploaded to the database.

In [None]:
!pip install pandas

In [1]:
# Import used libraries
import pandas as pd

In [7]:
# Load the data
# Raw strings keep the Windows backslashes literal; in a normal string "\p", "\B", "\s",
# "\d" and "\h" are invalid escape sequences that trigger warnings on recent Python versions.
# NOTE(review): this is an absolute, machine-specific location; point DATA_DIR at the folder holding the CSVs.
DATA_DIR = r"C:\projects\BigData2Project\source_1\deaths-of-people-experiencing-homelessness"

deaths_causes = pd.read_csv(rf"{DATA_DIR}\homeless-deaths-by-cause.csv")
deaths_demographics = pd.read_csv(rf"{DATA_DIR}\homeless-deaths-by-demographics.csv")
deaths_monthly = pd.read_csv(rf"{DATA_DIR}\homeless-deaths-by-month.csv")

## Data Exploration

In [8]:
# Preview the first rows of the deaths-by-cause dataset
deaths_causes.head()

Unnamed: 0,_id,Year of death,Cause_of_death,Age_group,Gender,Count
0,1,2023,Cardiovascular Disease,Unknown,Male,1
1,2,2019,Other,20-39,Male,1
2,3,2018,Suicide,20-39,Male,1
3,4,2017,Accident,40-59,Male,2
4,5,2022,Other,40-59,Female,1


In [9]:
# Preview the first rows of the deaths-by-demographics dataset
deaths_demographics.head()

Unnamed: 0,_id,Year of death,Age_group,Gender,Count
0,1,2017,Unknown,Female,1
1,2,2017,20-39,Transgender,1
2,3,2017,60+,Female,6
3,4,2017,40-59,Female,12
4,5,2017,60+,Male,20


In [10]:
# Preview the first rows of the deaths-by-month dataset
deaths_monthly.head()

Unnamed: 0,_id,Year of death,Month of death,Count
0,1,2022,July,19
1,2,2023,February,13
2,3,2023,April,8
3,4,2017,October,6
4,5,2023,March,14


## Missing values

In [11]:
# Count the missing (null) values in each column of the deaths-by-cause dataset
deaths_causes.isnull().sum()

_id               0
Year of death     0
Cause_of_death    0
Age_group         0
Gender            0
Count             0
dtype: int64

In [12]:
# Count the missing (null) values in each column of the deaths-by-demographics dataset
deaths_demographics.isnull().sum()

_id              0
Year of death    0
Age_group        0
Gender           0
Count            0
dtype: int64

In [13]:
# Count the missing (null) values in each column of the deaths-by-month dataset
deaths_monthly.isnull().sum()

_id               0
Year of death     0
Month of death    0
Count             0
dtype: int64

In [14]:
# Check the data type of each column in the deaths-by-cause dataset
deaths_causes.dtypes

_id                int64
Year of death      int64
Cause_of_death    object
Age_group         object
Gender            object
Count              int64
dtype: object

In [15]:
# Check the data type of each column in the deaths-by-demographics dataset
deaths_demographics.dtypes

_id               int64
Year of death     int64
Age_group        object
Gender           object
Count             int64
dtype: object

In [16]:
# Check the data type of each column in the deaths-by-month dataset
deaths_monthly.dtypes

_id                int64
Year of death      int64
Month of death    object
Count              int64
dtype: object

In [17]:
# Check the shape (rows, columns) of the deaths-by-cause dataset
deaths_causes.shape

(267, 6)

In [18]:
# Check the shape (rows, columns) of the deaths-by-demographics dataset
deaths_demographics.shape

(75, 5)

In [19]:
# Check the shape (rows, columns) of the deaths-by-month dataset
deaths_monthly.shape

(84, 4)

In [20]:
# Count the distinct values in each column of the deaths-by-cause dataset
deaths_causes.nunique()

_id               267
Year of death       7
Cause_of_death     12
Age_group           5
Gender              3
Count              23
dtype: int64

In [21]:
# Count the distinct values in each column of the deaths-by-demographics dataset
deaths_demographics.nunique()

_id              75
Year of death     7
Age_group         5
Gender            4
Count            31
dtype: int64

In [22]:
# Count the distinct values in each column of the deaths-by-month dataset
deaths_monthly.nunique()

_id               84
Year of death      7
Month of death    12
Count             21
dtype: int64

In [23]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of the deaths-by-cause dataset
deaths_causes.describe()

Unnamed: 0,_id,Year of death,Count
count,267.0,267.0,267.0
mean,134.0,2020.026217,3.805243
std,77.220464,1.91631,5.949623
min,1.0,2017.0,1.0
25%,67.5,2018.0,1.0
50%,134.0,2020.0,2.0
75%,200.5,2022.0,3.5
max,267.0,2023.0,51.0


In [24]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of the deaths-by-demographics dataset
deaths_demographics.describe()

Unnamed: 0,_id,Year of death,Count
count,75.0,75.0,75.0
mean,38.0,2020.2,13.733333
std,21.794495,1.924241,16.414672
min,1.0,2017.0,1.0
25%,19.5,2019.0,2.0
50%,38.0,2020.0,6.0
75%,56.5,2022.0,19.5
max,75.0,2023.0,70.0


In [25]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of the deaths-by-month dataset
deaths_monthly.describe()

Unnamed: 0,_id,Year of death,Count
count,84.0,84.0,84.0
mean,42.5,2020.0,12.166667
std,24.392622,2.012012,5.060676
min,1.0,2017.0,1.0
25%,21.75,2018.0,8.0
50%,42.5,2020.0,11.0
75%,63.25,2022.0,15.0
max,84.0,2023.0,26.0


In [30]:
# Check how many rows the deaths-by-cause dataset has for each year of death
deaths_causes['Year of death'].value_counts()

Year of death
2021    47
2019    44
2022    39
2020    37
2018    34
2017    34
2023    32
Name: count, dtype: int64

In [31]:
# Check how many rows the deaths-by-demographics dataset has for each year of death
deaths_demographics['Year of death'].value_counts()

Year of death
2021    15
2022    12
2019    11
2020    10
2023    10
2017     9
2018     8
Name: count, dtype: int64

In [32]:
# Check how many rows the deaths-by-month dataset has for each year of death
deaths_monthly['Year of death'].value_counts()

Year of death
2022    12
2023    12
2017    12
2021    12
2018    12
2019    12
2020    12
Name: count, dtype: int64

## Data Reporting
Based on the exploration above, we are going to report the following for each dataset:
- The total number of rows
- The range of years covered by the data
- The number of missing values per column
- The number of unique values per column
- The descriptive statistics
- The number of samples for each year
- The columns

All this information will be written to a file called "DataReport.txt".

In [39]:
def print_report_for_datasource(datasource, out=None):
    """Write a plain-text summary of one dataset to a text stream.

    The summary contains, in order: the row count, the min/max of the
    'Year of death' column, the null count per column, the distinct-value
    count per column, the ``describe()`` table, the row count per year of
    death, and the column index.

    Parameters
    ----------
    datasource : pandas.DataFrame
        Dataset to summarise; it must have a 'Year of death' column.
    out : writable text stream, optional
        Destination for the report (anything with a ``write`` method).
        When omitted, the function falls back to a module-level handle named
        ``file``, which must be open at call time; this preserves the
        behaviour existing one-argument callers rely on.
    """
    if out is None:
        # Backward compatibility: the original implementation wrote to the
        # global `file` opened by the calling `with` block.
        out = file

    years = datasource['Year of death']

    out.write(f"Total of Rows: {datasource.shape[0]}\n")
    out.write(f"Range of years: {years.min()} - {years.max()}\n\n")
    out.write(f"Missing values:\n{datasource.isnull().sum()}\n\n")
    out.write(f"Unique values:\n{datasource.nunique()}\n\n")
    out.write(f"Descriptive statistics:\n{datasource.describe()}\n\n")
    out.write(f"Samples for each year:\n{years.value_counts()}\n")
    out.write(f"Columns:\n{datasource.columns}\n")


# Create the report: one headed section per dataset, written to DataReport.txt
# in the current working directory (the file is overwritten on every run).
# The handle must be named `file`: print_report_for_datasource writes to that global.
# An explicit encoding keeps the output independent of the platform default.
with open("DataReport.txt", "w", encoding="utf-8") as file:
    file.write("# Deaths by causes dataset # \n\n")
    print_report_for_datasource(deaths_causes)
    file.write("\n\n")
    # This section previously had no header, so its statistics could not be
    # told apart from the causes section above it.
    file.write("# Deaths by demographics dataset #\n\n")
    print_report_for_datasource(deaths_demographics)
    file.write("\n\n")
    file.write("# Deaths by monthly dataset #\n")
    print_report_for_datasource(deaths_monthly)
    

## Data Preparation

In [None]:
# 