<h1 style="text-align: center;">DATA CLEANING FOR CENSUS04</h1>

<h2 style="text-align: center;">Import necessary libraries</h2>

In [None]:
pip install ydata-profiling --user

In [None]:
pip install seaborn

In [None]:
import csv
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

<h2 style="text-align: center;">Read in the CSV file</h2>

In [None]:
# Reading census04.csv into a pandas dataframe and displaying it
df = pd.read_csv('census04.csv')
df

In [None]:
# prints the dataframe's information
df.info()

In [None]:
# prints the amount of null values in each column of the data frame
df.isnull().sum()

<h4>Observations made:</h4>

+ Age is a string (object) instead of an integer (int64)
+ There are null values for Marital Status and Religion

<h2 style = "text-align: center;">Clean the Age column</h2>

In [None]:
# printing the data series for age
df['Age']

In [None]:
# printing the unique values of age
df['Age'].unique()

<h4>Observations made about Age:</h4>

+ Age dtype is object (string) instead of int64 (integer)
+ Some ages are float values
+ There is an empty value

In [None]:
# printing the row(s) with the empty value for age
df[df['Age'] == ' ']

<h4>Did a comparison of people living in the same house as the person with the empty age value</h4>

In [None]:
# Dataframe of people living in the same house as the person with the empty age value
df[(df['Street'] == 'Edwards Mall') & (df['House Number'] == 21)]

<h4>Did a comparison of people in the same demographic group as the person with the empty age value</h4>

In [None]:
# Dataframe for women who are christian, divorced and the head of their houses
df_age = df[(df['Relationship to Head of House'] == 'Head') & (df['Gender'] == 'Female') & (df['Marital Status'] == 'Divorced') & (df['Religion'] == 'Christian')]
df_age

In [None]:
# finding the mode of the ages of women who are christian, divorced and the head of their houses
df_age['Age'].mode()

In [None]:
# printing the dataframe for women that are 70 years old
df[(df['Age'] == '70')]

<h4>Observations made about Age:</h4>

+ The mode of the ages of women in the same demographic group is 70
+ As she is not listed as retired, her real age is probably less than 70

<h4>Finding the value count of ages of people in the same demographic group as the person with the empty age value </h4>

In [None]:
# finding the value count of the ages of women who are christian, divorced and the head of their houses
df_age['Age'].value_counts()

<h4>Observations made about Age:</h4>

+ The second most frequently occurring age in this demographic group is 64 years
+ Her age will be closer to 64 than 70 as she is not retired yet


In [None]:
# printing the dataframe for women that are 64 years old
df[(df['Age'] == '64')]

<h4>I did the following below</h4>

+  Cast age from object to int
+  Filled in 64 years as the empty value of age
+  Printed the new unique values of age

In [None]:
# Casting Age from object(str) to int in a single pass.
# errors='coerce' turns every value that cannot be parsed as a number (such as
# the blank ' ' entry) into NaN instead of raising, so no try/except is needed.
# The NaN is then filled with 64 (the age chosen in the analysis above) and the
# column is cast to int64, which drops the fractional part of any float age.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce').fillna(64).astype('int64')

# printing the unique values for age
df['Age'].unique()

<h2 style = "text-align: center;">Clean the Religion column</h2>

In [None]:
# printing the columns that have null values of Religion
df[df['Religion'].isna()]

In [None]:
# printing the unique values of Religion
print(df['Religion'].unique())

In [None]:
# printing the row(s) with the empty value for religion
df[df['Religion'] == ' ']

<h4>Did a comparison of people in the same demographic group as the person with the empty religion value</h4>

In [None]:
# Dataframe of 20 year old people that are single and lodgers
# NOTE(review): this group was described as "males", but the filter below does
# not restrict on Gender - confirm whether (df['Gender'] == 'Male') is missing.
df_religion = df[(df['Age'] == 20) & (df['Relationship to Head of House'] == 'Lodger') &  (df['Marital Status'] == 'Single')]
df_religion

In [None]:
# finding the mode of the religion of 20 year old males that are single and lodgers
df_religion['Religion'].mode()

<h4>Observation made</h4>

+  Most people in this demographic group are Methodist

<h4>Filled in Methodist for the empty value of religion</h4>

In [None]:
# replacing the empty religion value with the mode religion value gotten above
df['Religion'] = df['Religion'].replace(' ', 'Methodist')

<h4>Observation made about religion</h4>

 + It seems most people with null values of religion are minors (below 18 years)

<h4>Printed the ages of the people with null values of religion</h4>

In [None]:
# printing the ages of the people with null values of religion
df_age_of_missing_religion = df[df['Religion'].isna()]
df_age_of_missing_religion['Age'].unique()

<h4>Observation made:</h4>  

+ The people with null values of religion are aged 32 and below

<h4>Printed the dataframe of people in town aged 32 and below</h4>

In [None]:
# Dataframe of everyone in town aged 32 and below.
# It has to be taken from the full dataframe: filtering the null-religion
# subset (df_age_of_missing_religion) can only ever return rows whose Religion
# is null, which makes any comparison with the whole town meaningless.
# The observations written below this section should be re-checked against
# the output of this cell.
df_below_32 = df[df['Age'] <= 32]
df_below_32

In [None]:
# Unique value(s) for the religion of all people in the town 32 & below
df_below_32['Religion'].unique()

In [None]:
# printing the data series for the religion of people in the town 32 & below
df_below_32['Religion']

<h4>Observation made:</h4>

+ All the people in this town age 32 and below have null values of religion
+ The length of the data series for the religion of people in town below 32 is the same length with the dataframe of people with null values of religion

<h4>The following was done below:</h4>

+ Changed null values of religion according to the religion of the head of house
+ Changed five religions (Private, Jedi, Undecided, Sith, and Agnostic) to none as they aren't recognised religions
+ Printed the new unique values of religion

In [None]:
# Changing null values of religion according to the religion of the head of house.
# One row per household is built holding its head's religion, and every person
# looks that value up through a merge on (Street, House Number). This avoids
# re-filtering the whole dataframe for each missing value and does not rely on
# the row labels being exactly 0..n-1.
household_keys = ['Street', 'House Number']
head_religion = (
    df.loc[df['Relationship to Head of House'] == 'Head', household_keys + ['Religion']]
    # if a household lists more than one head, the last one wins (as in the loop version)
    .drop_duplicates(subset=household_keys, keep='last')
    .rename(columns={'Religion': 'Head Religion'})
)
inherited_religion = df[household_keys].merge(head_religion, on=household_keys, how='left')['Head Religion']

# A left merge against unique keys keeps df's row order and length, so the
# looked-up values are re-labelled with df's index and used only where
# Religion is null.
df['Religion'] = df['Religion'].fillna(pd.Series(inherited_religion.to_numpy(), index=df.index))

In [None]:
# Private, Jedi, Undecided, Sith and Agnostic are all recorded as 'None';
# one replace call handles the whole list
unrecognised_religions = ['Private', 'Jedi', 'Undecided', 'Sith', 'Agnostic']
df['Religion'] = df['Religion'].replace(unrecognised_religions, 'None')

# printing the unique values of religion
df['Religion'].unique()

<h4>Observation made:</h4>

+ There are still some people with null values of religion

<h4>The following was done below:</h4>

+ Printed the dataframe of the  people that still have null values of religion
+ Found the mode of their relationship to head of house

In [None]:
# printing the dataframe of the  people that still have null values of religion
df_missing_religion = df[df['Religion'].isna()]
df_missing_religion

In [None]:
# printing the mode of the relationship to head of house of people that still have missing religions
df_missing_religion['Relationship to Head of House'].mode()

<h4>Observation made:</h4>

+ Most people that still have null values of religion are the head of their houses

<h4>The following was done below:</h4>

+ Replaced the remaining null values of religion to none as it will be difficult to find their actual religions with the data given
+ Printed the new unique values of religion

In [None]:
# replacing all null values of religion to none
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that selection can be a temporary copy (always so under
# pandas Copy-on-Write), in which case df itself would be left unchanged.
df['Religion'] = df['Religion'].fillna('None')

# printing the unique values of religion
df['Religion'].unique()

<h2 style = "text-align: center;">Clean the marital status column</h2>

In [None]:
# printing the columns that have null values of Marital Status
df[df['Marital Status'].isna()]

In [None]:
# printing the unique values of Marital Status
df['Marital Status'].unique()

In [None]:
# printing the ages of the people with null values of Marital Status
df_marital_status = df[df['Marital Status'].isna()]
df_marital_status['Age'].unique()

<h4>Observations made:</h4>

+ Only minors (below 18 years old) have null values for Marital Status
+ Minors are supposed to have 'NA' value of Marital Status

<h4>The following was done:</h4>

+ Changed all null values of marital status to 'NA'
+ Printed the new unique values of marital status

In [None]:
# Replacing null values with NA
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that selection can be a temporary copy (always so under
# pandas Copy-on-Write), in which case df itself would be left unchanged.
df['Marital Status'] = df['Marital Status'].fillna('NA')

# printing the unique values of Marital status
df['Marital Status'].unique()

<h2 style = "text-align: center;">Clean the first name column</h2>

In [None]:
# printing the row(s) with empty values of First Name
df[df['First Name'] == ' ']

In [None]:
df['First Name'] = df['First Name'].replace(' ', 'Unknown')

<h2 style = "text-align: center;">Clean the surname column</h2>

In [None]:
# printing the row(s) with empty values of Surname
df[df['Surname'] == ' ']

<h4>Did a comparison of people living in the same household to find Surname</h4>

In [None]:
# printing the dataframe of people living in no 43 Windy Dale
df[(df['Street'] == 'Windy Dale') & (df['House Number'] == 43)]

<h4>Observation made:</h4>

+ The head of house surname is Preston
+ Her relationship to the head of house is daughter

<h4>I did the following:</h4>

+ Changed her surname from an empty value to Preston

In [None]:
# Surname was changed from an empty value to Preston
df.at[315, 'Surname'] = 'Preston'

<h4>Did a comparison of people living in the same household to find Surname</h4>

In [None]:
# printing the dataframe of people living in no 4 Fletcher Spur
df[(df['Street'] == 'Fletcher Spur') & (df['House Number'] == 4)]

<h4>Observation made:</h4>

+ She is married
+ Her husband's surname is Moore

<h4>I did the following:</h4>

+ Changed her surname from an empty value to Moore (Her husband's surname)

In [None]:
# Surname was changed from an empty value to Moore
df.at[3165, 'Surname'] = 'Moore'

<h4>Did a comparison of people living in the same household to find Surname</h4>

In [None]:
# printing the dataframe of people living in no 6 Hazel Ford
df[(df['Street'] == 'Hazel Ford') & (df['House Number'] == 6)]

<h4>Observation made:</h4>

+ Her relationship to the head of house is daughter
+ All the other children in this house have Brown as their surname

<h4>I did the following:</h4>

+ Changed her surname from an empty value to Brown

In [None]:
# Surname was changed from an empty value to Brown
df.at[7128, 'Surname'] = 'Brown'

<h2 style = "text-align:center;">Clean the relationship to the head of house column</h2>

In [None]:
# printing the unique values of Relationship to Head of House
df['Relationship to Head of House'].unique()

In [None]:
# printing the row(s) with empty values of Relationship to Head of House
df[df['Relationship to Head of House'] == ' ']

<h4>Did a comparison of people living in the same household to find Relationship to Head of House</h4>

In [None]:
# printing the dataframe of people living in 52 Vulture Parkway
df[(df['House Number'] == 52) & (df['Street'] == 'Vulture Parkway')]

<h4>Observation made:</h4>

+ She has the same surname as the head of house and they are both female
+ It's safe to assume she might be the head of house's daughter

<h4>I did the following:</h4>

+ Changed her relationship to head of house from an empty value to daughter

In [None]:
# Relationship to Head of House was changed from an empty value to daughter
df.at[1967, 'Relationship to Head of House'] = 'Daughter'

<h4>Did a comparison of people living in the same household to find Relationship to Head of House</h4>

In [None]:
# printing the dataframe of people living in 54 Hazel Ford
df[(df['House Number'] == 54) & (df['Street'] == 'Hazel Ford')]

<h4>Observation made:</h4>

+ He has the same surname as the head of house
+ He is a minor and seems to have other siblings with Daly as their surname

<h4>I did the following:</h4>

+ Changed his relationship to head of house from an empty value to son

In [None]:
# Relationship to Head of House was changed from an empty value to son
df.at[7441, 'Relationship to Head of House'] = 'Son'

<h4>Dataframe of minor head of houses</h4>

In [None]:
# printing the dataframe of minors that are the head of their house(s)
df_catch_minor_head = df[(df['Relationship to Head of House'] == 'Head') & (df['Age'] < 18)]
df_catch_minor_head

<h4>Did a comparison of people living in the same household</h4>

In [None]:
# printing the dataframe of people living in 28 Windy Dale
df[(df['House Number'] == 28) & (df['Street'] == 'Windy Dale')]

<h4>Did a comparison of people living in the same household</h4>

In [None]:
# printing the dataframe of people living in 1 Ochardnip Road
df[(df['House Number'] == 1) & (df['Street'] == 'Orchardnip Road')]

<h4>Changed their ages to 18 to avoid unnecessarily complicated analysis, since it won't significantly change the results</h4>

In [None]:
# Minor heads of house ages was changed to 18
df.at[264, 'Age'] = 18

df.at[7987, 'Age'] = 18

<h2 style = "text-align: center;">Clean the gender column</h2>

In [None]:
# printing the unique values of gender
df['Gender'].unique()

In [None]:
# printing the row(s) with empty values of Gender
df[df['Gender'] == ' ']

<h4>Did a comparison of people living in the same household</h4>

In [None]:
# printing the dataframe of people living in 26 Murray Coves
df[(df['House Number'] == 26) & (df['Street'] == 'Murray Coves')]

<h4>Observation made:</h4>

+ Craig is the partner to the head of house
+ Craig is a very male name
+ It is safe to assume that Craig might be male

<h4>I did the following:</h4>

+ Changed Craig's gender from an empty value to Male
+ Printed the new unique values of gender

In [None]:
# Gender was changed from an empty value to Male
df['Gender'] = df['Gender'].replace(' ', 'Male')

# printing the unique values of gender
df['Gender'].unique()

<h2 style = "text-align: center;">Clean the occupation column</h2>

In [None]:
# printing the row(s) with empty values of Occupation
df[df['Occupation'] == ' ']

<h4>Observation made:</h4>

+ The two people with an empty value of Occupation are 6 & 15

<h4>Did a comparison of people in the same demographic group to find occupation</h4>

In [None]:
# printing the dataframe of people age 6 - 15 that have an occupation recorded
# A bare `& df['Occupation']` only coerces the strings to True and filters
# nothing, so the blank ' ' entries are excluded explicitly instead.
df_occupation = df[df['Age'].between(6, 15) & (df['Occupation'] != ' ')]
df_occupation

In [None]:
# printing the mode of the occupation of people age 6 - 15
df_occupation['Occupation'].mode()

In [None]:
# Occupation was changed from an empty value to Student
df['Occupation'] = df['Occupation'].replace(' ', 'Student')

<h2 style = "text-align: center;">Clean the infirmity column</h2>

In [None]:
# printing the row(s) with empty values of Infirmity
df[df['Infirmity'] == ' ']

In [None]:
# printing the value count for Infimity
df['Infirmity'].value_counts()

<h4>Observation made:</h4>  
    
+ Most people in town have an Infirmity value of None    

In [None]:
# Infirmity was changed from an empty value to None
df['Infirmity'] = df['Infirmity'].replace(' ', 'None')

# printing the unique values for Infimity
df['Infirmity'].unique()

<h1 style = "text-align: center;">Data Visualisation</h1>

In [None]:
# Using pandas profiling to create a report on census04.csv
profile = ProfileReport(df, title='Census Report', explorative = True)
profile

<h2 style = "text-align: center;">Age Pyramid</h2>

In [None]:
# Grouping age according to every 5 years starting from 0
def Age_group(x):
    """Return the five-year age band label for age ``x``.

    Bands run from "0-4" up to "95-99"; an age of 100 or more is labelled
    "100 and Above". When no band matches (for example a negative age) the
    function falls through and returns None.
    """
    if x >= 100:
        return "100 and Above"
    # walk the band lower bounds from 95 down to 5
    for lower in range(95, 0, -5):
        if x > lower - 1 and x < lower + 5:
            return f"{lower}-{lower + 4}"
    if x >= 0 and x < 5:
        return "0-4"

# label every person with the age band returned by Age_group
df['Age_Range'] = [Age_group(age) for age in df['Age']]
df['Age_Range']


In [None]:
# sorting male and female according to their age_ranges
male = df['Age_Range'][df.Gender == 'Male'].value_counts().sort_index()
female = df['Age_Range'][df.Gender == 'Female'].value_counts().sort_index()

In [None]:
male

In [None]:
female

In [None]:
df_gender = pd.DataFrame({
    'Male': -1 * male,
    'Female': female
})

In [None]:
df_gender.index.names = ['Age_Range']
df_gender.reset_index(inplace=True)

In [None]:
# Row order for the age pyramid, oldest band first. Every label has to match
# the strings returned by Age_group exactly (including the capital A in
# "100 and Above"): a band whose label is not in `order` is left off the plot.
age_label = ["100 and Above", "95-99", "90-94", "85-89", "80-84", "75-79", "70-74", "65-69", "60-64", "55-59", "50-54",
             "45-49", "40-44", '35-39', "30-34", "25-29", "20-24", "15-19", "10-14", "5-9", "0-4"]

In [None]:
# plotting the age pyramid
# Two horizontal bar plots are drawn one after the other, Male then Female.
# Male counts are stored as negative numbers in df_gender, so their bars extend
# to the left of zero while the Female bars extend to the right.
age_pyramid_plt = sns.barplot(x = 'Male', y = 'Age_Range', data = df_gender, color = ('blue'), order = age_label, label = 'Male')
age_pyramid_plt = sns.barplot(x = 'Female', y = 'Age_Range', data = df_gender, color = ('pink'), order = age_label, label = 'Female')
age_pyramid_plt.legend()
plt.xlabel('Population Count')
plt.title('Age Pyramid')
# relabel the negative tick positions with positive counts so both sides read as head counts
plt.xticks(ticks=[-400, -300, -200, -100, 0, 100, 200, 300, 400], labels = [400, 300, 200, 100, 0, 100, 200, 300, 400])
plt.show()

<h2 style = "text-align: center;">Unemployment Trends</h2>

In [None]:
unemployment_plt = sns.histplot(data = df[df['Occupation'] == 'Unemployed'], x ="Age", bins = 10, hue = "Gender", multiple = 'stack' )
unemployment_plt.set_xlabel("unemployed age")
plt.ylabel("Population Count")
plt.title("Unemployed Population Age Histplot")
plt.show()

In [None]:
box_unemployment_plt = sns.boxplot(data = df[df['Occupation'] == 'Unemployed'], x = 'Age')
plt.ylabel('Unemployed Population')
plt.title('Unemployed Distribution by Age')
plt.xticks(rotation = 90)
plt.show()

<h2 style = "text-align: center;">Religious Affiliations</h2>

In [None]:
religion_plt = sns.countplot(data = df, x = "Religion")
plt.ylabel('Population Count')
plt.title('Religion Countplot')
plt.xticks(rotation = 90)
plt.show()

In [None]:
box_religion_plt = sns.boxplot(data = df, x = 'Religion', y = 'Age')
plt.ylabel('Age')
plt.title('Religion Distribution by Age')
plt.xticks(rotation = 90)
plt.show()

In [None]:
box_marital_status = sns.boxplot(data = df, x = 'Marital Status', y = 'Age')
plt.ylabel('Age')
plt.title('Marital Status Distribution by Age')
plt.xticks(rotation = 90)
plt.show()

In [None]:
sns.histplot(data = df[(df['Marital Status'] == 'Married') | (df['Marital Status'] == 'Divorced')], x = 'Age', hue = 'Marital Status', binwidth = 10, multiple = 'stack')
plt.ylabel('Population')
plt.title('Married Vs Divorced Distribution by Age')

In [None]:
employment = sns.histplot(data = df[df['Occupation'] == 'Unemployed'], x ="Age", bins = 10, hue = "Gender", multiple = 'stack' )
employment.set_xlabel("unemployed age count")
plt.show()

<h2 style = "text-align: center;">Infirmity Distribution</h2>

In [None]:
box_infirmity = sns.boxplot(data = df, x = 'Infirmity', y = 'Age')
plt.ylabel('Age')
plt.title('Infirmity Distribution by Age')
plt.xticks(rotation = 90)
plt.show()

<h2 style = "text-align: center;">Further Analysis</h2>

<h2 style = "text-align: center;">Occupancy Level</h2>

<h4>I did the following:</h4>

+ Found the amount of occupants in each house
+ Found the median of occupants in each house
+ Calculated the mean amount of occupants in each street
+ Then I calculated the occupancy difference
+ A box plot was plotted to show this analysis

In [None]:
# Finding the amount of occupants in each house
df_occupancy = df.groupby(['Street', 'House Number']).size().reset_index(name = 'Occupancy Level')
df_occupancy

In [None]:
# Finding the median of the occupants in each house
df_occupancy['Occupancy Level'].median()

In [None]:
# Finding the mean amount of occupants in each street
df_streets = df_occupancy.groupby(['Street']).agg({'Occupancy Level' : 'mean'}).reset_index()
df_streets

In [None]:
# finding the mean occupancy of houses in each street
# transform('mean') returns one value per house - the mean occupancy of that
# house's street - already aligned with df_occupancy's own rows, so every
# house gets the mean of the street it actually belongs to.
df_occupancy['Mean Occupancy'] = df_occupancy.groupby('Street')['Occupancy Level'].transform('mean')
df_occupancy

In [None]:
# finding the occupancy difference of houses on each street
df_occupancy['Occupancy Difference'] = df_occupancy['Occupancy Level'] - df_occupancy['Mean Occupancy']
df_occupancy

In [None]:
occupancy_difference_plt = sns.boxplot(data = df_occupancy, x = 'Occupancy Difference')

plt.title('Occupancy Difference Of Houses on Each Street')
plt.xticks(rotation = 90)
plt.show()

<h2 style = "text-align: center;">Divorce and Marriage Rates</h2>

<h4>Find the amount of people of marriageable age in town</h4>

In [None]:
# finding the amount of people of marriageable age in town
# Minors are treated as the under-18s throughout this notebook, so adults are
# everyone aged 18 and over and the comparison must include 18 itself.
df_marriagable_age = df[(df['Age'] >= 18) & (df['Marital Status'] != 'NA')]
marriagable_age = len(df_marriagable_age)
marriagable_age

In [None]:
# finding the amount of men of marriageable age in town
marriagable_men = len(df_marriagable_age[df_marriagable_age['Gender'] == 'Male']['Gender'])
marriagable_men

In [None]:
# finding the amount of women of marriageable age in town
marriagable_women = len(df_marriagable_age[df_marriagable_age['Gender'] == 'Female']['Gender'])
marriagable_women

<h4>Find the amount of married people in town</h4>

In [None]:
# finding the amount of married people in town
df_married = df[df['Marital Status'] == 'Married']
married_people = len(df[df['Marital Status'] == 'Married'])
married_people

In [None]:
# finding the amount of married men in town
married_men = len(df_married[df_married['Gender'] == 'Male']['Gender'])
married_men

In [None]:
# finding the amount of married women in town
married_women = len(df_married[df_married['Gender'] == 'Female']['Gender'])
married_women

In [None]:
#countplot of married people by gender
married_by_gender_plt = sns.countplot(data = df_married, x = 'Gender')
plt.show()

In [None]:
# boxplot of married people by gender
married_by_gender_plt = sns.boxplot(data = df_married, x = 'Gender', y = 'Age')
plt.show()

<h4>Find the amount of divorced people in town</h4>

In [None]:
# finding the amount of divorced people in town
df_divorced = df[df['Marital Status'] == 'Divorced']
divorced_people = len(df[df['Marital Status'] == 'Divorced'])
divorced_people

In [None]:
# finding the amount of divorced men in town
divorced_men = len(df_divorced[df_divorced['Gender'] == 'Male']['Gender'])
divorced_men

In [None]:
# finding the amount of divorced women in town
divorced_women = len(df_divorced[df_divorced['Gender'] == 'Female']['Gender'])
divorced_women

In [None]:
# countplot of divorced people by gender
divorced_by_gender_plt = sns.countplot(data = df_divorced, x = 'Gender')
plt.ylabel('Population Count')
plt.title('Divorced Distribution by Gender')
plt.show()
plt.show()

In [None]:
# boxplot of divorced people by gender
divorced_by_gender_plt = sns.boxplot(data = df_divorced, x = 'Gender', y = 'Age')
plt.show()

In [None]:
sns.countplot(data = df[(df['Marital Status'] == 'Married') | (df['Marital Status'] == 'Divorced')], x = 'Marital Status')
plt.ylabel('Population Count')
plt.title('Married Vs Divorced Distribution')
plt.show()

<h4>Calculating the marriage and divorce rate per 100000 people in town</h4>

In [None]:
# finding the total amount of marriages in town (including divorced people as they were once married)
# Operator precedence: this evaluates as divorced_women + (married_people / 2).
# NOTE(review): presumably each married couple is counted once (hence / 2) and
# each divorced woman stands for one past marriage - confirm this is the intended formula.
all_marriage = round((divorced_women + married_people / 2), 0)
all_marriage

In [None]:
# finding the marriage rate per 100000 in town
marriage_rate = round((all_marriage / marriagable_age) * 100000, 0)
marriage_rate

In [None]:
# finding the divorce rate per 100000 in town
divorce_rate = round((divorced_women / all_marriage) * 100000, 0)
divorce_rate

<h2 style = "text-align: center;">Unemployment Rate</h2>

<h4>Calculating the unemployment rate of people in town</h4>

In [None]:
# finding the total amount of unemployed people
df_unemployed_people = df[df['Occupation'] == 'Unemployed']
df_unemployed_people

In [None]:
# finding the total amount of occupation (people with occupation) in town
total_occupation = len(df['Occupation'])
total_occupation

In [None]:
# Dataframe of the non_labour force (i.e, minors, students, retired people)
# Unemployed people are available for work and therefore belong to the labour
# force, so they are not listed here. The previous 'Unemployment' entry did not
# match the 'Unemployed' value used elsewhere in this notebook.
df_non_labour_force = df[df['Occupation'].isin(['Retired', 'Student', 'Child'])]
df_non_labour_force

In [None]:
# finding the total sum of the non-labour force
non_labour_force = len(df_non_labour_force)
non_labour_force

In [None]:
# finding the total sum of the labour force
labour_force = total_occupation - non_labour_force
labour_force

In [None]:
# finding the unemployment rate per 100000 people in town
num_unemployed_people = len(df_unemployed_people)

unemployment_rate = round((num_unemployed_people / labour_force) * 100000, 0)

unemployment_rate

In [None]:
# finding the percentage of unemploymed people in town
percentage_unemployement = round((num_unemployed_people / labour_force) * 100, 0)
percentage_unemployement

<h2 style = "text-align: center;">Birth Rate</h2>

<h4>Calculating the birth rate of people in town</h4>

In [None]:
# finding the births per 100,000 over the past year
number_of_births = len(df[df['Age'] == 0])
women_of_birthing_age = len(df[df['Age'].between(25,29) & (df['Gender'] == 'Female')])

birth_rate = round((number_of_births / women_of_birthing_age) * 100000, 0)
birth_rate

In [None]:
# finding the percentage of births over the past year
percentage_birth_rate = round((number_of_births / women_of_birthing_age) * 100, 0)
percentage_birth_rate

In [None]:
total_population = len(df)

In [None]:
# finding the births per 1000 (crude birth) over the past year
# crude birth rate = births / total population * 1000
crude_birth_rate = (number_of_births / total_population) * 1000
crude_birth_rate

In [None]:
# finding the births per 100,000 in the past 5 years
number_of_todlers = len(df[df['Age'] == 4])
women_with_todlers = len(df[df['Age'].between(30,34) & (df['Gender'] == 'Female')])

continous_birth_rate = round((number_of_todlers / women_with_todlers) * 100000, 0)
continous_birth_rate

In [None]:
# finding the births per 1000 people (crude birth rate) for the children who are now 4 years old
# crude birth rate = births / total population * 1000
past_crude_birth_rate = (number_of_todlers / total_population) * 1000
past_crude_birth_rate

In [None]:
# finding the percentage of births in the past 5 years
percentage_continous_birth_rate = round((number_of_todlers / women_with_todlers) * 100, 0)
percentage_continous_birth_rate

In [None]:
birth_rate_decrease =  (birth_rate - continous_birth_rate)
birth_rate_decrease

In [None]:
percentage_birth_rate_decrease =  (percentage_birth_rate - percentage_continous_birth_rate)
percentage_birth_rate_decrease

<h2 style = "text-align: center;">Immigration</h2>

<h4>Calculating the immigration rate of people in town</h4>

In [None]:
# printing the unique values of relationship to head of house
df['Relationship to Head of House'].unique()

In [None]:
# dataframe of immigrants in town
# The relationship value is 'Lodger' (singular), as used when the Religion
# column was cleaned; 'Lodgers' would not match it.
immigrant_relationships = ['Lodger', 'Visitor', 'None']
df_immigration = df[df['Relationship to Head of House'].isin(immigrant_relationships)]
df_immigration

In [None]:
# calculating the rate of immigrants in town
# Divorced and widowed people are excluded. Both exclusions have to hold at
# once, i.e. "not in {Divorced, Widowed}"; joining two != checks with `|` is
# True for every row and filters nothing.
immigrants = len(df_immigration[~df_immigration['Marital Status'].isin(['Divorced', 'Widowed'])])
population  = len(df)

# share of immigrants in the population, as a percentage
rate_of_immigration = (immigrants / population) * 100
rate_of_immigration

<h2 style = "text-align: center;">Emigration</h2>

<h4>Calculating the emigration rate of people in town</h4>

In [None]:
df_divorced_population = df[df['Marital Status'] == 'Divorced']

df_divorced_men = df_divorced_population[df_divorced_population['Gender'] == 'Male']
num_divorced_men = len(df_divorced_men)

df_divorced_women = df_divorced_population[df_divorced_population['Gender'] == 'Female']
num_divorced_women = len(df_divorced_women)

In [None]:
divorce_by_gender_plt = sns.countplot(data = df_divorced_population, x = 'Gender')
plt.title('Divorce Distribution by Gender')
plt.show()

In [None]:
emigrated_divorced_people = divorced_women - divorced_men
emigrated_divorced_people

In [None]:
total_population = len(df)
rate_of_emmigration = round((emigrated_divorced_people / total_population) * 100000, 0)
rate_of_emmigration

In [None]:
# population growth = births + immigration - deaths - emigration
# The emigration variable is spelled rate_of_emmigration where it is computed,
# and people leaving reduce the population, so it is subtracted.
# NOTE(review): death_rate is not defined anywhere in the visible notebook and
# has to be computed before this cell can run. rate_of_immigration is a
# percentage while the other rates are per 100000 - confirm the units agree.
growth_rate = (birth_rate + rate_of_immigration - death_rate - rate_of_emmigration)
growth_rate

<h2 style = "text-align: center;">Commuters</h2>

<h4>Calculating the rate of commuters in town</h4>

In [None]:
df[df['Occupation'] == 'Nurse']

In [None]:
# dataframe of possible commuters in town
# (the previous expression ended with a dangling `|`, which is a syntax error)
commuter_occupations = ['University Student', 'PhD Student', 'Research']
# relationship values are singular ('Visitor', 'Lodger') elsewhere in this notebook
commuter_relationships = ['Visitor', 'Lodger']
df_commuters = df[df['Occupation'].isin(commuter_occupations) |
                  df['Relationship to Head of House'].isin(commuter_relationships)]
df_commuters

In [None]:
# finding the number of commuters in town
commuters = len(df_commuters)
commuters

In [None]:
# finding the rate of commuters per 100000 in town
rate_of_commuters = round((commuters / total_population) * 100000, 0)
rate_of_commuters

In [None]:
# finding the percentage of commuters in town
percentage_of_commuters = round((commuters / total_population) * 100, 0)
percentage_of_commuters

<h2 style = "text-align: center;">Non Commuters</h2>

<h4>Calculating the rate of non-commuters in town</h4>

In [None]:
# finding the number of non commuters in town
non_commuters = len(df) - commuters
non_commuters

In [None]:
# finding the rate of non commuters per 100000 in town
rate_of_non_commuters = round((non_commuters / total_population) * 100000, 0)
rate_of_non_commuters

In [None]:
# finding the percentage of non commuters in town
percentage_of_non_commuters = round((non_commuters / total_population) * 100, 0)
percentage_of_non_commuters

In [None]:
# prints the dataframe's information
df.info()

<h3 style = "text-align: center;">Saving all data visualisation charts</h3>

In [None]:
# Output path -> Axes returned by the seaborn call that drew the chart
figures_to_save = {
    './age_pyramid.png': age_pyramid_plt,
    './unemployment_histplot.png': unemployment_plt,
    './unemployment_boxplot.png': box_unemployment_plt,
    './religion_countplot.png': religion_plt,
    './religion_boxplot.png': box_religion_plt,
}

for path, axes in figures_to_save.items():
    # seaborn's axes-level functions return a matplotlib Axes, which has no
    # savefig(); the Figure that owns the Axes is what gets written to disk.
    # bbox_inches='tight' keeps rotated tick labels from being cut off.
    axes.get_figure().savefig(path, bbox_inches='tight')
    print(path)