# Goal
## Build a Machine Learning Model that can predict the homeless population in a given CoC.

### Checking whether all five datasets have exactly the same columns.

In [2]:
import pandas as pd

# Paths of the five yearly PitCount CSV files (2017-2021).
file_paths = ["PitCount_2017.csv", "PitCount_2018.csv", "PitCount_2019.csv", "PitCount_2020.csv", "PitCount_2021.csv"]

# Collect each file's column names as a set. nrows=0 makes pandas parse only
# the header row, so the data rows are not loaded just to inspect the schema.
columns_list = [set(pd.read_csv(file_path, nrows=0).columns) for file_path in file_paths]

# Sets compare equal regardless of order, so this checks column names only,
# not their position in the file.
if all(columns == columns_list[0] for columns in columns_list):
    print("All datasets have the exact same columns.")
else:
    print("Datasets have different columns.")
    # Report which files deviate from the first one, and in which columns,
    # so the mismatch can be fixed without further digging.
    for file_path, columns in zip(file_paths, columns_list):
        mismatched = sorted(columns ^ columns_list[0])
        if mismatched:
            print(f"{file_path}: differs from {file_paths[0]} in {mismatched}")

All datasets have the exact same columns.


#### Each dataset represents a different year. However, they do not have a year column. Adding a year column so that we can merge them into a unified dataset.

In [3]:
import pandas as pd

# Stamp every yearly file with the year encoded in its file name.
# NOTE(review): each CSV is written back to the path it was read from, so the
# source files on disk are modified by this cell.
for file_path in file_paths:
    # "PitCount_2017.csv" -> "2017.csv" -> "2017" -> 2017
    year_token = file_path.rsplit("_", 1)[-1].partition(".")[0]
    year = int(year_token)

    # Load the file and attach a constant "Year" column in one step.
    df = pd.read_csv(file_path).assign(Year=year)

    # Persist without the row index so no extra index column is written.
    df.to_csv(file_path, index=False)

    print(f"Year column added to {file_path} and saved successfully.")


Year column added to PitCount_2017.csv and saved successfully.
Year column added to PitCount_2018.csv and saved successfully.
Year column added to PitCount_2019.csv and saved successfully.
Year column added to PitCount_2020.csv and saved successfully.
Year column added to PitCount_2021.csv and saved successfully.


In [4]:
# Load the 2017 file from disk on its own (presumably to check that the
# "Year" column is present after the previous step).
pc17 = pd.read_csv("PitCount_2017.csv")

# Preview the first five rows.
pc17.head()

Unnamed: 0,CoC Number,CoC Name,Count Types,Overall Homeless,Overall Homeless - Under 18,Overall Homeless - Age 18 to 24,Overall Homeless - Over 24,Overall Homeless - Female,Overall Homeless - Male,Overall Homeless - Transgender,...,Sheltered ES Homeless Parenting Youth Age 18-24,Sheltered TH Homeless Parenting Youth Age 18-24,Sheltered Total Homeless Parenting Youth Age 18-24,Unsheltered Homeless Parenting Youth Age 18-24,Overall Homeless Children of Parenting Youth,Sheltered ES Homeless Children of Parenting Youth,Sheltered TH Homeless Children of Parenting Youth,Sheltered Total Homeless Children of Parenting Youth,Unsheltered Homeless Children of Parenting Youth,Year
0,AK-500,Anchorage CoC,Sheltered and Unsheltered Count,1128,170,133,825,445,678,4,...,3,9,12,0.0,21,6,15,21,0.0,2017
1,AK-501,Alaska Balance of State CoC,Sheltered and Unsheltered Count,717,135,53,529,320,397,0,...,7,3,10,0.0,18,14,4,18,0.0,2017
2,AL-500,"Birmingham/Jefferson, St. Clair, Shelby Counti...",Sheltered and Unsheltered Count,1092,104,125,863,334,738,19,...,2,0,2,0.0,10,4,6,10,0.0,2017
3,AL-501,Mobile City & County/Baldwin County CoC,Sheltered and Unsheltered Count,606,116,35,455,222,383,1,...,3,9,12,0.0,18,6,12,18,0.0,2017
4,AL-502,Florence/Northwest Alabama CoC,Sheltered and Unsheltered Count,155,0,8,147,72,81,0,...,0,0,0,0.0,0,0,0,0,0.0,2017


#### Merging the datasets

In [5]:
# Read each yearly CSV into its own DataFrame, in the order given by file_paths.
dfs = [pd.read_csv(file_path) for file_path in file_paths]

# Stack the yearly frames row-wise; ignore_index=True replaces the per-file
# row labels with a single continuous index.
pc17_21 = pd.concat(dfs, ignore_index=True)

# Plain-text preview of the first five rows of the combined frame.
# NOTE(review): the preview shows values such as "1,128", so the count columns
# are presumably read as strings - confirm dtypes before modelling.
print(pc17_21.head())


  CoC Number                                           CoC Name  \
0     AK-500                                      Anchorage CoC   
1     AK-501                        Alaska Balance of State CoC   
2     AL-500  Birmingham/Jefferson, St. Clair, Shelby Counti...   
3     AL-501            Mobile City & County/Baldwin County CoC   
4     AL-502                     Florence/Northwest Alabama CoC   

                       Count Types Overall Homeless  \
0  Sheltered and Unsheltered Count            1,128   
1  Sheltered and Unsheltered Count              717   
2  Sheltered and Unsheltered Count            1,092   
3  Sheltered and Unsheltered Count              606   
4  Sheltered and Unsheltered Count              155   

  Overall Homeless - Under 18 Overall Homeless - Age 18 to 24  \
0                         170                             133   
1                         135                              53   
2                         104                             125   
3     

In [6]:
# (number of rows, number of columns) of the combined frame.
pc17_21.shape

(1940, 543)

In [7]:
# Rich-display preview of the first five rows of the combined frame.
pc17_21.head()

Unnamed: 0,CoC Number,CoC Name,Count Types,Overall Homeless,Overall Homeless - Under 18,Overall Homeless - Age 18 to 24,Overall Homeless - Over 24,Overall Homeless - Female,Overall Homeless - Male,Overall Homeless - Transgender,...,Sheltered ES Homeless Parenting Youth Age 18-24,Sheltered TH Homeless Parenting Youth Age 18-24,Sheltered Total Homeless Parenting Youth Age 18-24,Unsheltered Homeless Parenting Youth Age 18-24,Overall Homeless Children of Parenting Youth,Sheltered ES Homeless Children of Parenting Youth,Sheltered TH Homeless Children of Parenting Youth,Sheltered Total Homeless Children of Parenting Youth,Unsheltered Homeless Children of Parenting Youth,Year
0,AK-500,Anchorage CoC,Sheltered and Unsheltered Count,1128,170,133,825,445,678,4,...,3,9,12,0.0,21,6,15,21,0.0,2017
1,AK-501,Alaska Balance of State CoC,Sheltered and Unsheltered Count,717,135,53,529,320,397,0,...,7,3,10,0.0,18,14,4,18,0.0,2017
2,AL-500,"Birmingham/Jefferson, St. Clair, Shelby Counti...",Sheltered and Unsheltered Count,1092,104,125,863,334,738,19,...,2,0,2,0.0,10,4,6,10,0.0,2017
3,AL-501,Mobile City & County/Baldwin County CoC,Sheltered and Unsheltered Count,606,116,35,455,222,383,1,...,3,9,12,0.0,18,6,12,18,0.0,2017
4,AL-502,Florence/Northwest Alabama CoC,Sheltered and Unsheltered Count,155,0,8,147,72,81,0,...,0,0,0,0.0,0,0,0,0,0.0,2017


#### Looking for any null values.

In [8]:
# Number of null entries in each column (one value per column).
pc17_21.isnull().sum()

CoC Number                                              10
CoC Name                                                10
Count Types                                             19
Overall Homeless                                        10
Overall Homeless - Under 18                             10
                                                        ..
Sheltered ES Homeless Children of Parenting Youth       10
Sheltered TH Homeless Children of Parenting Youth       10
Sheltered Total Homeless Children of Parenting Youth    10
Unsheltered Homeless Children of Parenting Youth        52
Year                                                     0
Length: 543, dtype: int64

##### The "CoC Name" column has 10 null values. Since we are also going to build a clustering model, we should remove the rows where this variable is null.

In [9]:
# Keep only the rows that have a 'CoC Name'; rows where it is null are dropped.
# The name pc17_21 is rebound, so the unfiltered frame is no longer reachable
# under this name after the cell runs.
pc17_21 = pc17_21.dropna(subset=['CoC Name'])

In [11]:
# Per-column tally of missing values in the combined frame.
null_counts = pc17_21.isnull().sum()

# Report only the columns that actually contain nulls, in column order.
print("Columns with null values:")
for column, null_count in null_counts[null_counts > 0].items():
    print(f"{column}: {null_count} null values")


Columns with null values:
CoC Number: 5 null values
Count Types: 9 null values
Unsheltered Homeless - Under 18: 32 null values
Unsheltered Homeless - Age 18 to 24: 35 null values
Unsheltered Homeless - Over 24: 35 null values
Unsheltered Homeless - Female: 34 null values
Unsheltered Homeless - Male: 34 null values
Unsheltered Homeless - Transgender: 35 null values
Unsheltered Homeless - Gender that is not Singularly Female or Male: 34 null values
Unsheltered Homeless - Non-Hispanic/Non-Latin(o)(a)(x): 36 null values
Unsheltered Homeless - Hispanic/Latin(o)(a)(x): 35 null values
Unsheltered Homeless - White: 35 null values
Unsheltered Homeless - Black, African American, or African: 35 null values
Unsheltered Homeless - Asian or Asian American: 35 null values
Unsheltered Homeless - American Indian, Alaska Native, or Indigenous: 35 null values
Unsheltered Homeless - Native Hawaiian or Other Pacific Islander: 35 null values
Unsheltered Homeless - Multiple Races: 34 null values
Unsheltered 

#### Checking how many rows exceed a range of null-count thresholds (more than 2, 5, 10, ..., 100 null values).

In [22]:
def count_rows_with_nulls_above_threshold(data, threshold):
    """Count the rows of ``data`` that hold more than ``threshold`` null values.

    Parameters
    ----------
    data : DataFrame whose missing values are tallied row by row.
    threshold : a row is counted only when its number of nulls is strictly
        greater than this value.

    Returns
    -------
    The number of qualifying rows (the sum of a boolean Series).
    """
    # One null tally per row (axis=1 sums across the columns).
    nulls_per_row = data.isna().sum(axis=1)
    # True for every row whose tally is strictly above the threshold.
    exceeds_threshold = nulls_per_row.gt(threshold)
    return exceeds_threshold.sum()

# Null-count cut-offs to probe, from lenient to strict.
thresholds = [2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# First gather the row count for every cut-off (insertion order follows
# `thresholds`), then report them one line each.
rows_by_threshold = {
    cutoff: count_rows_with_nulls_above_threshold(pc17_21, cutoff)
    for cutoff in thresholds
}
for threshold, rows_above_threshold in rows_by_threshold.items():
    print(f"Number of rows with more than {threshold} null values: {rows_above_threshold}")



Number of rows with more than 2 null values: 60
Number of rows with more than 5 null values: 53
Number of rows with more than 10 null values: 45
Number of rows with more than 20 null values: 42
Number of rows with more than 30 null values: 39
Number of rows with more than 40 null values: 39
Number of rows with more than 50 null values: 36
Number of rows with more than 60 null values: 35
Number of rows with more than 70 null values: 30
Number of rows with more than 80 null values: 30
Number of rows with more than 90 null values: 25
Number of rows with more than 100 null values: 0


36