In [2]:
# DATA MANIPULATION WITH PANDAS
#################################################################################

import pandas as pd

homelessness = pd.read_csv('homelessness.csv')


In [9]:
######################################################## SORTING
# 
# df.sort_values('column_name')  sorts by 'column_name' in ascending order
# use ascending=False to reverse order

# can sort by multiple columns. pass a list of columns.
# df.sort_values(['col1'],['col2']...) sorts by col1 first, then col2
# can pas a list to ascending=[True,False,...] 

####################################     Sorting Problems     #########################

# Sort homelessness by the number of homeless individuals in the individuals column, from smallest to largest, and save this as homelessness_ind.
# Print the head of the sorted DataFrame.

homeless_ind = homelessness.sort_values('individuals')
print(homeless_ind.head())


# sort homelessness by the number of homeless family_members in descending order, 
# and save this as homelessness_fam.

homelessness_fam = homelessness.sort_values('family_members', ascending=False)
print(homelessness_fam.head())

# Sort homelessness first by region (ascending), and then by number of family members (descending). 
# Save this as homelessness_reg_fam

homelessness_reg_fam = homelessness.sort_values(['region','family_members'], ascending = [True,False])
print(homelessness_reg_fam.head())



    Unnamed: 0              region         state  individuals  family_members  \
50          50            Mountain       Wyoming        434.0           205.0   
34          34  West North Central  North Dakota        467.0            75.0   
7            7      South Atlantic      Delaware        708.0           374.0   
39          39         New England  Rhode Island        747.0           354.0   
45          45         New England       Vermont        780.0           511.0   

    state_pop  
50     577601  
34     758080  
7      965479  
39    1058287  
45     624358  
    Unnamed: 0              region          state  individuals  \
32          32        Mid-Atlantic       New York      39827.0   
4            4             Pacific     California     109008.0   
21          21         New England  Massachusetts       6811.0   
9            9      South Atlantic        Florida      21443.0   
43          43  West South Central          Texas      19199.0   

    family_members  

In [13]:
################################################ SUBSETTING COLUMNS
#
# df['column_name'] will subset just that column
# df[['col_1', 'col2', ...]] subsets with mult columns
# need 2 square brackets. outer subsets the df, inner provides a list of columns to subset
# can also use a predefined list to subset with
# cols_to_subset = ['col1', 'col2', 'col3',...]
# df[cols_to_subset]


############################## SUBSETTING PROBLEMS ###################################################

# Create a Series called individuals that contains only the individuals column of homelessness.

individuals = homelessness['individuals']
print(individuals.head())

# Create a DataFrame called state_fam that contains only the state and family_members columns of homelessness, in that order.

state_fam = homelessness[['state', 'family_members']]
print(state_fam.head())

# Create a DataFrame called ind_state that contains the individuals and state columns of homelessness, in that order.

ind_state = homelessness[['individuals','state']]
print(ind_state.head())



0      2570.0
1      1434.0
2      7259.0
3      2280.0
4    109008.0
Name: individuals, dtype: float64
        state  family_members
0     Alabama           864.0
1      Alaska           582.0
2     Arizona          2606.0
3    Arkansas           432.0
4  California         20964.0
   individuals       state
0       2570.0     Alabama
1       1434.0      Alaska
2       7259.0     Arizona
3       2280.0    Arkansas
4     109008.0  California


In [21]:
######################################### FILTERING (SUBSETTING ROWS)
 
# df['height'] > 50  will return a list of True/False for each row of data

# df[df['height] > 50] will return a subsetted dataframe of rows that match the condition
# can also use other operators such as df[df['breed'] == 'poodle']

# use parentheses around each condition if using multiple
# df[ (df['height] > 50) & (df['breed'] == 'poodle') ]

# isin() method subsets based on categorical data
# takes in a list of categorical values to filter by
# dogs['color'].isin(['black', 'brown'])

##########################    FILTERING PROBLEMS      #################################

# Filter homelessness for cases where the number of individuals is greater than ten thousand, assigning to ind_gt_10k. View the printed result.

ind_gt_10k = homelessness[homelessness['individuals'] > 10000]
print(ind_gt_10k)

# Filter homelessness for cases where the USA Census region is "Mountain", assigning to mountain_reg. View the printed result.

mountain_reg = homelessness[homelessness['region'] == 'Mountain']
print(mountain_reg)

# Filter homelessness for cases where the number of family_members is less than one thousand and the region is "Pacific", assigning to fam_lt_1k_pac. View the printed result.         

fam_lt_1k_pac = homelessness[(homelessness['family_members'] < 1000) & (homelessness['region'] == 'Pacific')]
print(fam_lt_1k_pac)

# Filter homelessness for cases where the USA census state is in the list of Mojave states, canu, assigning to mojave_homelessness. View the printed result.

canu = ["California", "Arizona", "Nevada", "Utah"]

mojave_homelessness = homelessness[homelessness['state'].isin(canu)]
print(mojave_homelessness)


    Unnamed: 0              region       state  individuals  family_members  \
4            4             Pacific  California     109008.0         20964.0   
9            9      South Atlantic     Florida      21443.0          9587.0   
32          32        Mid-Atlantic    New York      39827.0         52070.0   
37          37             Pacific      Oregon      11139.0          3337.0   
43          43  West South Central       Texas      19199.0          6111.0   
47          47             Pacific  Washington      16424.0          5880.0   

    state_pop  
4    39461588  
9    21244317  
32   19530351  
37    4181886  
43   28628666  
47    7523869  
    Unnamed: 0    region       state  individuals  family_members  state_pop
2            2  Mountain     Arizona       7259.0          2606.0    7158024
5            5  Mountain    Colorado       7607.0          3250.0    5691287
12          12  Mountain       Idaho       1297.0           715.0    1750536
26          26  Mountain  

In [32]:
###################################  NEW COLUMNS

#  add a new column for each dog's height in meters
# dogs['height_m'] = dogs['height_cm']/100

# add a BMI column - weight/height-squared
# dogs['bmi'] = dogs['weight_kg'] / dogs['height_m'] ** 2

###########################    MULTIPLE MANIPULATIONS

#  find the skinny tall dogs
# bmi_lt_100 = dogs[['bmi'] < 100 ]
# bmi_lt_100_height = bmi_lt_100.sort_values('height_cm', ascending=False)
# bmi_lt_100_height[['name', 'height_cm', 'bmi']]


# Add a new column to homelessness, named total, containing the sum of the individuals and family_members columns.
homelessness['total'] = homelessness['individuals'] + homelessness['family_members']

# Add another column to homelessness, named p_homeless, containing the proportion of the total homeless population to the total population in each state state_pop.
homelessness['p_homeless'] = homelessness['total'] / homelessness['state_pop']


# Add a column to homelessness, indiv_per_10k, containing the number of homeless individuals per ten thousand people in each state, using state_pop for state population.
homelessness['indiv_per_10k'] = (homelessness['individuals'] / homelessness['state_pop']) * 10000 

# Subset rows where indiv_per_10k is higher than 20, assigning to high_homelessness.
high_homelessness = homelessness[homelessness['indiv_per_10k'] > 20]

# Sort high_homelessness by descending indiv_per_10k, assigning to high_homelessness_srt.
high_homelessness_srt = high_homelessness.sort_values('indiv_per_10k',ascending=False)

# Select only the state and indiv_per_10k columns of high_homelessness_srt and save as result. Look at the result.
result = high_homelessness_srt[['state','indiv_per_10k']]
print(result)


                   state  indiv_per_10k
8   District of Columbia      53.738381
11                Hawaii      29.079406
4             California      27.623825
37                Oregon      26.636307
28                Nevada      23.314189
47            Washington      21.829195
32              New York      20.392363
