## 1. Inspecting DataFrames

In [91]:
import pandas as pd
import numpy as np

In [77]:
df = pd.read_csv("../ZDatasets/homelessness.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
0,0,East South Central,Alabama,2570.0,864.0,4887681
1,1,Pacific,Alaska,1434.0,582.0,735139
2,2,Mountain,Arizona,7259.0,2606.0,7158024
3,3,West South Central,Arkansas,2280.0,432.0,3009733
4,4,Pacific,California,109008.0,20964.0,39461588


In [78]:
### Exploring a DataFrame
print(df.info())
print(df.shape)
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      51 non-null     int64  
 1   region          51 non-null     object 
 2   state           51 non-null     object 
 3   individuals     51 non-null     float64
 4   family_members  51 non-null     float64
 5   state_pop       51 non-null     int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 2.5+ KB
None
(51, 6)
       Unnamed: 0    individuals  family_members     state_pop
count   51.000000      51.000000       51.000000  5.100000e+01
mean    25.000000    7225.784314     3504.882353  6.405637e+06
std     14.866069   15991.025083     7805.411811  7.327258e+06
min      0.000000     434.000000       75.000000  5.776010e+05
25%     12.500000    1446.500000      592.000000  1.777414e+06
50%     25.000000    3082.000000     1482.000000  4.461153e+06
75%     37.500000    6

In [79]:
### Components of a Dataframe
df.values
print(df.columns)
print(df.index)

Index(['Unnamed: 0', 'region', 'state', 'individuals', 'family_members',
       'state_pop'],
      dtype='object')
RangeIndex(start=0, stop=51, step=1)


### 1.2. Sorting the DataFrame

#### 1.2.1 Sort a column in descending order

In [80]:
df.sort_values('state', ascending=False).head()

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
50,50,Mountain,Wyoming,434.0,205.0,577601
49,49,East North Central,Wisconsin,2740.0,2167.0,5807406
48,48,South Atlantic,West Virginia,1021.0,222.0,1804291
47,47,Pacific,Washington,16424.0,5880.0,7523869
46,46,South Atlantic,Virginia,3928.0,2047.0,8501286


#### 1.2.2 Sort by multiple variables

In [None]:
df.sort_values(['region','state'],ascending=[False,True]).head()

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
3,3,West South Central,Arkansas,2280.0,432.0,3009733
18,18,West South Central,Louisiana,2540.0,519.0,4659690
36,36,West South Central,Oklahoma,2823.0,1048.0,3940235
43,43,West South Central,Texas,19199.0,6111.0,28628666
15,15,West North Central,Iowa,1711.0,1038.0,3148618
16,16,West North Central,Kansas,1443.0,773.0,2911359
23,23,West North Central,Minnesota,3993.0,3250.0,5606249
25,25,West North Central,Missouri,3776.0,2107.0,6121623
27,27,West North Central,Nebraska,1745.0,676.0,1925614
34,34,West North Central,North Dakota,467.0,75.0,758080


#### 1.2.3 Subsetting

In [82]:
# Subsetting columns
df['state'].head()

0       Alabama
1        Alaska
2       Arizona
3      Arkansas
4    California
Name: state, dtype: object

In [83]:
# Subsetting multiple columns
df[['region','state']].head()

# Or
subset_col = ['region','state']
df[subset_col].head()

Unnamed: 0,region,state
0,East South Central,Alabama
1,Pacific,Alaska
2,Mountain,Arizona
3,West South Central,Arkansas
4,Pacific,California


In [84]:
# Subsetting rows
df[df['family_members'] < 500].head()

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
3,3,West South Central,Arkansas,2280.0,432.0,3009733
7,7,South Atlantic,Delaware,708.0,374.0,965479
24,24,East South Central,Mississippi,1024.0,328.0,2981020
26,26,Mountain,Montana,983.0,422.0,1060665
28,28,Mountain,Nevada,7058.0,486.0,3027341


In [85]:
# Subsetting based on text
df[df['state'] == "Arkansas"]

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
3,3,West South Central,Arkansas,2280.0,432.0,3009733


In [86]:
# Subsetting based on multiple conditions
under_400_fm = df['family_members'] < 400
region_mountain = df['region'] == "Mountain"
df[under_400_fm & region_mountain]

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
50,50,Mountain,Wyoming,434.0,205.0,577601


In [87]:
# Using .isin()
condition = df['state'].isin(['Wyoming'])
df[condition]

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
50,50,Mountain,Wyoming,434.0,205.0,577601


## 2. Summary statistics

In [51]:
# Summarising numerical data
df.head()
print(f"Mean: {df['family_members'].mean()}")
print(f"Median: {df['family_members'].median()}")
print(f"Mode: {df['family_members'].mode()}")
print(f"Var: {df['family_members'].var()}")
print(f"Std: {df['family_members'].std()}")
print(f"Sum: {df['family_members'].sum()}")
print(f"Quantile 0.75: {df['family_members'].quantile(0.75)}")
print(f"Min: {df['family_members'].min()}")
print(f"Max: {df['family_members'].max()}")

Mean: 3504.8823529411766
Median: 1482.0
Mode: 0    3250.0
Name: family_members, dtype: float64
Var: 60924453.54588234
Std: 7805.411811421761
Sum: 178749.0
Quantile 0.75: 3196.0
Min: 75.0
Max: 52070.0


### 2.1 Summaries on multiple columns

In [95]:
df[['family_members','individuals']].agg(["mean", "median", "var", "std"])

Unnamed: 0,family_members,individuals
mean,3504.882,7225.784
median,1482.0,3082.0
var,60924450.0,255712900.0
std,7805.412,15991.03


### 2.2 Dropping duplicates