In [1]:
# Sanity check: prints a fixed message to show that this cell executed.
status_message = "Notebook initialized cleanly"
print(status_message)


Notebook initialized cleanly


## Data loading and inspection

In [20]:
import os

import pandas as pd

In [21]:
# Loading dataset
# The file is read with header=None because it begins with free-text metadata
# rows; the real column labels sit further down and are promoted later.
# The location can be overridden through the ICE_DETENTION_CSV environment
# variable so the notebook also runs on machines where the default path below
# does not exist. Without the variable, behaviour is unchanged.
file_path = os.environ.get(
    "ICE_DETENTION_CSV",
    r"C:\Users\wwwsu\Desktop\All folders\data-screening-exercise\data\messy_ice_detention.csv",
)
df = pd.read_csv(file_path, header=None, encoding='latin1')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,,,,,,,,
1,These statistics are made available to the pub...,,,,,,,
2,"ICE FACILITIES DATA, FY25",,,,,,,
3,"ICE Enforcement and Removal Operations Data, F...",,,,,,,
4,This list is limited to facilities that have a...,,,,,,,


The first 6 rows contain only metadata that was read into the dataframe; they should be removed so that only the usable data remains in the dataset.

In [22]:
# Removing the metadata rows that sit above the real header row.
# The header row is located by content (first cell equal to 'Name') instead of
# a hard-coded offset, so re-running this cell right away finds the header at
# position 0 and drops nothing further.
is_header_row = df.iloc[:, 0] == 'Name'
# Fail loudly rather than slice at an arbitrary position when no header row is
# present (e.g. the file layout changed or the header was already promoted).
assert is_header_row.any(), "header row (first cell == 'Name') not found"
# argmax on a boolean array returns the position of the first True.
header_pos = int(is_header_row.to_numpy().argmax())
df = df.iloc[header_pos:].reset_index(drop=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Name,City,State,Level A,Level B,Level C,Level D,Last Inspection End Date
1,ADAMS COUNTY DET CENTER,NATCHEZ,MS,1876.461078,266.4311377,6.724550898,4.25748503,45673
2,ADELANTO ICE PROCESSING CENTER,ADELANTO,CA,6.401197605,4.323353293,22.60479042,32.63473054,45491
3,ALAMANCE COUNTY DETENTION FACILITY,GRAHAM,NC,4.329341317,3.461077844,6.083832335,6.185628743,45554
4,ALEXA$NDRIA STAGING FACILITY,ALEXANDRIA,LA,137.5209581,47.41317365,76.00598802,52.48502994,45533


The first row holds the column names but is not yet the header of the dataframe, so it should be promoted to the header.


In [23]:
# Setting first row as header
# Guarded so the cell is safe to re-run: once 'Name' is a column label the
# header has already been promoted, and promoting again would turn the first
# real data row into the header and drop it from the data.
if 'Name' not in df.columns:
    # .tolist() keeps only the labels; assigning the row Series directly would
    # also carry the Series' name (the row label 0) over as the columns' name.
    df.columns = df.iloc[0].tolist()
    df = df.iloc[1:].reset_index(drop=True)
df.head()

Unnamed: 0,Name,City,State,Level A,Level B,Level C,Level D,Last Inspection End Date
0,ADAMS COUNTY DET CENTER,NATCHEZ,MS,1876.461078,266.4311377,6.724550898,4.25748503,45673.0
1,ADELANTO ICE PROCESSING CENTER,ADELANTO,CA,6.401197605,4.323353293,22.60479042,32.63473054,45491.0
2,ALAMANCE COUNTY DETENTION FACILITY,GRAHAM,NC,4.329341317,3.461077844,6.083832335,6.185628743,45554.0
3,ALEXA$NDRIA STAGING FACILITY,ALEXANDRIA,LA,137.5209581,47.41317365,76.00598802,52.48502994,45533.0
4,ALLEGANY COUNTY JAIL,BELMONT,NY,1.221556886,0.018,0.0,0.0,


### Data cleaning 

In [24]:
# Removing unnecessary special characters from the 'Name' column:
# keep only ASCII letters and whitespace, then trim both ends.
# NOTE(review): the pattern also removes digits and punctuation such as '.',
# '/' and '-' -- confirm that no genuine facility name relies on them.
non_letter_pattern = r'[^a-zA-Z\s]'
letters_only = df['Name'].str.replace(non_letter_pattern, '', regex=True)
df['Name'] = letters_only.str.strip()
df.head(10)

Unnamed: 0,Name,City,State,Level A,Level B,Level C,Level D,Last Inspection End Date
0,ADAMS COUNTY DET CENTER,NATCHEZ,MS,1876.461078,266.4311377,6.724550898,4.25748503,45673.0
1,ADELANTO ICE PROCESSING CENTER,ADELANTO,CA,6.401197605,4.323353293,22.60479042,32.63473054,45491.0
2,ALAMANCE COUNTY DETENTION FACILITY,GRAHAM,NC,4.329341317,3.461077844,6.083832335,6.185628743,45554.0
3,ALEXANDRIA STAGING FACILITY,ALEXANDRIA,LA,137.5209581,47.41317365,76.00598802,52.48502994,45533.0
4,ALLEGANY COUNTY JAIL,BELMONT,NY,1.221556886,0.018,0.0,0.0,
5,ALLEN PARISH PUBLIC SAFETY COMPLEX,OBERLIN,LA,101.3113772,30.25149701,33.76646707,10.66467066,45638.0
6,ATLANTA US PEN,ATLANTA,,17.34131737,3.610778443,3.053892216,2.724550898,
7,BAKER COUNTY SHERIFF DEPT,MACCLENNY,FL,26.22155689,38.64071856,88.65269461,88.73053892,45589.0
8,BERLIN FED CORR INST,BERLIN,NH,1.748502994,0.0898,0.97005988,0.71257485,
9,BLUEBONNET DETENTION FACILITY,ANSON,TX,311.3053892,164.5508982,241.6167665,104.5149701,45638.0


In [25]:
# Identifying blank cells in the 'Name' column.
# A name counts as blank when it is an empty string OR missing (NaN).
# Comparing with '' alone would skip missing values, because NaN never
# compares equal to ''.
blanks_names = df['Name'].isna() | df['Name'].eq('')
df[blanks_names]

Unnamed: 0,Name,City,State,Level A,Level B,Level C,Level D,Last Inspection End Date
116,,ELK RIVER,MN,4.592814371,5.598802395,9.281437126,2.760479042,45414
123,,DOVER,NH,0.0,0.0,49.16167665,41.33532934,45547


Using the city and state as a reference, the blank names are filled in manually by looking up each facility in the clean data available on the webpage mentioned in the instruction PDF.

* Index 116 is SHERBURNE COUNTY JAIL
* Index 123 is STE. GENEVIEVE COUNTY SHERIFF/JAIL


In [26]:
# Manually filling the verified facility names.
# Each entry maps a row label to (expected City, expected State, Name).
manual_name_fixes = {
    116: ('ELK RIVER', 'MN', 'SHERBURNE COUNTY JAIL'),
    # NOTE(review): the name suggests Ste. Genevieve County (believed to be in
    # Missouri), while this row is DOVER, NH -- re-verify against the source list.
    123: ('DOVER', 'NH', 'STE. GENEVIEVE COUNTY SHERIFF/JAIL'),
}
for row_label, (city, state, facility_name) in manual_name_fixes.items():
    # A missing label raises KeyError here instead of silently appending a row.
    current = df.loc[row_label]
    # Guard against row labels shifting (different file, changed upstream
    # cells): the row must still be the facility the name was looked up for.
    assert (current['City'], current['State']) == (city, state), (
        f"row {row_label} is not {city}, {state}; row labels may have shifted"
    )
    df.loc[row_label, 'Name'] = facility_name


In [27]:
# Display the two manually filled rows to confirm the new names
updated_rows = [116, 123]
columns_to_check = ['Name', 'City', 'State']
df.loc[updated_rows, columns_to_check]



Unnamed: 0,Name,City,State
116,SHERBURNE COUNTY JAIL,ELK RIVER,MN
123,STE. GENEVIEVE COUNTY SHERIFF/JAIL,DOVER,NH
