### Combining Valid Results

In [None]:
import pandas as pd

# Read data from csv and some data cleaning
valid_results = pd.read_csv("valid_results.csv")

valid_results.columns = valid_results.columns.str.strip()
valid_results.rename(columns = {"gpt_explanation(four questions: 1. Does this text mention location or state?; 2. Is the text related to immigration raids/arrests?; 3. Does this text mention the date and is the date of this immigration raid between {start} and {end}?; 4. Does this text confirm that the raid was conducted by Immigration and Customs Enforcement? Explicitly say yes or no in the first word of your responses along with your explanation.)" : "explanation"}, inplace = True)

valid_results.head()


In [None]:

# Convert date columns to datetime format
valid_results['arrestdate'] = pd.to_datetime(valid_results['arrestdate'])
valid_results['date_start_para'] = pd.to_datetime(valid_results['date_start_para'])
valid_results['date_end_para'] = pd.to_datetime(valid_results['date_end_para'])

# Sort the dataframe by CountyName, ST, and arrestdate to ensure chronological order within each group
valid_results = valid_results.sort_values(by=['CountyName', 'ST', 'arrestdate'])

# Create a helper column for the 'date_diff' (difference between the current row's arrest date and the first arrest date for each group)
valid_results['date_diff'] = valid_results.groupby(['CountyName', 'ST'])['arrestdate'].transform(lambda x: (x - x.min()).dt.days)

# Instead of using `apply`, calculate 'raid_group' using vectorized operations
valid_results['raid_group'] = (valid_results['date_diff'] // 15).astype(int)

# Aggregate the data
agg_df = valid_results.groupby('raid_group').agg(
    CountyName=('CountyName', 'first'),
    ST=('ST', 'first'),
    arrestdate=('arrestdate', 'first'),
    source_title=('source_title', lambda x: list(x.unique())),  # Get unique source titles
    source_link=('source_link', lambda x: list(x.unique())),  # Get unique source links
    publish_date=('publish_date', 'first'),
    explanation=('explanation', lambda x: x.tolist())
).reset_index(drop=True)

# Find the maximum number of sources in any row
max_sources = agg_df['source_title'].apply(len).max()

# Create source columns dynamically based on the maximum number of sources
source_columns = [f'Source_{i+1}' for i in range(max_sources)]

# Split the source_title lists into separate columns dynamically
agg_df[source_columns] = pd.DataFrame(agg_df['source_title'].tolist(), index=agg_df.index)

# Drop the original lists of sources and explanations
agg_df = agg_df.drop(columns=['source_title', 'source_link', 'explanation'])

agg_df.to_csv('aggregated_results.csv', index=False)

agg_df

### Combining Andy's Scraped Data

In [12]:
import pandas as pd

# Read from csv
search_results = pd.read_csv("SearchResultsAndy.csv")
search_results['arrest_date'] = pd.to_datetime(search_results['arrest_date'])

search_results.head()

Unnamed: 0,query,search_pattern,StateCountyFIPS,ST,CountyName,FIPSState,FIPSCounty,arrest_date,title,url,date_published
0,"Immigration raid Alameda, California",pattern1,6001,CA,Alameda,6,1,2015-08-06,"Readers React: A father deported, kids left be...",https://www.latimes.com/opinion/readersreact/l...,2024-11-16T13:57:00.0000000Z
1,"Immigration raid Alameda, California",pattern1,6001,CA,Alameda,6,1,2015-08-06,"Spurned by local law enforcement, ICE stages e...",https://www.latimes.com/local/politics/la-me-i...,2024-10-27T19:58:00.0000000Z
2,"Immigration raid Alameda, California",pattern1,6001,CA,Alameda,6,1,2015-08-06,California gives immigrants here illegally unp...,https://www.latimes.com/local/california/la-me...,2024-11-12T06:30:00.0000000Z
3,"Immigration raid Alameda, California",pattern1,6001,CA,Alameda,6,1,2015-08-06,California's Special Restrictions on Who May C...,https://www.littler.com/publication-press/publ...,2024-11-19T23:54:00.0000000Z
4,"Immigration raid Alameda, California",pattern1,6001,CA,Alameda,6,1,2015-08-06,"UPAC Raids | News, Videos & Articles",https://globalnews.ca/tag/upac-raids/,2024-07-08T20:32:00.0000000Z


In [18]:
import pandas as pd

# Convert date columns to datetime format
search_results['arrest_date'] = pd.to_datetime(search_results['arrest_date'])
search_results['date_published'] = pd.to_datetime(search_results['date_published'])

# Sort the dataframe by CountyName, ST, and arrest_date to ensure chronological order within each group
search_results = search_results.sort_values(by=['CountyName', 'ST', 'arrest_date'])

# Create a helper column for the 'date_diff' (difference between the current row's arrest date and the first arrest date for each group)
search_results['date_diff'] = search_results.groupby(['CountyName', 'ST'])['arrest_date'].transform(lambda x: (x - x.min()).dt.days)

# Calculate 'raid_group' using vectorized operations
search_results['raid_group'] = (search_results['date_diff'] // 15).astype(int)

# Aggregate the data
agg_df = search_results.groupby('raid_group').agg(
    CountyName=('CountyName', 'first'),
    ST=('ST', 'first'),
    arrest_date=('arrest_date', 'first'),
    url=('url', lambda x: list(x.unique())),  # Get unique URLs
    date_published=('date_published', 'first')
).reset_index(drop=True)

# Limit the maximum number of URLs to 3
max_sources = 10  # Set limit to 3

# Function to pad lists to the maximum length
def pad_list(lst, length, fill_value=None):
    return lst + [fill_value] * (length - len(lst)) if len(lst) < length else lst[:length]

# Pad the 'url' lists to the same length for each group, ensuring no extra columns are created
agg_df['url'] = agg_df['url'].apply(lambda x: pad_list(x, max_sources))

# Create columns for URLs dynamically based on max_sources (limit to 3)
url_columns = [f'URL_{i+1}' for i in range(max_sources)]

# Split the 'url' lists into separate columns dynamically
url_df = pd.DataFrame(agg_df['url'].tolist(), columns=url_columns, index=agg_df.index)

# Concatenate the original dataframe with the new URL columns
agg_df = pd.concat([agg_df.drop(columns=['url']), url_df], axis=1)

# Save the aggregated dataframe to a CSV file
agg_df.to_csv('aggregated_search_results.csv', index=False)

# Display the aggregated dataframe
agg_df


Unnamed: 0,CountyName,ST,arrest_date,date_published,URL_1,URL_2,URL_3,URL_4,URL_5,URL_6,URL_7,URL_8,URL_9,URL_10
0,Adams,CO,2014-10-02,2024-11-12 03:36:00+00:00,https://www.techinasia.com/digital-nomads-chas...,https://www.tribune242.com/news/2014/oct/10/ma...,https://siamstartup.com/news/co-working-in-tha...,https://www.huffpost.com/entry/immigration-ice...,https://www.vox.com/2014/10/10/18088638/child-...,https://www.chiangmaicitylife.com/citynews/gen...,https://www.reddit.com/r/Thailand/comments/2hy...,https://www.denverpost.com/2014/10/01/document...,https://www.reddit.com/r/unitedkingdom/comment...,https://www.reddit.com/r/ukpolitics/comments/2...
1,Adams,CO,2014-10-23,2024-11-03 08:09:00+00:00,https://www.buzzfeednews.com/article/adriancar...,https://www.tribune242.com/news/2014/nov/05/pa...,https://www.americanprogress.org/article/10-fa...,https://www.facebook.com/WICIR/posts/call-for-...,https://www.npr.org/2014/11/03/361069785/color...,https://sentinelcolorado.com/opinion/endorseme...,https://www.vox.com/2014/10/29/7083371/swat-no...,https://www.denverpost.com/2014/10/22/northern...,https://www.slideshare.net/slideshow/colorado-...,https://www.reddit.com/r/Colorado/comments/2ko...
2,Adams,CO,2014-11-05,2024-11-14 15:07:00+00:00,https://www.pewresearch.org/race-and-ethnicity...,https://www.tribune242.com/news/2014/nov/05/pa...,https://www.buzzfeednews.com/article/adriancar...,https://www.nytimes.com/2014/11/14/us/obama-im...,https://www.nytimes.com/2014/11/15/us/obama-im...,https://www.nytimes.com/2014/11/16/us/obamas-i...,https://legalservicesincorporated.com/immigrat...,https://daily.jstor.org/influence-wars-america...,https://www.bahamaslocal.com/newsitem/111751/P...,https://www.hamhigh.co.uk/news/crime/21373505....
3,Adams,CO,2014-11-21,2024-08-24 16:09:00+00:00,https://www.cbsnews.com/colorado/news/man-arre...,https://www.cbsnews.com/colorado/news/immigrat...,https://aulawreview.org/blog/fundamentally-unf...,https://www.nytimes.com/2014/12/04/us/politics...,https://www.vox.com/2014/11/20/7255349/obama-i...,https://www.nytimes.com/2014/12/04/us/executiv...,https://www.americanprogress.org/article/remov...,https://www.usnews.com/news/articles/2014/11/2...,https://www.nytimes.com/2014/11/26/us/obamas-i...,https://thelincolnite.co.uk/2014/12/lincoln-ca...
4,Alameda,CA,2015-10-07,2024-11-06 22:20:00+00:00,https://oag.ca.gov/news/press-releases/attorne...,https://www.noozhawk.com/ice_shows_off_santa_m...,https://www.visaliatimesdelta.com/story/news/l...,https://www.nytimes.com/2015/10/11/magazine/is...,https://pembrokeshire-herald.com/18169/illegal...,https://money.cnn.com/2015/10/16/technology/sn...,https://www.politico.com/states/california/sto...,https://www.reddit.com/r/PleX/comments/3ol37z/...,https://blogs.lse.ac.uk/usappblog/2015/10/14/h...,https://www.cnn.com/2015/10/14/world/elian-gon...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,Adams,CO,2018-03-30,2024-09-08 13:52:00+00:00,https://www.vice.com/en/article/trumps-biggest...,https://www.vice.com/en/article/inside-ices-bi...,https://www.courthousenews.com/nearly-100-immi...,https://www.westword.com/news/colorado-health-...,https://www.propublica.org/article/pennsylvani...,https://www.cnn.com/2018/04/12/us/tennessee-im...,https://www.knoxnews.com/story/news/local/tenn...,https://theintercept.com/2018/04/10/ice-raids-...,https://fox17.com/news/local/nearly-100-people...,https://www.seattletimes.com/business/immigrat...
86,Anchorage,AK,2018-05-30,2024-10-30 06:57:00+00:00,https://www.commondreams.org/views/2018/06/13/...,https://www.adn.com/opinions/national-opinions...,https://www.facebook.com/StanfordImmigrationPo...,https://www.independent.co.uk/news/world/ameri...,https://www.mercurynews.com/2018/06/10/immigra...,https://www.independent.co.uk/news/world/ameri...,https://www.nurserymag.com/news/corsos-immigra...,https://thehill.com/latino/391057-undercover-i...,https://www.seattletimes.com/nation-world/ice-...,https://www.urbanacitizen.com/2018/06/10/immig...
87,Adams,CO,2018-05-07,2024-11-19 07:23:00+00:00,https://www.desmoinesregister.com/story/news/i...,https://www.desmoinesregister.com/story/news/i...,https://www.kcrg.com/content/news/Postville-co...,https://www.iowapublicradio.org/ipr-news/2018-...,https://www.univision.com/univision-news/immig...,https://www.desmoinesregister.com/story/news/i...,https://americasvoice.org/press_releases/a-tal...,https://www.facebook.com/ColorinColorado.org/p...,https://www.kcrg.com/content/news/Immigration-...,https://www.cbpp.org/research/administration-a...
88,Arapahoe,CO,2018-05-31,2024-07-14 00:59:00+00:00,https://www.cbsnews.com/news/immigration-raids...,https://www.nbcnews.com/news/us-news/immigrati...,https://www.washingtontimes.com/news/2018/jun/...,https://www.gardencentermag.com/news/corsos-im...,https://www.independent.co.uk/news/world/ameri...,https://lawandcrime.com/immigration/ice-arrest...,https://www.timesrepublican.com/news/todays-ne...,https://www.cnn.com/2018/06/06/us/ice-undocume...,https://www.americamagazine.org/faith/2018/06/...,https://apnews.com/general-news-4f224c22096644...
