In [42]:
import pandas as pd

# pd.set_option('display.max_colwidth', None)

# Raw Layer

## Find URLs that have the same host

> Inputs in this section:<br>
urls_658.csv (ideal list of URLs)

In [26]:
import pandas as pd

from urllib.parse import urlparse

In [2]:
# Read the curated URL list and relabel its columns as name / url.
df = pd.read_csv("urls_658.csv").set_axis(["name", "url"], axis=1)

In [3]:
# Split every URL into the two parts used for host-level grouping.
# Single-column assignment is used on purpose: assigning a Series through a
# list-of-columns key (df[["hostname"]] = ...) raises
# "Columns must be same length as key" on recent pandas releases.
df["hostname"] = df["url"].apply(lambda x: urlparse(x).hostname)
df["path"] = df["url"].apply(lambda x: urlparse(x).path)

In [4]:
# Number of listed URLs per hostname, as a two-column frame (hostname, count).
df1 = df.groupby('hostname')['url'].size().reset_index(name='count')
# Display only the hostnames that are shared by more than one URL.
df1[df1['count'] > 1]

Unnamed: 0,hostname,count
31,communitynews.org,10
34,dailyvoice.com,22
54,hudsonreporter.com,3
62,magic983.com,2
65,medium.com,2
70,mybeachradio.com,2
73,newjersey.news12.com,2
88,patch.com,101
99,rennamedia.com,20
124,thepressgroup.net,2


> *Bunch of manual work later, reward received:*<br>
XLSX file containing distinct host URLs and same host URLs (urls.xlsx).

## Output contents of each URL to different CSVs

> Inputs in this section:<br>
data.csv (data got from ARCH)<br>
urls.xlsx (output from previous section which holds distinct and duplicate hosts)

In [12]:
# Run only if the data needs to be downloaded again
# !pip install gdown --user

# import gdown

# gdown.download(id="1z0NIFs--FCaD_q9W3B_IstX1rsX5kDW1", output="data.csv", quiet=False)

In [None]:
# !pip install openpyxl --user
# !pip install xlrd --user

In [1]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [4]:
def process_hosts(urls, urls_list, output_path, starting_number = 0):
    """Export the crawled rows of every URL in ``urls_list`` to per-URL CSV files.

    For each URL, starting at position ``starting_number``, the rows of the
    ``data`` temp view whose ``url`` begins with that URL are collected. When at
    least one row matches, two files named ``{i}_{host_and_path}.csv`` are written:
    the full result under ``duplicate_articles/{output_path}/`` and a copy
    de-duplicated on ``content`` under ``distinct_articles/{output_path}/``.

    Parameters
    ----------
    urls : any
        Not used by the function; kept so existing calls keep working.
    urls_list : list of str
        URL prefixes to export. Each one must contain ``//``.
    output_path : str
        Sub-folder (inside both output roots) that receives the CSV files.
    starting_number : int, default 0
        Index in ``urls_list`` to start from, e.g. to resume an interrupted run.

    Notes
    -----
    Relies on the notebook-level ``spark`` session and on the ``data`` temp view.
    """
    import os

    # Make sure both output folders exist before the first write.
    for root in ("duplicate_articles", "distinct_articles"):
        os.makedirs(f"{root}/{output_path}", exist_ok=True)

    crawl = spark.table("data").select("crawl_date", "domain", "url", "content")
    for i in range(starting_number, len(urls_list)):
        url = urls_list[i]
        file_name = url.split("//")[1].replace('/', '_')
        # Literal prefix match: unlike a string-built LIKE pattern, '_' and '%'
        # inside the URL are not wildcards and quotes cannot break the query.
        # Collecting once also avoids running the query a second time just to
        # test for emptiness.
        temp_df = crawl.where(col("url").startswith(url)).toPandas()
        if not temp_df.empty:
            temp_df.to_csv(f"duplicate_articles/{output_path}/{i}_{file_name}.csv", index=False)
            temp_df = temp_df.drop_duplicates(subset='content', keep="first")
            temp_df.to_csv(f"distinct_articles/{output_path}/{i}_{file_name}.csv", index=False)
        print(f"Done with {i}. {url}")
    print("*" * 20 + "FINISHED" + "*" * 20)

In [3]:
# Reuse the active Spark session if the kernel already provides one, otherwise
# start it, so this cell also works on a freshly restarted kernel.
spark = SparkSession.builder.getOrCreate()
# NOTE(review): if the content column can contain line breaks, the CSV reader may
# need multiLine/escape options to keep each article in one record - confirm.
data = spark.read.csv('data.csv', header=True)
data.createOrReplaceTempView("data")

In [None]:
# distinct host urls (sheet 0 of urls.xlsx)
urls_df = pd.read_excel('urls.xlsx', sheet_name=0, index_col=None)
process_hosts(
    urls=urls_df,
    urls_list=urls_df['url'].tolist(),
    output_path="distinct_host",
    starting_number=0,
)

In [None]:
# duplicate host urls (sheet 1 of urls.xlsx)
urls_df = pd.read_excel('urls.xlsx', sheet_name=1, index_col=None)
process_hosts(
    urls=urls_df,
    urls_list=urls_df['url'].tolist(),
    output_path="duplicate_host",
    starting_number=0,
)

In [7]:
# Fix the duplicate URLs
import pandas as pd

# For each per-host CSV: the URL prefixes whose rows must be removed from it.
# Presumably these prefixes are other entries of the URL list that share the
# same host - verify against urls.xlsx.
# Every value is a real tuple (note the trailing comma on single entries).
links = {
  "147_communitynews.org_.csv": ('https://communitynews.org/bordentown-current',
                                  'https://communitynews.org/ewing-observer',
                                  'https://communitynews.org/hamilton-post',
                                  'https://communitynews.org/hopewell-express',
                                  'https://communitynews.org/lawrence-gazette',
                                  'https://communitynews.org/princeton-echo',
                                  'https://communitynews.org/robbinsville-advance',
                                  'https://communitynews.org/trenton-downtowner',
                                  'https://communitynews.org/west-windsor-news'),
  "188_www.jerseyshoreonline.com_.csv": ('https://www.jerseyshoreonline.com/thehowelltimes',
                                          'https://www.jerseyshoreonline.com/thejacksontimes',
                                          'https://www.jerseyshoreonline.com/themanchestertimes'),
  "195_www.mycentraljersey.com_.csv": ('http://www.mycentraljersey.com/news/courier-news',),
  "232_www.northjersey.com_.csv": ('https://www.northjersey.com/?fbclid=IwAR3MTLRt4BFrGFRnLoOh4To_lTomR1s4fpjql8CMiCYf9KcT3c1VYNqerLs',),
  "243_www.pressofatlanticcity.com_.csv": ('https://www.pressofatlanticcity.com/currents_gazettes/downbeach',
                                          'https://www.pressofatlanticcity.com/currents_gazettes/egg_harbor_township',
                                          'https://www.pressofatlanticcity.com/currents_gazettes/galloway_township',
                                          'https://www.pressofatlanticcity.com/currents_gazettes/hamilton_township',
                                          'https://www.pressofatlanticcity.com/currents_gazettes/mainland',
                                          'https://www.pressofatlanticcity.com/currents_gazettes/middle_township',
                                          'https://www.pressofatlanticcity.com/currents_gazettes/pleasantville_absecon',
                                          'https://www.pressofatlanticcity.com/currents_gazettes/site',
                                          'https://www.pressofatlanticcity.com/currents_gazettes/upper_township'),
  "315_www.wnyc.org_.csv": ('https://www.wnyc.org/series/new-jersey-public-radio/',)
}

for file_name, urls in links.items():
    # The same filter is applied to the raw export and to the de-duplicated one.
    for path in ("duplicate_articles", "distinct_articles"):
        df = pd.read_csv(f"{path}/duplicate_host/{file_name}")
        # na=False: a row without a url counts as "no match" and is kept, instead
        # of producing a NaN that the ~ inversion cannot handle.
        df1 = df[~df.url.str.startswith(urls, na=False)]
        # NOTE(review): the result is written to {path}/, not back into
        # {path}/duplicate_host/ - confirm this is the intended location.
        df1.to_csv(f"{path}/{file_name}", index=False)

print("*" * 20 + "FINISHED" + "*" * 20)

********************FINISHED********************


> At this point, we have data for each domain as separate CSV files. There are two folders created:<br>
&nbsp;distinct_articles - The original data with duplicates removed.<br>
&nbsp;duplicate_articles - The original data as is

***This should be the starting point for any analysis.***

## Analysis

### How many domains are there in the original dataset used as input?

### How many crawls were done for each domain?

### How many articles were fetched per domain?

> Inputs in this section: Folders _distinct_articles_ and _duplicate_articles_

The caveat here is that, irrespective of the number of times the site was crawled, the content might not be the same.<br>
Another way to think about this is that, because of different crawl dates, the same articles might have been crawled twice.<br>
Essentially, this means that the number of times a site was crawled is not related/proportional to the number of articles.<br>

In [9]:
!rm -rf `find -type d -name .ipynb_checkpoints`

In [21]:
import os
import pandas as pd

def _per_domain_metric(folders, metric):
    """Apply ``metric`` to every CSV file in ``folders``, keyed by a domain label.

    The label is rebuilt from the file name: the leading ``<index>_`` and the
    ``.csv`` suffix are dropped and every remaining ``_`` becomes ``/``.
    Entries that are not CSV files (e.g. checkpoint folders) are skipped.
    A label that occurs in more than one folder keeps the value computed last.
    """
    values = dict()
    for folder in folders:
        for csv_file in os.listdir(folder):
            if not csv_file.endswith(".csv"):
                continue
            label = csv_file.split("_", 1)[1][:-4].replace("_", "/")
            values[label] = metric(pd.read_csv(os.path.join(folder, csv_file)))
    return values


# How many distinct articles were crawled per domain?
paths_to_folders = ['distinct_articles/distinct_host/', 'distinct_articles/duplicate_host/']
count = _per_domain_metric(paths_to_folders, lambda frame: frame.shape[0])
df_articles_count = pd.DataFrame(count, index=[0]).melt(var_name="url", value_name="unique_articles_count")

# How many times was a domain crawled?
paths_to_folders = ['duplicate_articles/distinct_host/', 'duplicate_articles/duplicate_host/']
count = _per_domain_metric(paths_to_folders, lambda frame: frame['crawl_date'].nunique())
df_crawl_count = pd.DataFrame(count, index=[0]).melt(var_name="url", value_name="unique_crawl_dates_count")

print("*" * 20 + "FINISHED" + "*" * 20)

********************FINISHED********************


In [22]:
# Join both per-domain tables on url and write the combined table to disk.
original_metadata = df_articles_count.merge(df_crawl_count, on='url')
original_metadata.to_csv("original_metadata.csv", index=False)

> *Reward received:*<br>
original_metadata.csv containing the domains from the initial 658 that have articles, the number of times each domain was crawled, and the number of unique articles each domain has.

# Cleansed Layer

## Fetch Articles

> Input in this section is the distinct_articles folder

Due to the issues with the articles, we crawl the URLs and get the articles ourselves. We also perform a group by based on the path of the URL, to eliminate duplicates. The algorithm for this section is as follows:
<ol>
    <li>For each domain, follow steps 2 to 5.</li>
    <li>Create a column called 'new_url' from the 'url' column.</li>
        <ul><li>Now that I think about it, this step _might have been_ redundant as 
            I could have made a copy of the url column and achieved the desired result.</li></ul>
    <li>Group the data based on new_url column we just created.</li>
        <ul><li>To make sure we do not lose any data, we store the data in 'crawl_date' and 'url' as comma-separated values.</li></ul>
    <li>Extract the article for each value of the 'new_url' column.</li>
    <li>Save the output to 'temp_results/domain_name.csv'.</li>
</ol>

In [None]:
%%capture
!pip3 install newspaper3k

In [5]:
import os
import glob
import newspaper
import numpy as np
import pandas as pd

from urllib.parse import urlparse

In [2]:
def extract_details(temp):
    """Download and parse the article at ``temp['new_url']`` and store its text.

    Parameters
    ----------
    temp : mapping-like row (e.g. a pandas Series)
        Must provide the key ``'new_url'``; it is modified in place.

    Returns
    -------
    The same ``temp`` object with ``temp['text']`` set to:
    - the article text when download and parsing succeed;
    - the marker string ``"ArticleException"`` when newspaper raises its
      ``ArticleException``;
    - the message of the exception for any other failure.
    """
    try:
        article = newspaper.Article(temp['new_url'])
        article.download()
        article.parse()
        temp['text'] = article.text
    except newspaper.article.ArticleException:
        temp['text'] = "ArticleException"
    except Exception as e:
        # Keep only the message: storing the exception object itself would put
        # a non-string value (and its attached traceback) into the text column.
        temp['text'] = str(e)
    return temp

In [3]:
def small_sites(inp_df, base_path, output_name):
    """Group a domain's rows by URL path, fetch each article and save the result.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Rows of one domain; must contain the columns ``crawl_date`` and ``url``.
        The frame passed in is not modified.
    base_path : str
        Prefix (scheme and host) put in front of each URL's path to build ``new_url``.
    output_name : str
        The result is written to ``temp_results/{output_name}.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``new_url`` with the comma-joined ``crawl_date`` and
        ``url`` values of its group and the ``text`` added by ``extract_details``.
    """
    print(f"Initial Length = {inp_df.shape[0]}")

    # extract URL path; .assign builds a new frame, so the caller's frame stays
    # untouched. url is cast to str as well so missing values cannot break the join.
    url_text = inp_df["url"].astype(str)
    keyed_df = inp_df.assign(
        crawl_date=inp_df['crawl_date'].astype(str),
        url=url_text,
        new_url=url_text.apply(lambda x: base_path + urlparse(x).path),
    )
    grouped_df = (
        keyed_df.groupby('new_url')
        .agg({'crawl_date': ','.join, 'url': ','.join})
        .reset_index()
    )
    print(f"Final Length = {grouped_df.shape[0]}")

    # extract articles
    output_df = grouped_df.apply(extract_details, axis=1)

    # save results
    output_df.to_csv(f"temp_results/{output_name}.csv", index=False)
    # Callers assign the return value, so hand the result back instead of None.
    return output_df

In [4]:
def large_sites(inp_df, base_path, output_name, chunk_size=100):
    """Like ``small_sites``, but fetch and save the articles in fixed-size parts.

    Parameters
    ----------
    inp_df : pandas.DataFrame
        Rows of one domain; must contain the columns ``crawl_date`` and ``url``.
        The frame passed in is not modified.
    base_path : str
        Prefix (scheme and host) put in front of each URL's path to build ``new_url``.
    output_name : str
        Part ``g`` is written to ``temp_results/{output_name}_{g}.csv``.
    chunk_size : int, default 100
        Number of grouped URLs per part; must be at least 1.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    print(f"Initial Length = {inp_df.shape[0]}")

    # extract URL path; .assign builds a new frame, so the caller's frame stays
    # untouched. url is cast to str as well so missing values cannot break the join.
    url_text = inp_df["url"].astype(str)
    keyed_df = inp_df.assign(
        crawl_date=inp_df['crawl_date'].astype(str),
        url=url_text,
        new_url=url_text.apply(lambda x: base_path + urlparse(x).path),
    )
    grouped_df = (
        keyed_df.groupby('new_url')
        .agg({'crawl_date': ','.join, 'url': ','.join})
        .reset_index()
    )
    print(f"Final Length = {grouped_df.shape[0]}")

    # extract articles for each part
    for g, part_df in grouped_df.groupby(np.arange(len(grouped_df)) // chunk_size):
        # Row-wise extraction: each row is fetched from its own new_url (the url
        # column holds a comma-joined list at this point) and receives its own
        # text rather than one value for the whole part.
        part_df = part_df.apply(extract_details, axis=1)
        part_df.to_csv(f"temp_results/{output_name}_{g}.csv", index=False)
        print(f"Finished with part {output_name}_{g}")

In [None]:
# distinct host files, processed in ascending order of their numeric file prefix
# (zero-padding the prefix makes the string sort numeric: 2 comes before 10)
files = sorted(glob.glob('distinct_articles/distinct_host/*.csv'),
               key=lambda x: os.path.basename(x).split('_')[0].zfill(10))
for file in files:
    df = pd.read_csv(file)
    file_name = os.path.basename(file)
    if df.empty:
        # Without a first row there is no URL to derive scheme and host from.
        print(f"Skipped {file_name} (no rows)")
        continue
    base_path = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(df.at[0, 'url']))
    print(f"Started with {file_name}")
    small_sites(df, base_path, file_name[:-4])
    print(f"Finished with {file_name}")

In [None]:
# duplicate host files, processed in ascending order of their numeric file prefix
# (zero-padding the prefix makes the string sort numeric: 2 comes before 10)
files = sorted(glob.glob('distinct_articles/duplicate_host/*.csv'),
               key=lambda x: os.path.basename(x).split('_')[0].zfill(10))
for file in files:
    df = pd.read_csv(file)
    file_name = os.path.basename(file)
    if df.empty:
        # Without a first row there is no URL to derive scheme and host from.
        print(f"Skipped {file_name} (no rows)")
        continue
    base_path = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(df.at[0, 'url']))
    print(f"Started with {file_name}")
    small_sites(df, base_path, file_name[:-4])
    print(f"Finished with {file_name}")

In [3]:
# sanity check to make sure we did not lose any data in between:
# small_sites writes one CSV to temp_results/ per input CSV, so the third count
# should equal the sum of the first two
!ls distinct_articles/distinct_host | wc -l
!ls distinct_articles/duplicate_host | wc -l
!ls temp_results/ | wc -l

218
218
436


> *Reward received:*<br>
temp_results/*.csv containing the articles grouped at the URL level

## Clean Data
This section produces the final set of articles required for text analysis by applying various cleaning rules.

> Inputs in this section are the files from 'temp_results' folder.

In [1]:
import glob
import pandas as pd

In [2]:
texts = [
    "All Categories Adult Community Apartment Commercial/Investment Condo Co-op Horse Farms Land for Sale Luxury Home Manufactured Mobile Multi-Family Rental Senior Adult Care Single Family Townhouse Vacation Homes All Towns Asbury Park Barnegat/Waretown Basking Ridge Bayonne Belmar/Lake Como Berkeley Heights Bernardsville & Bedminster Bloomfield Bordentown Bridgewater/Raritan Camden Chatham Clark Coral Springs Cranford Denville Doylestown East Brunswick East Hanover/Florham Park East Orange/Orange Edison Elizabeth Fair Lawn/Glen Rock Flemington/Raritan Franklin Township Greater Olean Hackensack Hackettstown Halston Media Hamilton/Robbinsville Hasbrouck Heights/Wood-Ridge/Teterboro Hawthorne Hazlet & Keyport Hillsborough Hillside Hoboken Holmdel & Colts Neck Jersey City Katonah/Lewisboro Kenilworth Linden Little Egg Harbor & Tuckerton Livingston Madison Mahopac Middletown Millburn/Short Hills Milltown/Spotswood Montclair Montville Morristown Mountainside Mount Laurel Newark New Brunswick New Providence Newton North Plainfield/Green Brook/Watchung North Salem Nutley Paramus Parkland Parsippany Passaic Valley Paterson Phillipsburg Piscataway Plainfield Princeton Rahway Randolph Raritan Bay Red Bank Ridgewood Roselle Roselle Park Roxbury Scotch Plains/Fanwood SOMA Somers Somerville South Brunswick South Plainfield Sparta Springfield Stafford/LBI Summit Testville Union Verona/Cedar Grove Waltham Warren Wayne West Essex Westfield West Orange Woodbridge/Carteret Yorktown",
    "You must be logged in to access this page. If you don't have an account, you can sign up using the links below the login form.",
    "All Categories Antiques Apartments for Rent Autos Business Childcare Children clothes Community Computer/iPhone Training Editing/Proofreading Education Environment Estate Sale Financial Industry Food & Drink For Sale Free Furniture for Sale Garage Sale General Manager Geriatric Home care Health and Wellness Help Wanted Hiring/Jobs Home-Based Business Home Services Lawyer Looking for... Lost and found Musical Instruments Music Lessons Nanny Office Share Open House Outdoor Participants Wanted Part Time Jobs Pets Public Notices Real estate Rentals Seminars Senior Adult Care Services Offered Tickets Toys Tutoring Vacation Rental Volunteer All Towns Asbury Park Barnegat/Waretown Basking Ridge Bayonne Belmar/Lake Como Berkeley Heights Bernardsville & Bedminster Bloomfield Bordentown Bridgewater/Raritan Camden Chatham Clark Coral Springs Cranford Denville Doylestown East Brunswick East Hanover/Florham Park East Orange/Orange Edison Elizabeth Fair Lawn/Glen Rock Flemington/Raritan Franklin Township Greater Olean Hackensack Hackettstown Halston Media Hamilton/Robbinsville Hasbrouck Heights/Wood-Ridge/Teterboro Hawthorne Hazlet & Keyport Hillsborough Hillside Hoboken Holmdel & Colts Neck Jersey City Katonah/Lewisboro Kenilworth Linden Little Egg Harbor & Tuckerton Livingston Madison Mahopac Middletown Millburn/Short Hills Milltown/Spotswood Montclair Montville Morristown Mountainside Mount Laurel Newark New Brunswick New Providence Newton North Plainfield/Green Brook/Watchung North Salem Nutley Paramus Parkland Parsippany Passaic Valley Paterson Phillipsburg Piscataway Plainfield Princeton Rahway Randolph Raritan Bay Red Bank Ridgewood Roselle Roselle Park Roxbury Scotch Plains/Fanwood SOMA Somers Somerville South Brunswick South Plainfield Sparta Springfield Stafford/LBI Summit Testville Union Verona/Cedar Grove Waltham Warren Wayne West Essex Westfield West Orange Woodbridge/Carteret Yorktown",
    "You May Also Be Interested In",
    "NEW JERSEY — The IRS is taking notice of the impact inflation is having on taxpayers. As a result, your take-home pay may go up next year, and you may be able to set aside more for retirement. The IRS has boosted the thresholds for all seven federal income tax brackets, applying to tax year 2023 ... Read more »",
    """By TAPINTO STAFF NEWARK/TRENTON, NJ — The FBI says it has found the person responsible for making a "broad threat" against New Jersey synagogues and that person "no longer poses a danger to the community." On Twitter, the FBI's Newark Office released a statement, saying "Upon receipt of threat information against an unspecified New Jersey area synagogue, the FBI notified ...""",
    "The Patch Local Directory is the best way to get your business seen on the Patch. Basic listings are free. With your listing you also gain access to post classifieds and events. For information about our premium listing features please click here to contact our sales team.",
    "Whether you’re a small business or a national chain, advertising on Patch enables you to target a relevant audience, build community trust in your business, and increase website traffic to boost brand awareness. Fill out the below form and our team will contact you to customize a plan specifically for your business that meets your advertising goals.",
    "We're Sorry! The page you were looking for could not be found. You may be able to find it by returning to the homepage or by visiting our search page. We apologize for the inconvenience.",
    "0 This post was contributed by a community member. The views expressed here are the author's own.",
    "New Jersey Date Night is a series on NJ Flavor by relationship writer Craig Rogers. New Jersey is a state known for firsts, from the first college football game to the first US town to have electricity. One of the places where lots of firsts happened is the oldest township in our state ... Read more »",
    "Copy and paste this URL into your WordPress site to embed",
    "By signing up, you agree to TAPinto's Terms of Use and Privacy Policy",
    "According to recent data, 85% of Americans own a smartphone**. Smartphones are routinely used to publish pictures and video via websites and apps, which enables users to share their participation in an event, presence at a location, or highlight a relationship or connection with other people. The ... Read more »",
    "For decades, New Jersey has been kicking the can down the road with regard to infrastructure. Well, as of now even that road is in such bad shape, that the can can’t be kicked any further. The American Society of Civil Engineers gave New Jersey a failing grade of D+. Almost 10% of our bridges ... Read more »",
    "For decades, New Jersey has been kicking the can down the road with regard to infrastructure. Well, as of now even that road is in such bad shape, that the can can’t be kicked any further. The American Society of Civil Engineers gave New Jersey a failing grade of D+. Almost 10% of our bridges ... Read more »",
    "I often say that NJ TRANSIT has a commitment not just to the customers who ride our trains and buses, but also to the communities we serve. When it comes to climate change, that community is global — and NJ TRANSIT is working to change the way we work and deliver our services to reduce our impact ... Read more »",
    "A Delaware County man was charged with trespassing after he ran onto the field at Citizens Bank Park during game five of the 2022 World Series on Thursday, Nov. 3, police said. Luke Lulevitch, 20, of Nether Providence Township, was arrested by Philadelphia police after a brief jaunt through the ou… Read More",
    "A Hunterdon County school board candidate who is running unopposed appeared on Twitter donning a costume he wore to a Halloween event — a shirt that said, “where is Nancy?” accompanied by a skeleton and his wife in a zombie suit, NJ.com reports. “These people walk among us. Not good,” reads the ca… Read More",
    "A Jersey City redevelopment project is one step closer to breaking ground since the $70 million land sale was finalized last month. Represented by GRID Real Estate, The Albanese Organization is planning on bringing 670 residential units to the 1.83-acre parcel at 286 Cole St., which had previ… Read More",
    """Acceptance of terms of use and amendments Each time you use or cause access to this web site, you agree to be bound by these Terms of use, as amended from time to time with or without notice to you. In addition, if you are using a particular service on this web site or accessed via this web site, you will be subject to any rules or guidelines applicable to those services, and they will be incorporated by reference within these Terms of use. Please read the site's Privacy policy, which is incorporated within these Terms of use by reference. The site editor's service This web site and the services provided to you on and via this web site are provided on an "AS IS" basis. You agree that the site editor reserves the right to modify or discontinue provision of this web site and its services, and to remove the data you provide, either temporarily or permanently, at any time, without notice and without any liability towards you, The site editor will not be held responsible or liable for timeliness, removal of information, failure to store information, inaccuracy of information, or improper delivery of information. Your responsibilities and registration obligations In order to use this web site or certain parts of it, you may be required to register for a user account on this web site; in this case, you agree to provide truthful information when requested, and -- if a minimum age is required for eligibility for a user account -- you undertake that you are at least the required age. By registering for a user account, you explicitly agree to this site's Terms of use, including any amendments made by the site editor that are published herein. Privacy policy Registration data and other personally identifiable information that the site may collect is subject to the terms of the site editor's Privacy policy. 
Registration and password You are responsible for maintaining the confidentiality of your password, and you will be responsible for all usage of your user account and/or user name, whether authorized or not authorized by you. You agree to immediately notify the site editor of any unauthorized use of your user account, user name or password. Your conduct You agree that all information or data of any kind, whether text, software, code, music or sound, photographs or graphics, video or other materials ("content"), made available publicly or privately, will be under the sole responsibility of the person providing the said content, or of the person whose user account is used. You agree that this web site may expose you to content that may be objectionable or offensive. The site editor will not be responsible to you in any way for content displayed on this web site, nor for any error or omission. By using this web site or any service provided, you explicitly agree that: (a) you will not provide any content or conduct yourself in any way that may be construed as: unlawful; illegal; threatening; harmful; abusive; harassing; stalking; tortious; defamatory; libelous; vulgar; obscene; offensive; objectionable; pornographic; designed to interfere with or disrupt the operation of this web site or any service provided; infected with a virus or other destructive or deleterious programming routine; giving rise to civil or criminal liability; or in violation of an applicable local, national or international law; (b) you will not impersonate or misrepresent your association with any person or entity; you will not forge or otherwise seek to conceal or misrepresent the origin of any content provided by you; (c) you will not collect or harvest any information about other users; (d) you will not provide, and you will not use this web site to provide, any content or service in any commercial manner, or in any manner that would involve junk mail, spam, chain letters, pyramid schemes, or 
any other form of unauthorized advertising or commerce; you will not use this web site to promote or operate any service or content without the site editor's prior written consent; (e) you will not provide any content that may give rise to the site editor being held civilly or criminally liable, or that may be considered a violation of any local, national or international law, including -- but not limited to -- laws relating to copyrights, trademarks, patents, or trade secrets. Submission of content on this web site By providing any content to this web site: (a) you agree to grant the site editor a worldwide, royalty-free, perpetual, non-exclusive right and license (including any moral rights or other necessary rights.) to use, display, reproduce, modify, adapt, publish, distribute, perform, promote, archive, translate, and to create derivative works and compilations, in whole or in part. to use, copy, modify, transmit, sell, exploit, create derivative works from, distribute, and/or publicly perform or display such material, in whole or in part, in any manner or medium (whether now known or hereafter developed), for any purpose that we choose. Such license will apply with respect to any form, media, technology already known at the time of provision or developed subsequently. Also, in connection with the exercise of these rights, you grant us, and anyone authorized by us, the right to identify you as the author of any of your postings or submissions by name, email address or screen name, as we deem appropriate. You understand that the technical processing and transmission of the Site, including content submitted by you, may involve transmissions over various networks, and may involve changes to the content to conform and adapt it to technical requirements of connecting networks or devices. You will not receive any compensation of any kind for the use of any materials submitted by you. 
(b) you warrant and represent that you have all legal, moral, and other rights that may be necessary to grant the site editor the license specified in this section 7; (c) you acknowledge and agree that the site editor will have the right (but not obligation), at the site editor's entire discretion, to refuse to publish, or to remove, or to block access to any content you provide, at any time and for any reason, with or without notice. Third-party services Goods and services of third parties may be advertised and/or may be made available on or through this web site. Representations made regarding products and services provided by third parties will be governed by the policies and representations made by these third parties. The site editor will not in any manner be liable for or responsible for any of your dealings or interaction with third parties. Indemnification You agree to indemnify and hold harmless the site editor and the site editor's representatives, subsidiaries, affiliates, related parties, officers, directors, employees, agents, independent contractors, advertisers, partners, and co-branders, from any claim or demand, including reasonable legal fees, that may be filed by any third party, arising out of your conduct or connection with this web site or service, your provision of content, your violation of these Terms of use, or any other violation by you of the rights of another person or party. DISCLAIMER OF WARRANTIES YOU UNDERSTAND AND AGREE THAT YOUR USE OF THIS WEB SITE AND OF ANY SERVICES OR CONTENT PROVIDED (THE "SERVICE") IS AT YOUR OWN RISK. SERVICES AND CONTENT ARE PROVIDED TO YOU "AS IS", AND THE SITE EDITOR EXPRESSLY DISCLAIMS ALL WARRANTIES OF ANY KIND, EITHER IMPLIED OR EXPRESS, INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. 
THE SITE EDITOR MAKES NO WARRANTY, EITHER IMPLIED OR EXPRESS, THAT ANY PART OF THE SERVICE WILL BE UNINTERRUPTED, ERROR-FREE, VIRUS-FREE, TIMELY, SECURE, ACCURATE, RELIABLE, OR OF ANY QUALITY, NOR IS IT WARRANTED EITHER IMPLICITLY OR EXPRESSLY THAT ANY CONTENT IS SAFE IN ANY MANNER FOR DOWNLOAD. YOU UNDERSTAND AND AGREE THAT NEITHER THE SITE EDITOR NOR ANY PARTICIPANT IN THE SERVICE PROVIDES PROFESSIONAL ADVICE OF ANY KIND AND THAT ANY ADVICE OR ANY OTHER INFORMATION OBTAINED VIA THIS WEB SITE MAY BE USED SOLELY AT YOUR OWN RISK, AND THAT THE SITE EDITOR WILL NOT BE HELD LIABLE IN ANY WAY. Some jurisdictions may not allow disclaimers of implied warranties, and certain statements in the above disclaimer may not apply to you as regards implied warranties; the other terms and conditions remain enforceable notwithstanding. LIMITATION OF LIABILITY YOU EXPRESSLY UNDERSTAND AND AGREE THAT THE SITE EDITOR WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, CONSEQUENTIAL OR EXEMPLARY DAMAGES; THIS INCLUDES, BUT IS NOT LIMITED TO, DAMAGES FOR LOSS OF PROFITS, GOODWILL, USE, DATA OR OTHER INTANGIBLE LOSSES (EVEN IF THE SITE EDITOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES), RESULTING FROM (I) THE USE OF SERVICES OR THE INABILITY TO USE SERVICES, (II) THE COST OF OBTAINING SUBSTITUTE GOODS AND/OR SERVICES RESULTING FROM ANY TRANSACTION ENTERED INTO ON THROUGH SERVICES, (III) UNAUTHORIZED ACCESS TO OR ALTERATION OF YOUR DATA TRANSMISSIONS, (IV) STATEMENTS BY ANY THIRD PARTY OR CONDUCT OF ANY THIRD PARTY USING SERVICES, OR (V) ANY OTHER MATTER RELATING TO SERVICES. In some jurisdictions, it is not permitted to limit liability and, therefore, such limitations may not apply to you. 
Reservation of rights The site editor reserves all of the site editor's rights, including but not limited to any and all copyrights, trademarks, patents, trade secrets, and any other proprietary right that the site editor may have in respect of this web site, its content, and goods and services that may be provided. The use of the site editor's rights. and property requires the site editor's prior written consent. By making services available to you, the site editor is not providing you with any implied or express licenses or rights, and you will have no rights to make any commercial use of this web site or provided services without the site editor's prior written consent. Notification of copyright infringement If you believe that your property has been used in any way that could be considered a copyright infringement or a violation of your intellectual property rights, the site editor's copyright agent may be contacted via: copyright@dailyvoice.com Applicable law You agree that these Terms of use and any dispute arising out of your use of this web site or products or services provided will be governed by and construed in accordance with local laws applicable at the site editor's domicile, notwithstanding any differences between the said applicable legislation and legislation in force at your location. By registering for a user account on this web site, or by using this web site and the services it provides, you accept that jurisdiction is granted to the courts having jurisdiction over the site editor's domicile, and that any disputes will be heard by the said courts.""",
    "Already reeling from the death of a local firefighter following an intense house blaze, the community of area responders sadly learned that they'd lost another. Upper Saddle River Firefighter Alex Moss, 54, had dedicated more than 30 years of his life to the service of others. Colleagues official… Read More",
    "Content Producer Paul Milo has worked as a reporter in northern New Jersey for more than 20 years, first for North Jersey Media Group / NorthJersey.com, then at Patch.com and at NJ.com / The Star-Ledger. He has written about a 106-year-old World War I veteran, the Sept. 11 attacks and a 15-year-old stock manipulator who was the youngest person ever civilly charged by the Securities and Exchange Commission, among many other stories. He has interviewed governors, presidential candidates and celebrities including Ryan Seacrest and comedian Kevin Hart. In addition to his work for New Jersey-based outlets, he has also written for BeliefNet, Editor and Publisher, The Huffington Post, The Boston Globe and other publications, as well as freelance assignments for global chemical company BASF. His first book, Your Flying Car Awaits, was published by HarperCollins in 2010. As part of the book’s promotional efforts he was interviewed by The New York Times and on WNYC radio’s The Brian Lehrer Show.",
    """Daily Voice takes your right to privacy seriously, and we want you to feel comfortable using this web site. This privacy policy deals with personally-identifiable information (referred to as "data" below) that may be collected by this site. This policy does not apply to other entities that are not owned or controlled by Daily Voice (and we suggest that you review those sites' policies to learn more about their privacy practices), nor does it apply to persons that are not employees or agents of Daily Voice, or that are not under Daily Voice's control. Daily Voice is based in the United States. If you voluntarily submit Personal Information to us while located or residing in a country outside the United States, you understand that you are voluntarily agreeing to the use and processing of such information as provided in this Privacy Policy, including transfer of your information to the United States, a jurisdiction which may not offer the same framework for the protection of personal information as the jurisdiction in which you are located. Please take time to read this site's Terms of use . Collection of data Certain visitors to this website interact in ways that require Daily Voice to gather personally-identifying information. The amount and type of information that Daily Voice gathers depends on the nature of the interaction. For example, registration for an account on this site requires a valid e-mail address. Please be aware that the e-mail address you provide and any other information you enter may render you personally identifiable, and may possibly be displayed on this web site intentionally (depending on choices you make during the registration process, or depending on the way in which the site is configured) or unintentionally (subsequent to a successful act of intrusion by a third party). 
As on many web sites, Daily Voice may also automatically receive general information that is contained in server log files, such as your IP address and cookie information. Information about how advertising may be served on this site is set forth below. When you use Daily Voice, we may automatically recognize information like your IP address, what type of device you're using, high-level location information (e.g., city or state), and your user behavior (such as how long you stay on the website and which links you click on). Some of the third-party vendors that we use to provide services for Daily Voice (e.g. analytics) may also use their own tracking technologies on our digital properties to help us run and optimize our platforms. These third parties may collect information about your online activities over time and across the Daily Voice platform and other online properties. Users who wish to opt out of Google Analytics data collection may use Google's own browser add-on to do so. Use of data Data may be used to customize and improve your user experience on this site, to allow us to better respond to your customer service inquiries, to administer contests, promotions, surveys or other site features, or to send you periodic emails or other communications. 
Efforts will be made to prevent your data being made available to third parties unless (i) provided for otherwise in this Privacy Policy; (ii) your consent is obtained, such as when you choose to opt-in or opt-out for the sharing of data; (iii) a service provided on our site requires interaction with a third party, or is provided by a third party, such as an application service provider; (iv) pursuant to legal action or law enforcement; (v) it is found that your use of this site violates Daily Voice's policy, Terms of use, or other usage guidelines, or if it is deemed reasonably necessary by Daily Voice to protect Daily Voice's legal rights and/or property; or (vi) this site is purchased by a third party, in which case that third party will be able to use the data in the same manner as set forth in this policy. In the event you choose to use links displayed on this web site to visit other web sites, you are advised to read the privacy policies published on those sites. Note that aggregated and non-personally identifiable visitor information may be provided to other parties for marketing, advertising, or other uses. Cookies Like many web sites, this web site sets and uses cookies to enhance your user experience -- to remember your personal settings, for instance. Advertisements may display on this web site and, if so, may set and access cookies on your computer; such cookies are subject to the privacy policy of the parties providing the advertisement. However, the parties providing the advertising do not have access to this site's cookies. These parties usually use non-personally-identifiable or anonymous codes to obtain information about your visits to this site. If you disable cookies in your browser, some features of the site experience may not function properly. Third-party links Occasionally, at Daily Voice's discretion, we may include or offer third-party products or services on our website. 
These third-party sites have separate and independent privacy policies. The tracking technologies used by such third parties are subject to their own privacy policies, and we are not involved in or responsible for their activities or use of any data they collect. We therefore have no responsibility or liability for the content and activities of these linked sites. Nonetheless, we seek to protect the integrity of our site and welcome any feedback about these sites. Google We use Google Analytics and Google advertising products on our website. Google's advertising requirements can be summed up by Google's Advertising Principles. They are put in place to provide a positive experience for users. See: https://support.google.com/adwordspolicy/answer/1316548?hl=en. Google, as a third-party vendor, uses cookies to serve ads on our site. Google's use of cookies enables it to serve ads to our users based on previous visits to our site and other sites on the Internet. Users may opt-out of the use of Google cookies by visiting the Google Ad and Content Network privacy policy. We have implemented the following: (a) Remarketing with Google AdSense (b) Google Display Network Impression Reporting (c) Demographics and Interests Reporting (d) DoubleClick Platform Integration Daily Voice, along with third-party vendors such as Google use first-party cookies (such as the Google Analytics cookies) and third-party cookies (such as the DoubleClick cookie) or other third-party identifiers together to compile data regarding user interactions with ad impressions and other ad service functions as they relate to this website. Users can set preferences for how Google advertises to you using the Google Ad Settings page. Alternatively, you can opt out by visiting the Network Advertising Initiative Opt Out page or by using the Google Analytics Opt Out Browser add on. 
LiveRamp When you use our website, we share information that we collect from you, such as your email (in hashed form), IP address or information about your browser or operating system, with our partner/service provider, LiveRamp Inc. LiveRamp returns an online identification code that we may store in our first-party cookie for our use in online and cross-channel advertising and it may be shared with advertising companies to enable interest-based and targeted advertising. To opt out of this use, please click here. Common ID Daily Voice uses a module called Common ID, which stores a unique ID for each user on our sites. This ID may be provided to advertising partners. Daily Voice allows its users to opt out of Common ID if they choose. To opt out of Common ID, click here. Information Security and Storage Daily Voice uses services that feature industry-standard digital, and training-based security measures to protect user information against theft and misuse. All Daily Voice websites are protected by Secure Sockets Layer (SSL) security features. We May Share Information in Limited Circumstances (a) With Vendors. At the present time, Daily Voice does not share any of its viewers' Personal Information with any vendors, except as we discuss above in section 5. (b) For Mailing Lists. We may occasionally share users' email addresses with carefully selected businesses and organizations. If you do not want your address exchanged in this way, please contact us at the address or email address set forth below to ask that your name and address be removed from any shared mailing lists. It is Daily Voice policy not to sell or exchange email addresses with any other company or organization except with content partners. (c) As Required by Law. 
If we determine on a good faith basis that we are required by law (e.g., pursuant to a subpoena or judicial order) to share your personal or other information, we will endeavor to share it only to the extent we determine is reasonably necessary to comply with the request. Minors Daily Voice might not allow persons who are aged thirteen or younger to become members of this site. For more information, please contact the site administrator. Retaining Your Information Because we value the longevity of our relationships with our supporters, we do generally retain user information in our database. If you would like to access, transfer, review, or correction of or to your Personal Information, or to request that it be deleted or restricted in use, please contact us — depending on your location or country of residence, you may have a legal right to make these requests and to contact supervisory authorities if you are not satisfied with our response. Editing or deleting your account information You are provided with the ability to edit the information stored for your user account information during registration, by visiting your user account control panel. You can request that your user account be deleted; to do so, please contact the site administrator. Content or other data that you may have provided, and that is not stored within your user account, such as articles published, may continue to remain on the site at Daily Voice's discretion, even after your user account is deleted. Please see the site's Terms of use for more information. Changes to this privacy policy Changes may be made to this policy from time to time, and in Daily Voice's sole discretion. Daily Voice encourages visitors to frequently check this page for any changes to its privacy policy. Your continued use of this site after any change in this privacy policy will constitute your acceptance of such change. 
NO GUARANTEES While this privacy policy states standards for maintenance of data, and while efforts will be made to meet the said standards, Daily Voice is not in a position to guarantee compliance with these standards. There may be factors beyond Daily Voice's control that may result in disclosure of data. Consequently, Daily Voice offers no warranties or representations as regards maintenance or non-disclosure of data. Notification in case of a breach We will comply with all pertinent laws that require us to notify you in the event your personal data is lost, stolen, or inadvertently disclosed.""",
    """Eight styles of women’s shoes are being recalled across the globe by Clarks due to concerns about toxins inside the product, the retailer announced. The US Consumer Product and Safety Commission (CPSC) issued an alert regarding a recall of approximately 113,000 shoes under the "Breeze" style name … Read More""",
    """High pressure will bring dry and mild weather for the remainder of the week and into the start of the "Fall Back" weekend, according to the National Weather Service. There will be sunshine and warmer-than-normal temperatures on Wednesday, Nov. 2, with highs in the mid to upper 60s and calm w… Read More""",
    "It took four years for doctors to figure out what was wrong with Jasmin Perdomo. And still, that didn't solve her problem. Perdomo was 27 and living in Puerto Rico when symptoms presented …",
    "Notification Push Notifications Notification Sound Notification permissions are disabled. Enable them in your device's settings.",
    "One of America's top-touring illusionists will be making an appearance at the Westfield Garden State Plaza. David Caserta will be hosting his Halloween-themed show Haunted Illusions from 5 p.m. 7 p.m. on Monday, Oct. 31. The show will be followed by a Mall-Wide Trick-or-Treat. The free and safe e… Read More",
    "Reporter Valerie Musson is a passionate and enthusiastic digital newswriter and journalist from Rochester, New York. She graduated from the State University of New York at Fredonia with a bachelor’s degree in Journalism. With past experience in video production, digital marketing and SEO, Valerie’s work has been published across a host of different publications. She has written for The Rockland County Times, CNY Vision, and Freelance Weekly. She has also produced marketing content for Mention, Small Business Marketing Tools, Green and Clean Mom, My Boat Life, and many other sites. For more from Valerie, visit her professional writing portfolio at valeriemusson.contently.com.",
    "Senior Content Editor Born in Newark and raised in Hudson County, Jerry DeMarco is a 38-year veteran of the news industry and a renowned breaking news reporter. He has covered local, county and state police, as well as the FBI, ATF and other federal agencies. DeMarco joined Daily Voice in 2015 after six years running Cliffview Pilot, one of the nation's few independent breaking news sites. As an independent publisher, he helped establish Local Independent Online News, a national trade organization. Prior to launching his own site, DeMarco served as a reporter for the Bergen Record, Stamford Advocate, Rockland Journal News, News Tribune, and Trenton Times. DeMarco has won national recognition for his investigative work, including Clarion, Heywood Broun and Deadline Club awards. He's changed public policy and managed several major projects -- including one that helped put a corrupt public official in federal prison and another that forced a sitting New Jersey attorney general from office for her participation in a Bergen County traffic stop involving her boyfriend. DeMarco regularly lectures at a graduate course in media relations at Fairleigh Dickinson University and previously was an adjunct professor at Lehman College in the Bronx. He has a Bachelor's Degree in English from St. Peter's University in Jersey City.",
    "Thank you for sharing your views with Daily Voice. We encourage free expression, regardless of viewpoint, but require that such expressions be respectful of other commenters and relevant to the stories on which they are posted. We reserve the right to remove, hide from public view, or edit any comment for any reason. In particular, you may not submit a comment that is abusive, defamatory, disrespectful, illegal, offensive or disparaging (whether on the basis of disability, ethnicity, gender, nationality, race, religion, sexual orientation, traits with which people are born, or otherwise). You also may not submit a comment that attacks or threatens another person, threatens or promotes violence, wishes for harm to befall another person, invades another person’s privacy or proprietary rights, or uses expletives (including veiled profanity). You may not stalk or harass another person, dominate the conversation, discourage participation by others, or mock, bait, bicker, taunt, or belittle others. Please observe the following simple rules: Please be civil in addressing and referencing other Daily Voice commenters and stay focused on the subject at hand. Objectionable content is forbidden. Such comments can be reported by clicking on the report button found on every comment. For more on what we consider objectionable, refer to section 7. Daily Voice reserves the right to remove any comment for any reason. Daily Voice may also bar individuals who break our commenting rules. Any comments you post and your use of comments are subject to the dailyvoice.com Terms of Use. Comments with spam or self-promotional links are not allowed. Self-promotional links are permitted on your profile page. Your comments and profile may not imply any connection to any person or organization to which you are not, in fact, connected. Comments in all-capital letters, all-bold, or all-italic postings, excessive blank space and objectionable profile photos are not permitted. 
We will also hide from view any comments made by a reader suspected of using a fake name or of creating multiple accounts.",
    "The Newark police officer who was shot in the neck while tracking down a shooting suspect has been released from the hospital after a two-day stay. Just before being wheeled out the front doors into the crowd, Johnny Aquino was greeted by fellow Newark officer Jabril Paul, who was shot in the leg … Read More",
    "The ex-Phillipsburg councilman accused of trafficking the social security numbers, banking info, and other personal details of at least seven of his coworkers has vehemently denied any wrongdoing, calling the accusations an “egregious overreach and abuse of power” involving Mayor Todd Tersigni. Ro… Read More",
    "The man found moaning and dangling from a power line tower in Pennsylvania Friday, Nov. 4 was trying to steal wire when he suffered an electric shock so powerful that it threw him from his perch, authorities told various news outlets.The man was cutting 100 feet of wire from the top of a tower that he'd secured himself to in Plainfield Township, when he suffered an arc that traveled ...",
    "Writer Vira Mamchur Schwartz is the quintessential jack-of-all-trades... and master of a few of them. Vira’s editorial career started in technology publishing, writing and editing at the Ziff-Davis publication Computer Shopper. Later she served as Managing Editor at CD-ROM World and then Internet World, before moving to freelance writing. Her articles have appeared in the New York Times, L.A. Times, Folio, Portable Computing, Internet Shopper and other websites and magazines that no longer exist. She ghost-wrote and designed an award-winning diet and fitness book for a popular fitness personality and, currently, loves to tell stories about people and animals.",
    "“Downton Abbey” carried us away to a different time where we could imagine ourselves in white-tie formality or in elegant gowns with elbow-length-gloved hands holding martinis amongst royalty in a grand home’s halls. The martinis (or any drink), of course, should be made with Highclere Castle Gin, a brand launched in 2019 in partnership with the Earl and Countess of Carnarvon—owners of ...",
    "By CHILD CARE CENTERS CAN BEGIN APPLYING FOR GRANTS OF UP TO $200,000 ON NOVEMBER 15, 2022 BLOOMFIELD, NJ - The New Jersey Economic Development Authority (NJEDA) will begin accepting applications for grants from the $54.5 million New Jersey Child Care Facilities Improvement Program on Tuesday, November 15, 2022, First Lady Tammy Murphy announced on Wednesday. Building on the Murphy Administration’s comprehensive strategy to support the state’s vital child care ...",
    "Edison in the Hood by Nadia UddinIf you had the opportunity to ask the dearly departed questions, t… Read More",
    "MADISON, NJ - How we long for the days of close family ties, of joy and forgiveness, of connecting in real time with our friends and family. And who knew we would be living in a world of masks and distance, when just trying to feel ‘normal’ is such a challenge? Thanks to Bonnie Monte and The ... Read more »",
    "More than ever, success in business is determined by reach in your community. Daily Voice prov… Read More",
    "NEW JERSEY — For the first time ever, the average price of a gallon of unleaded gas in New Jersey is over $5. AAA says drivers are paying $5.01 on Tuesday, June 7, 2022. That's up 3 cents from yesterday and nearly $2 from this time last year. AAA says the rising cost of oil and high demand for ... Read more »",
    "Rivertowns, NY | News | Mar 2019 Every parent wants to crack the code on their picky eaters — without having to force-feed them. Here are some techniques that will help.",
    "By NEW JERSEY CLEAN COMMUNITIES COUNCIL Do you have a great idea to help reduce the amount of single-use plastics in New Jersey? If so, the New Jersey Clean Communities Council (NJCCC) wants to hear from you. The NJCCC, a statewide non-profit litter abatement organization serving 21 counties and 558 municipalities, is rolling out a small grant program for New Jersey-based non-profit organizations to support education and outreach ...",
    "In 1987, two men put on a show for the ages in front of more than 90,000 people. In 1989, a father and son saw that show for the first time and never forgot it. Parenting is full of moments. Some good, some not so much. As parents, we are trying to make the most of our time with our kids. Read more »",
    """RAHWAY, NJ — David Brighouse gives back to a place that has given so much to him. "I feel like we have an obligation to give back," Brighouse said. "I owe something to the community." He was born in Rahway — at Rahway Hospital, when babies were born there — and raised there too. Brighouse was ... Read more »""",
    "Senior Reporter Equal parts Lois Lane, J. Jonah Jameson and Vicki Vale, Zak! has traveled from coast-to-coast covering small and big town community news in both in print and online for nearly a decade since his graduation from the award-winning Roy H. Park School of Communications at Ithaca College in 2010. Before settling into the Hudson Valley, Zak! previously spent time in the New York Daily News newsroom and worked for hyperlocal digital news outlets on Long Island. He wrote sports for The Ithacan and The Ithaca Journal before taking his talents to the Pacific Northwest, where he served as the sports editor at the Shoshone News-Press until heading back east to work with Daily Voice.",
    "The man found moaning and dangling from a power line tower in Pennsylvania Friday, Nov. 4 was trying to steal wire when he suffered an electric shock so powerful that it threw him from his perch, authorities told various news outlets. The man was cutting 100 feet of wire from the top of a tower th… Read More",
    "1 This post was contributed by a community member. The views expressed here are the author's own.",
    "Cats in the Navy by Scot Christenson. (Naval Institute Press, 2022) Cats in the Navy is one of those unusual little gems that has appeal for several types of people. If you are a cat lover, check! This book is for you! If you enjoy historical photographs, check! You might want a copy of ... Read more »",
    "TRENTON, NJ — New Jersey has announced five new monkeypox vaccination sites in Hudson, Middlesex, Morris and Passaic counties. The locations are for residents without a confirmed exposure who think they may have been exposed or are at a high risk for having been exposed to the virus. There are ... Read more »",
    "By MARK J. BONAMO NEWARK, NJ — The Roman Catholic Archbishop of Newark will be speaking at an upcoming interfaith conference about a subject that a sorely divided society debates daily — forgiveness. Cardinal Joseph Tobin will join Jewish, Muslim, Protestant, and Hindu clergy at the event, entitled Faith in Action: An Interfaith Conference on Monday, Oct. 24, at Kean ...",
    "Music has the incredible power of boosting both emotional and physical health – and that’s just by listening! If you are a musician and music is your livelihood, you reap additional benefits. Playing music provides a total brain workout that strengthens many brain functions.Regardless of which instrument you play, there is a repetitive nature that comes with playing music, so it’s ...",
    "Our Son, Steve Jr! She graduated from SUNY Purchase College, Magna Cum Laude, with her CS Degree! Can't wait to see what the future holds for him! We wish you the best of luck and success and sincere congratulations on your graduation and commencement! Love from Mom, and Dad. Read more »",
    "Across America, US | News | Jan 2018 Valentine's Day isn't just a holiday to swoon over the ladies — treat the number one man in your life to these fun and practical gifts.",
    "Before you pack your bags and head out to the nearest beach or snow-capped mountain, take a minute to think about what your plumbing will do while you're gone. Will it stay in perfect working order? Will it run up your water bill? Put down the suitcase and run through this list to put your plumbing ... Read more »",
    "By TONY GALLOTTO NEWARK, NJ — Lakeland Bank has made a $13 million deal with federal authorities to avoid prosecution for alleged discrimination against people of color who tried to get mortgages, loans and credit in Newark and minority areas in three New Jersey three counties. This settlement coincides with news that Provident Bank, based in Woodbridge, would acquire all Lakeland Bancorp’s ...",
    "Edison in the Hood by Nadia Uddin If you had the opportunity to ask the dearly departed questions, to retract those last words said in anger, to tell them you love them, would you? Can tec…",
    "NEW PROVIDENCE, NJ — With well more than 20 million local, engaged readers in the past year and a network of nearly 100 online local news and digital marketing platforms in New Jersey, New York, Florida and Pennsylvania, TAPinto can help your business reach the right audience. More than 240,000 ... Read more »",
    "When it came to naming Notre Dame High’s new turf soccer field in honor of Mike Perone, there was no decision to be made. It was more automatic than a penalty kick with no goalie defending.",
    """As chief meteorologist, my mission is to provide the most honest, accurate weather forecasts possible. I grew up in Jackson, where my weather obsession earned me the nickname "Weatherman Dan." I earned my atmospheric science degree from Cornell University. My career as a TV meteorologist included stops in Oklahoma, in Central New York and at News 12 New Jersey. I've also done research and programming work for the Northeast Regional Climate Center and NJ State Climate Office. I am proud to hold the American Meteorological Society's prestigious Certified Broadcast Meteorologist designation. I live in Union County with my wife, Amy, and young sons, Jackson, Griffin, and Nathan. Things I love: family, wine, storm chasing, Broadway musicals and Disney World. Follow me on Facebook ("Meteorologist Dan Zarrow") and Twitter (@DanZarrow) for even more weather insight and information!""",
    """By JAMES MCQUEENY TRENTON, NJ — Millions of Chinese revolutionaries were encouraged to riot by Mao decades ago with his "Red Book" sayings, inciting crowds to rampage throughout the country. Well, Trenton has one too, spilling the secrets of state government. "Fitzgerald's Legislative Manual" is literally a thick scarlet red book published annually for 150 years, though its ...""",
    "Create a multi-media advertising campaign with reach and frequency by combining radio and digital solutions. Pairing on air spots with digital ads significantly increases consumer recall and traffic to your website. Celebrity Endorsement An endorsement from one of our DJs on-air and online carries so much weight because it is like a recommendation from a friend. Over 50% of radio listeners say they trust brands, products and services that a personality talks about.",
    "Equal Employment Opportunity Policy Townsquare Media, Inc. is an Equal Opportunity Employer (EOE). The Company provides equal employment opportunities to all employees and applicants without regard to race, color, religious creed, sex, national origin, ancestry, citizenship, status, pregnancy, childbirth, physical disability, mental disability, age, military status or status as a Vietnam-era or special disabled veteran, marital status, registered domestic partner or civil union status, gender, medical condition, sexual orientation, or for any other reason covered by applicable federal, state and local laws. In addition, the Company complies with applicable state and local laws governing nondiscrimination in employment in every location in which the Company has facilities. This policy applies to all terms and conditions of employment, including, but not limited to, hiring, placement, promotion, termination, layoff, recall, transfer, leaves of absence, compensation and training.",
    "Health News How To Combat Cold And Flu How To Combat Cold And Flu (NAPSI)—According to the U.S. Centers for Disease Control and Prevention (CDC), chances are good you’ll get a cold or the flu this… Helping Older People to Maintain Independence Helping Older People to Maintain Independence (NAPSI)—For most older adults, the ability to live independently is associated with the ability to drive. That’s one reason the prospect of…",
    "NEW JERSEY — The New Jersey Forest Service says foliage in all but one of the state parks are either at peak or near peak right now. If you're thinking about heading out this weekend to take some colorful pictures, you picked the right time. TAPinto Nutley reports on why the foliage is peaking ... Read more »",
    "Today's world is one of immediate gratification. Everyone wants what they desire faster and easier. Efficient financial management can help you meet your personal financial objectives within your means.These tips will help to enhance your personal finance efforts.Step 1: Pay Yourself First• It is an excellent practice to make sure the first one paid is yourself. That is to say, you ...",
    "Unless otherwise noted, all information presented in our local crime coverage is based on information received from area law enforcement agencies, their public information officers and/or other public records. If you believe that an item is in error or if you are aware of new facts in a case, we are happy to make amendments or updates to our reporting once we are presented with satisfactory official documentation or affidavit citing this information. Please note that arrests, criminal charges, and mugshots are all matters of public record, and may not be removed from our publications. If you have relevant documents to submit, or if you have noticed an inaccuracy in our published content that does not involve the above concerns, please contact our editor. We recognize that the matters we report deeply affect those involved and that honoring our responsibility to the general interest of the local community requires the highest standard of ethics. We thank you for your interest and cooperation in this endeavor.",
    "Whether your indoor garden has outgrown its location or you are looking to expand your garden, a bit of pinching, pruning or propagating may be the answer. Grooming houseplants keeps your indoor garden looking its best and plants contained to the available space. You can use some of the trimmings ... Read more »",
    "You're always looking to grow your business. Let us show you how Townsquare's online marketing solutions can improve your business's online presence and drive success.",
    """Before becoming a digital reporter and fill-in news anchor for New Jersey 101.5, my NJ journalism roots began as faux interviews on a Fisher Price tape recorder. Relatives might say my "gift of gab" grew from there, leading me to radio and beyond. As a wife and momma, I firmly believe that life’s too short to drink bad coffee. I'm also a fan of the beach and karma.""",
    """Ever since I was a precocious 5 year old who read the Bergen Record and then published his own neighborhood newspaper I've always had an interest in the news. I learn something new on this job everyday and that's what keeps it interesting. I've been with NJ101.5 for the past 10 years on the digital team writing news and working behind the scenes on a variety of projects.""",
    "Get up-to-the-minute news sent straight to your device.",
    """Hey there! I'm the midday news anchor for the Dennis and Judi Show on NJ 101.5. You may have heard my name before as I have co-hosted a number of morning shows on rock radio stations up and down the Jersey Shore as well as New York City. I also dabbled in business for a while at The Wall Street Journal Radio Network. While I am a Staten Island girl, I've been living in the Garden State (near the beach) for almost 20 years. I have a teenage son (yikes!). I'm a die-hard Bruce Springsteen fan. I love the New York Mets (my dad was a lefty relief pitcher for the Mets in the 1960s). I love animals, (especially my cat Gingerbread, my rabbit, Shadow and an adorable bloodhound named Esther). I enjoy going to concerts (I love live music). I'm a beach girl through and through (you will find me there a lot). I enjoy watching The Brick Memorial Marching Mustangs in action and doing the announcing for their home shows. I love the color purple, anything coconut-scented and a good red wine. Italian food (my heritage) and seafood are my faves. I'll travel anywhere. Just don't ever ask me to attend a Yankees game or go camping...the answer will always be no. LOL. That's me in a nutshell! Talk to you on the radio!""",
    """I am New Jersey 101.5's Afternoon News Anchor. I was formerly Senior Producer of Morning News and Special Programming here for four years, and Evening News Anchor for another three. I'm proud to be a member of our on-air and digital teams which have been honored with a Regional Edward R. Murrow Award, and also by the New Jersey and Keystone chapters of the Society of Professional Journalists and the New Jersey Broadcasters Association. I got my start in radio at The College of New Jersey, on student-run WTSR-FM, and graduated from TCNJ in 2009. Before coming to 101.5, I did news and sports for WGHT-AM in Pompton Lakes and WYNY-FM in Milford, PA. In a previous life as a child actor, I appeared in the Broadway cast of "Beauty and the Beast" from 1994 to 1996. I live in Hunterdon County with my wife, Kristen.""",
    """I am a proud Jersey girl through and through. I call it the beach, not the shore. If it's not from New Jersey (or New York) it's not real pizza, the same goes for bagels. I don't know how I would survive without Wawa or without being a short drive to the beach. Thanks for letting me take you through your workday and beyond.""",
    """I have been covering New Jersey news for more than 25 years, and have served as New Jersey 101.5’s lead investigative reporter since 2000. I've reported on a number of breaking news stories, including the COVID-19 pandemic, the 9/11 Terror Attacks, the Jersey based Anthrax Attack Investigation, the resignation of Governor Jim Mcgreevey, Hurricane Irene and Superstorm Sandy as well as the terror bomb attack down the shore. I have covered the campaigns of Governors McGreevey Corzine, Christie and Phil Murphy, as well as U.S. Senators Frank Lautenberg Bob Menendez and Cory Booker. Additionally, I broke the story – in a series of reports – on New Jersey’s crumbling, dangerous bridges. I am proud to have received numerous awards for coverage of New Jersey stories, and been featured as an expert reporter on CNN’s Nancy Grace show, covering various topics including the Prom Mom Baby dumped in Trash segment.""",
    """I'm equally as likely to be spotted playing sports with my (bad) kickball, dodgeball, and bowling teams in Philadelphia, as I am to invite you out to see a rom-com movie in the suburbs. I love my dog (Biden), Reba McEntire, the Chicago Cubs, and eating out (aka not having to cook). My hobbies include watching reruns of Frasier, all things pop culture, and booking vacations on cruise ships that I can't really afford. In addition to being on-air with Chris & the Crew, I'm the digital and music guy here at 94.5 PST, which really means I get to have fun all day long at the station.""",
    """I'm the digital managing editor for NJ1015.com. I got my start writing obits for The Jersey Journal (just like John Travolta in "Perfect," a 1985 movie you've never seen and don't recommend you see, either). I then spent years covering Central Jersey as an investigative reporter. I take it personally if you badmouth Newark, the Real Housewives, or Rutgers. Follow me on Twitter @BichaoNJ. Call or email me confidentially at 609-775-9793 or sergio.bichao@townsquaremedia.com""",
    """The best local coverage, unlimited Sign up for a digital subscription to The Press of Atlantic City now and take advantage of a great offer.""",
    """Townsquare is a community-focused digital media, digital marketing solutions and radio company focused outside the Top 50 markets in the U.S. Our assets include Townsquare Interactive, a digital marketing services subscription business providing web sites, search engine optimization, social platforms and online reputation management for approximately 24,950 SMBs; Townsquare IGNITE, a proprietary digital programmatic advertising technology with an in-house demand and data management platform; and Townsquare Media, our portfolio of 322 local terrestrial radio stations in 67 cities with corresponding local news and entertainment websites and apps including legendary brands such as WYRK.com, WJON.com, and NJ101.5.com along with a network of national music brands including XXLmag.com, TasteofCountry.com, UltimateClassicRock.com and Loudwire.com.""",
    "By MARK J. BONAMO On a warming planet increasingly impacted by climate change, New Jersey is preparing to move away from fossil fuels and toward green energy alternatives. Gov. Phil Murphy’s energy master plan aims to achieve 100% clean energy for the state by reducing carbon emissions to zero by 2050. The proposed Coastal Wind Link project hopes to contribute to the ...",
    """By TAPINTO STAFF NEWARK, NJ — The FBI's Newark office is investigating a "broad threat" to synagogues in New Jersey and is asking people to be alert. New Jersey Attorney General Matthew Platkin says law enforcement will be increasing patrols, both marked and unmarked, in "sensitive areas." “The FBI has received credible information of a broad threat to synagogues in ...""",
    """Claire contributes news and list-based articles for ScreenCrush and PopCrush, where she gets to flex her in-depth knowledge of film and pop culture. She holds a B.F.A. from Chapman University's Dodge College of Film & Media Arts, where she received hands-on filmmaking experience while studying under top-notch industry professionals.""",
    """Hailing from Tucson, Arizona by way of Tallahassee, Florida, I am living a dream, being a multi-media personality and DJ! Though it was by accident, I found my passion in radio and DJing, after moving from Tucson to Tallahassee to attend Florida A&M University. I was elected to student government, where I met Lady J, who was also in student government and a radio personality at the college radio station. In 1997, Lady J invited me, a freshman to play a small role on her radio show, that later ended, but it was just the beginning of my journey in radio. Prior to moving to Buffalo, I most recently served as morning show co-host on WPEG in Charlotte, NC for 5 years. I also spent 2 years as a entertainment report co-host on television station WBTV! I am currently entertaining and informing the Queen City of WNY with the "2 To 6 Takeover" weekdays from 2-6 pm on the oldest urban station in the country, The People's Station, 93.7 WBLK!""",
    """Hey everyone, I'm Gary Guida! I have been fortunate enough to be Lite Rock’s Brand Manager and afternoon guy since 1999. I love keeping you company every afternoon from 3pm- 7pm. I also look forward to helping you have a stress-free ride home with The Lite Rock Ride at 5. Submit your three favorite Lite Rock songs, and maybe I'll play them back for your ride home at 5:20.""",
    """Hey guys! My name is Nicole Murray and I am a born-and-raised Jersey girl who is so grateful to be your midday host! I started my broadcasting career back in 2015 while attending Rutgers University where I graduated with a double major of Journalism & Media Studies and Theater with a minor in Spanish. I became 94.3 The Point’s Midday Host back in 2017 and since then, I have taken on additional responsibilities including Midday Host of Lite Rock 96.9 WFPG, voicing Townsquare Media’s HOT AC National Weekend Midday Show and now, the host of the nationally-syndicated entertainment show known as Popcrush Nights. You will quickly learn that I have three top obsessions: coffee, wine, and fur babies. I will talk your ears off about my dog, Carolina on the daily and yes, I would label myself as a crazy dog mom. I would also describe myself as creative, bubbly and competitive so even if it is just a game of Uno, watch out. I can’t wait to get to know you and hope that you will take the time to get to know me between 10 AM and 3 PM Monday through Sunday! (Yes, 7 days a week! Lol) Instagram/Twitter/TikTok: MissMurray943""",
    """Hey there! I'm the host of the Cat Country Morning Show with Joe and Jahna, heard weekdays from 5:30 - 10:00am! I've been doing mornings on Cat Country since the beginning - 1998! I've seen almost everyone in concert - so, if you need a recommendation on who to see, just ask me! If I had the opportunity to eat lunch with anyone living or dead, I would choose someone living. Our studio hotline is always open for you at 609-383-1073. When I'm not on the radio, you'll find me sleeping.""",
    """I've been working with Jeff Deminski, or Heff-ay, since our first run at New Jersey 101.5 from 1994 to 1999. After 12 years in Michigan, we came back in 2011. I have three children (two of whom were born in the Garden State), with one at Montclair State and two in high school in Ocean County. I live in Jackson, where I enjoy grilling burgers 12 months a year on my Big Green Egg.""",
    """Jacklyn is a "Jack of All Trades" with a background in photography, social media, and journalism. Her work has been published on The Recording Academy's Grammy.com, PopCrush, LiveNation's Ones To Watch, Stage Right Secrets, among other outlets. She also works concert and live event production, learning as many aspects of the music industry as she can. In her downtime, you'll most likely find her befriending an animal or planning her next Disney vacation.""",
    """Jahna Michal, here! The more theatrical half of Cat Country Mornings ;) Love ya, Joe! For starters, I'm a true South Jersey gal, born and raised in Gloucester County! I grew up on adventure, the outdoors, and my first love - Wawa! My faith, my family, and my friends are the most important to me. My passion for country music stems back to my childhood where Shania, Faith, and Tim were all in constant rotation. My love of country really exploded after I took a trip to Nashville. Now THAT was a time to be alive! I'm pretty much an open book, so feel free to ask me anything! When I'm not bantering with Joe, I'm probably working out, on the hunt for some amazing guac, seeking out a new paddle boarding spot, watching a Harry Potter movie, or playing with a dog somewhere - I'm easy to spot!""",
    """Joe Cutter is the senior news anchor at New Jersey 101.5. I have toiled daily in the newsroom since 1989 as an anchor, correspondent, field reporter and news director. I have also learned not to call State government between noon and 1:30, because they are out to lunch.""",
    """Matt Singer is the editor and critic of the website ScreenCrush.com. For five years, he was the on-air host of IFC News on the Independent Film Channel, hosting coverage of film festivals and red carpets around the world. A member of the New York Film Critics Circle, he’s been a frequent contributor to the television shows CBS This Morning Saturday and Ebert Presents At the Movies, and his writing has also appeared in print and online at The Village Voice, The Dissolve, and Indiewire. His first book, Marvel’s Spider-Man: From Amazing to Spectacular, is on sale now.""",
    """NEW BRUNSWICK, NJ – The man who was sentenced to life in a New Brunswick courtroom for the 1973 murder of State Trooper Werner Foerster in one of the highest-profile trials in the city’s history will soon be released from prison. The New Jersey Supreme Court has ordered parole for Sundiata Acoli ... Read more »""",
    """NEW JERSEY — This may be the Garden State, but this time of year our attention shifts from tomatoes and zucchini in home gardens to oaks and maples as the forest and parks being their annual show. According to the NJ Forest Service, the show is beginning in the northwestern part of the state — ... Read more »""",
    """NUTLEY, NJ - A 90-year-old Nutley resident is at risk of losing her home of 60 years. Her family has set up a GoFundMe campaign to help keep the family home secure. Due to life events, Doris fell behind on tax payments. Not wanting to burden those close to her with the issue, Doris has kept ... Read more »""",
    "No Closings have been reported at this time",
    "Sorry, doesn’t look like there are any such events coming up",
    """Value Vault is a participant in the Target Affiliate Marketing Program, an affiliate advertising and marketing program that pays advertising fees to sites that advertise and link to Target.com. Also, as an Amazon Associate, we earn from qualifying purchases made on Amazon.com. Prime Day ends today! Here's a list of great items we've found for you. Be sure to check back since we'll be posting awesome deals as we find them! Here are direct links to some popular Prime Day categories. Featured Amazon Deals Whether you've cut the cord or just want to binge shows, this Fire Stick deal is too good to pass up! Protecting your home has never been easier! Here are direct links to some popular Target Deal Days categories. Featured Target Deals Get the weather, stream your favorite music, or control your smarthome with just your voice! Weak WiFi signals are a thing of the past. Curated Feed We’ll continue to update this list of products throughout the event, so check back! Please note final discount may not be reflected below. Be sure to click through to see the savings! Designed for work and play the ASUS Chromebook Flip is a computer and tablet in one that you won’t want to miss. Never overcook a meal again with the Anova Culinary Sous Vide Cooker with Bluetooth! Get perfectly cooked eggs, your way, every time with this Dash Deluxe Egg Cooker! Using infra-red technology, this Phillips Smoke-Less Indoor Grill cooks steaks, chicken, vegetables and more to perfection every time! The SodaStream Jet lets you make sparkling water with the push of a button! Whether you’re needing extra space to store water bottles and soda or a small fridge for your dorm-room, this Whirlpool Stainless Steel Mini Refrigerator is going to get the job done and look great doing it. The KitchenAid Artisan Stand Mixer is a classic staple in any kitchen. Whether you’re making bread, baking a birthday cake or whipping up cookies, this mixer handles it all. 
Go ahead and jump in the water to cool off with this swim tracking smartwatch by Fitbit. Snap a photo because this amazing Nikon D3500 Bundle won’t last long. Settle in to big savings on the Coleman Big-N-Tall Quad Camping Chair. Worrying about drinking contaminated water is a thing of the past thanks to LifeStraw Water Filter. There’s nothing weird about snagging this BigMouth Inc. Stranger Things Hawkins Rec Center Cooler on the cheap! Make the most of hot summer days out on the water with the SwimWays Spring Float. Keep an eye on your pets at home, talk to them and toss them a treat for being a good boy with the Furbo Dog Camera! Hurry before the deal ends! Save time and money with the Bissell BARKBATH QT – QuietTone Portable Dog Bath and Grooming System! Keep track of the whole family's fitness gains with this smart scale that tracks up to 10 different users. Clinically proven to improve gum health in only two weeks, the Philips Sonicare toothbrush is a must have. Vacuuming and washing your floors is simpler than ever, and now you can do it without the hassle of a cord! Breathe easier with this sleek, powerful air purifier. Keeping your floors clean has never been easier! Refresh your living space for much less today with this modern sofa from Project 62™!""",
    "What I’ve learned in my travels to Europe and South America in the last couple of years is how refreshing it is to get away from English. Not being able to understand the banter of language around you is a chance to luxuriate in your own thoughts. You are in a crowd, but not part of it, free of ... Read more »"
    ]

# Exact URLs to exclude: the cleaning loop drops every row whose 'url' equals one of these
# (compared with `isin`, i.e. whole-string equality).
urls = ['http://radio.rutgers.edu/', 'http://thestute.com', 'http://unionnewsdaily.com/', 'http://www.starandwave.com/', 'https://www.goleader.com/tiw/', 'https://wmbctv.com/quest-63-2/', 'https://phl17.com/jobs-at-phl17/', 'http://www.mcccvoice.org/awards/', 'http://www.hammontongazette.com/', 'https://www.goleader.com/20mar12/', 'https://newtownpress.com/2018/04/']

# '|'-separated URL fragments handed to `str.contains` in the cleaning loop, which treats the
# string as a regular expression: a row is dropped when its 'url' contains any of the fragments.
# NOTE(review): the dots (e.g. in '/faq.cfm') are unescaped and therefore match any character,
# and '/privacy' already covers '/privacy/' - confirm that this is intended.
urls_pattern = """/author/|/our-partners/|/djs/|/apps/|/app/|/jobs/|/terms/|/vip-faq/|/faq/|/tour/|/about/|/aboutus/|/about-us/|/vip-terms/|/privacy/|/contact/|/faq.cfm|/djs.cfm|/jobseeo|/the-station/|/history|/news.cfm|/employment/|/about.cfm|/privacy|/alumni.cfm|/advertising/|/help/|/otm-about/|/contest-rules/|/about.asp|/specialty.cfm"""


In [16]:
!rm -r intra_domain_cleaned
!rm -r intra_domain_rejected
!mkdir intra_domain_cleaned
!mkdir intra_domain_rejected

In [None]:
def clean_articles(articles, url_pattern, excluded_urls, junk_texts, min_words=200):
    """Filter and normalise the articles of one domain.

    Parameters
    ----------
    articles : pandas.DataFrame
        Must contain the columns 'text', 'url', 'new_url' and 'crawl_date'
        ('crawl_date' may hold several comma-separated dates).
    url_pattern : str
        Regular expression; rows whose 'url' contains a match are dropped.
    excluded_urls : list of str
        Rows whose 'url' equals one of these values are dropped.
    junk_texts : list of str
        Rows whose normalised text equals one of these values are dropped.
    min_words : int, default 200
        Rows with fewer whitespace-separated words than this are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the surviving rows, the added columns
        'oldest_crawl_date' / 'latest_crawl_date', and 'new_url' / 'url' renamed to
        'article_url' / 'article_url_grouped'.
    """
    # remove exceptions and blanks
    cleaned = articles.assign(text=articles['text'].fillna(''))
    cleaned = cleaned[~cleaned['text'].isin(['ArticleException', ''])]
    # remove articles based on URL condition
    cleaned = cleaned[~cleaned['url'].str.contains(url_pattern)]
    cleaned = cleaned[~cleaned['url'].isin(excluded_urls)]
    # collapse line breaks / repeated whitespace into single spaces and trim the ends
    # (`assign` builds a new frame, so no SettingWithCopyWarning on the filtered slice)
    normalized_text = (cleaned['text'].astype(str)
                       .str.replace(r'\s+', ' ', regex=True)
                       .str.strip())
    cleaned = cleaned.assign(text=normalized_text)
    # remove typical junk texts
    cleaned = cleaned[~cleaned['text'].isin(junk_texts)]
    # remove articles which end in '...'. This usually means the article had a 'READ MORE' section.
    cleaned = cleaned[~cleaned['text'].str.endswith("...")]
    # remove articles which do not meet the minimum word requirement
    cleaned = cleaned[~(cleaned['text'].str.split().str.len() < min_words)]
    # earliest / latest crawl date; dates are compared as strings, exactly as they are stored
    crawl_dates = cleaned['crawl_date'].astype(str).str.split(",")
    cleaned = cleaned.assign(
        oldest_crawl_date=crawl_dates.apply(lambda dates: min(dates)),
        latest_crawl_date=crawl_dates.apply(lambda dates: max(dates)),
    )
    # rename columns
    return cleaned.rename(columns={'new_url': 'article_url', 'url': 'article_url_grouped'})


files = glob.glob('temp_results/*.csv')
for file in files:
    file_name = file.rsplit("/", 1)[1]
    raw_df = pd.read_csv(file, dtype='string')
    print(f"Processing file {file_name}. Initial Length = {raw_df.shape[0]}")

    df = clean_articles(raw_df, urls_pattern, urls, texts)

    # save the data: domains left without any article go to the 'rejected' folder
    target_folder = "intra_domain_rejected" if df.empty else "intra_domain_cleaned"
    df.to_csv(f"{target_folder}/{file_name}", index=False)
    print(f"Final Length = {df.shape[0]}. Done with {file_name}")

In [18]:
# sanity check to make sure we did not lose any data in between:
# the cleaning loop writes each file from temp_results/ to exactly one of the two output
# folders, so the first count should equal the sum of the other two
!ls temp_results/ | wc -l
!ls intra_domain_cleaned/ | wc -l
!ls intra_domain_rejected/ | wc -l

436
229
207


> *Rewards received:*<br>
intra_domain_cleaned/_domain-name_.csv containing cleaned articles per domain<br>
intra_domain_rejected/_domain-name_.csv containing no articles per domain

## Categorize all Articles

> Inputs in this section are the files from the 'intra_domain_cleaned' folder.<br>Use a cluster with the configuration 8 vCPU + 32 GiB + 1 GPU for this section.<br>Might take around 24 hrs to complete.

In [None]:
# %%capture
!pip3 install transformers
# !pip3 install torch torchvision torchaudio

In [1]:
import os
import glob
import torch
import pandas as pd

from tqdm import tqdm
from transformers import pipeline
from torch.utils.data import DataLoader

In [2]:
# Use the first GPU when one is visible; otherwise fall back to the CPU so the notebook
# still runs (slowly) instead of failing when the model is moved to a missing device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Sentence template passed to the zero-shot pipeline; '{}' is filled with each candidate label.
hypothesis_template = "This text is about {}."
# Categories an article can be assigned to.
candidate_labels = ['Business', 'Politics', 'Sports', 'World', 'Automobile',
                    'Miscellaneous', 'Education', 'Health', 'Environment', 'Crime']

In [3]:
# define dataset class
class LocalNewsDataset():
    """Index-addressable view over the 'text' column of one CSV file.

    Provides `__len__` and `__getitem__`, which is what the notebook's `DataLoader` uses to
    pull article texts by integer position.
    """

    def __init__(self, file_path):
        """Load the whole CSV at `file_path` into memory as `self.data`."""
        self.data = pd.read_csv(file_path)

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the 'text' value of the row at integer position `idx`."""
        record = self.data.iloc[idx]
        return record['text']

In [4]:
# %%capture
# !rm -r categorized_data
# !mkdir categorized_data

In [None]:
bart_pipeline = pipeline("zero-shot-classification", model="joeddav/bart-large-mnli-yahoo-answers", device=device)
# make sure the output folder exists before the (long) inference starts
os.makedirs("categorized_data", exist_ok=True)
# process the files ordered by the prefix before the first '_' of the file name (string comparison)
files = sorted(glob.glob('intra_domain_cleaned/*.csv'), key=lambda x: os.path.basename(x).split('_')[0])
for i, path in enumerate(files):
    file_name = path.rsplit("/", 1)[-1]
    print(f"Processing file {i}. {file_name}")
    # create an object
    local_news_dataset = LocalNewsDataset(path)
    # create data loader; shuffle=False keeps predictions aligned with the CSV row order
    dataloader = DataLoader(local_news_dataset, batch_size=4, num_workers=4, shuffle=False)
    # infer the category
    bart_labels, bart_scores = [], []
    for item in tqdm(dataloader):
        # infer from model 1
        bart_inferences = bart_pipeline(item, candidate_labels, multi_label=True, hypothesis_template=hypothesis_template)
        # NOTE(review): some transformers releases appear to return a bare dict (not a list) when
        # the batch holds a single text; wrap it so the last short batch is not iterated key by key
        if isinstance(bart_inferences, dict):
            bart_inferences = [bart_inferences]
        for inference in bart_inferences:
            # keep only the first label/score pair of each result
            # (presumably the best-scoring one - confirm against the pipeline documentation)
            bart_labels.append(inference['labels'][0])
            bart_scores.append(inference['scores'][0])
    # save the results; reuse the frame the dataset already loaded instead of reading the CSV again
    df = local_news_dataset.data
    df['bart_label'] = bart_labels
    df['bart_score'] = bart_scores
    df.to_csv(f"categorized_data/{file_name}", index=False)

> *Rewards received:*<br>
categorized_data/_domain-name_.csv containing cleaned articles per domain with article category and score

## Analysis

### How many times did each article get crawled?

### How many overlapping domains were there per article?

### Were these part of the same host or different hosts?

In [34]:
# TODO: may have to rewrite code
def dedupe_across_domains(articles):
    """Collapse rows that share the same article text (cross-domain duplicates).

    Parameters
    ----------
    articles : pandas.DataFrame
        Must contain 'text', 'crawl_date', 'article_url_grouped' and 'article_url'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct text with the columns
        'crawl_date' (all crawl dates, comma-joined),
        'inter_domain_urls_grouped' / 'inter_domain_urls' (comma-joined URLs of the merged rows),
        'distinct_urls' (both URL columns combined, duplicates removed, first-seen order kept),
        'inter_domain_oldest_crawl_date' / 'inter_domain_latest_crawl_date'
        (string minimum / maximum of the crawl dates).
    """
    articles = articles.assign(crawl_date=articles['crawl_date'].astype(str))
    # de-dup articles: group by text again on the aggregated set
    deduped = (articles.groupby('text')
               .agg({'crawl_date': lambda x: ','.join(x),
                     'article_url_grouped': lambda x: ','.join(x),
                     'article_url': lambda x: ','.join(x)})
               .reset_index()
               .rename(columns={'article_url': 'inter_domain_urls',
                                'article_url_grouped': 'inter_domain_urls_grouped'}))
    # join urls and grouped urls (the original joined 'inter_domain_urls' with itself,
    # so the grouped URLs never reached 'distinct_urls')
    combined_urls = deduped['inter_domain_urls'] + "," + deduped['inter_domain_urls_grouped']
    # de-dup the combined list; dict.fromkeys keeps first-seen order, unlike set()
    deduped['distinct_urls'] = combined_urls.astype(str).str.split(',').apply(
        lambda parts: ','.join(dict.fromkeys(parts)))
    crawl_dates = deduped['crawl_date'].astype(str).str.split(",")
    # Get earliest crawl date
    deduped['inter_domain_oldest_crawl_date'] = crawl_dates.apply(lambda dates: min(dates))
    # Get latest crawl date
    deduped['inter_domain_latest_crawl_date'] = crawl_dates.apply(lambda dates: max(dates))
    return deduped


all_files = glob.glob("intra_domain_cleaned/*.csv")
if all_files:
    df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
    df = dedupe_across_domains(df)
    # save the results
    df.to_csv("cleansed_article_data.csv", index=False)
else:
    # pd.concat would fail with an obscure "No objects to concatenate" error
    print("No files found in intra_domain_cleaned/; nothing to de-duplicate.")

# Visualizations

> Visualization for 1 & 2: https://datastudio.google.com/s/miTImH41YiM

## 1. Number of News Articles per Source and Category

In [5]:
import glob
import pandas as pd

In [None]:
# prepare data for count visualization
TOPICS = ['Business', 'Politics', 'Sports', 'World', 'Automobile',
          'Miscellaneous', 'Education', 'Health', 'Environment', 'Crime']


def count_articles_per_topic(labels, domain_name, topics=None):
    """Count how many articles of one domain fall into each topic.

    Parameters
    ----------
    labels : pandas.Series
        Predicted label of every article of the domain.
    domain_name : str
        Value written to the 'domain' column of every returned row.
    topics : list of str, optional
        Topics to report, in output order; defaults to `TOPICS`.

    Returns
    -------
    pandas.DataFrame
        One row per topic with the columns 'domain', 'topic', 'bart_count'. Topics without
        articles get 0; labels that are missing or not in `topics` are ignored.
    """
    topics = TOPICS if topics is None else list(topics)
    counts = labels.value_counts().reindex(topics, fill_value=0)
    return pd.DataFrame({'domain': domain_name, 'topic': topics, 'bart_count': counts.to_numpy()})


per_domain_counts = []
for path in glob.glob("categorized_data/*.csv"):
    df = pd.read_csv(path)
    file_name = path.rsplit("/", 1)[1][:-4]
    # drop the prefix before the first '_' and turn the remaining '_' into '/'
    domain_name = '/'.join(file_name.split("_")[1:])
    per_domain_counts.append(count_articles_per_topic(df['bart_label'], domain_name))

if per_domain_counts:
    # concatenate once instead of growing the frame inside the loop
    final_df = pd.concat(per_domain_counts, ignore_index=True)
    # add one 'overall' row per topic; only the numeric column is summed
    temp_df = final_df.groupby("topic", as_index=False)['bart_count'].sum()
    temp_df['domain'] = 'overall'
    final_df = pd.concat([final_df, temp_df], ignore_index=True)

    final_df.to_csv("no_of_news_articles_per_source_and_category.csv", index=False)
else:
    print("No files found in categorized_data/; nothing to aggregate.")

## 2. Overall range of News Articles per Category

In [6]:
import pandas as pd

In [12]:
df = pd.read_csv("no_of_news_articles_per_source_and_category.csv")
# drop the aggregated 'overall' rows; .copy() makes the result an independent frame so that
# adding columns to it later does not raise SettingWithCopyWarning
df = df[df['domain'] != 'overall'].copy()
# get max value so we can define bins
print(df['bart_count'].max())

1271


In [13]:
# define range for each count
# pd.cut intervals are closed on the right, so the edges are chosen to give
# (-1, 0], (0, 5], (5, 10], ... and every label covers exactly the counts it names.
# (With the previous edges [0, 1, 5, ...] a count of 1 was labelled '0'.)
# Counts above the last edge (1300) fall outside every bin and get NaN.
bins = [-1, 0, 5, 10, 25, 50, 100, 200, 300, 500, 800, 1300]
labels = ['0', '1 to 5', '6 to 10', '11 to 25', '26 to 50', '51 to 100', '101 to 200', '201 to 300', '301 to 500', '501 to 800', '801 to 1300']
# assign() returns a new frame, which avoids SettingWithCopyWarning when df is a filtered slice
df = df.assign(range=pd.cut(df['bart_count'], bins=bins, labels=labels))

In [14]:
topics = ['Business', 'Politics', 'Sports', 'World', 'Automobile', 
           'Miscellaneous', 'Education', 'Health', 'Environment', 'Crime']

# for every topic: how many domains fall into each count range, and which ones
topic_frames = []
for topic in topics:
    topic_df = df[df['topic'] == topic]
    # observed=False is spelled out so that ranges without any domain stay in the output
    # regardless of the default of the installed pandas version
    topic_df = (topic_df.groupby('range', observed=False)['domain']
                .agg(count='count', domains=','.join)
                .reset_index())
    topic_df['topic'] = topic
    topic_frames.append(topic_df)

# concatenate once instead of growing the frame inside the loop
final_df = pd.concat(topic_frames, ignore_index=True)

final_df.to_csv("range_of_news_articles_per_category.csv", index=False)

## 3. Choropleth Map Visualization

In [29]:
%%capture
!pip install geopandas --user
!pip install folium matplotlib mapclassify --user

In [1]:
import pandas as pd
import geopandas as gpd

from shapely import wkt

### Prepare data for Visualization

In [3]:
# prepare data
df = pd.read_csv("no_of_news_articles_per_source_and_category.csv")
df = df[df['domain'] != 'overall']

municipal_boundaries_df = gpd.read_file('municipal_boundaries_of_nj.geojson', 
                                        GEOM_POSSIBLE_NAMES="geometry", KEEP_GEOM_COLUMNS="NO")

domain_municipality_mapping_df = pd.read_csv("domain_municipality_mapping.csv")
# scheme-less domains from the mapping file -> the spelling they are replaced with
# (presumably the spelling used in the counts file - verify when adding entries)
replace_values= {
  'www.pressofatlanticcity.com/currents_gazettes/egg_harbor_township/': 'www.pressofatlanticcity.com/currents/gazettes/egg/harbor/township/',
  'www.pressofatlanticcity.com/currents_gazettes/galloway_township/': 'www.pressofatlanticcity.com/currents/gazettes/galloway/township/',
  'www.pressofatlanticcity.com/currents_gazettes/mainland/': 'www.pressofatlanticcity.com/currents/gazettes/mainland/',
  'www.pressofatlanticcity.com/currents_gazettes/hamilton_township/': 'www.pressofatlanticcity.com/currents/gazettes/hamilton/township/',
  'www.pressofatlanticcity.com/currents_gazettes/downbeach/': 'www.pressofatlanticcity.com/currents/gazettes/downbeach/',
  'www.mycentraljersey.com/news/courier-news/': 'www.mycentraljersey.com/',
  'www.gloucestercitynews.net/': 'www.gloucestercitynews.net/clearysnotebook/gloucester/city/news/',
  'starnewsgroup.com/the-ocean-star-e-edition/': 'starnewsgroup.com/'
}
# strip a leading 'http://' or 'https://', then apply the exact-match replacements above;
# vectorized instead of two iterrows() loops, and missing domains pass through untouched
normalized_domains = (domain_municipality_mapping_df['domain']
                      .str.replace(r'^https?://', '', regex=True)
                      .map(lambda domain: replace_values.get(domain, domain)))
domain_municipality_mapping_df = domain_municipality_mapping_df.assign(domain=normalized_domains)

# domains present in the counts file / in the location mapping
df_domains = set(df['domain'])
domain_municipality_mapping_df_domains = set(domain_municipality_mapping_df['domain'])

In [4]:
# domains that appear in the counts data but have no row in the location mapping
domains_without_location = df_domains - domain_municipality_mapping_df_domains
print(f"The following {len(domains_without_location)} domains does not have location data:")
# slice off the surrounding braces of the set's text representation
print(str(domains_without_location)[1:-1])

The following 77 domains does not have location data:
'latribunanj.com/', 'www.mainlinemedianews.com/mainlinetimes/', 'wmbctv.com/', '1057thehawk.com/', 'www.1077thebronc.com/', 'am970theanswer.com/', 'wpst.com/', 'www.my9nj.com', 'wbjb.org/home.php/', 'www.newsindiatimes.com', 'www.wwfm.org/', 'www.princetonmagazine.com/', 'wondradio.com/', 'www1.nyc.gov/site/media/radio/radio-home.page', 'literock969.com/', 'thepakistaninewspaper.com/', 'www.tygodnikplus.com', 'www.ntd.tv', '920thejersey.com/', 'www.indiaabroad.com', 'www.irishcentral.com/', 'www.jerseyvoices.com', 'www.northjersey.com/', 'newjerseybuzz.com/', 'gujaratdarpan.com', 'forward.com/', 'njrevolutionradio.com/', 'www.theyeshivaworld.com/', 'www.jewishlinknj.com/', 'www.irishecho.com', 'www.posteaglenewspaper.com/', 'philadelphia.cbslocal.com/', '6abc.com/', 'thepositivecommunity.com/', 'www.tristatevoice.com/', 'pocono967.com/', 'njbmagazine.com/', 'wtsr.org/', 'www.mainlinemedianews.com/mainlinesuburbanlife/', 'www.pressof

In [5]:
# restrict the location mapping to domains that actually occur in the counts data
has_counts = domain_municipality_mapping_df['domain'].isin(df_domains)
domain_municipality_mapping_df = domain_municipality_mapping_df[has_counts]

# attach the mapping's location columns to every (domain, topic) row; a domain mapped to
# several municipalities yields one row per municipality (explode data)
# format required:  municipality_id, municipality_name, municipality_gnis, domain, topic, bart_count
print(f"Size before joining: {df.shape[0]}")
exploded_df = df.merge(domain_municipality_mapping_df, on='domain', how='inner')
print(f"Size after joining: {exploded_df.shape[0]}")

# validation: the inner join should drop only the domains reported above as lacking location
# data (77 at the time of writing)
located_domains = set(exploded_df['domain'].unique())
print(f"Number of domains without location data = {len(set(df_domains)-located_domains)}")

Size before joining: 2240
Size after joining: 18440
Number of domains without location data = 77


In [6]:
# one row per (municipality, topic): total article count plus the comma-joined contributing domains
group_keys = ['municipality_name', 'municipality_gnis', 'topic']
normalized_df = (exploded_df.groupby(group_keys)
                 .agg(bart_count=('bart_count', 'sum'), domain=('domain', ','.join))
                 .reset_index())
# the GNIS id is turned into text here
normalized_df['municipality_gnis'] = normalized_df['municipality_gnis'].map(str)

# validation: list the municipalities that do not have exactly 10 topic rows
validation_df = normalized_df.groupby(['municipality_name', 'municipality_gnis']).count().reset_index()
display(validation_df[validation_df['topic'] != 10].head(20))

Unnamed: 0,municipality_name,municipality_gnis,topic,bart_count,domain


In [7]:
print(f"Size before joining: {normalized_df.shape[0]}")

# attach the boundary geometry of every municipality; rows without a matching
# (MUN_LABEL, GNIS) pair are dropped by the inner join
final_df = pd.merge(normalized_df, municipal_boundaries_df[['geometry', 'MUN_LABEL', 'GNIS']], 
                    left_on=['municipality_name', 'municipality_gnis'], right_on=['MUN_LABEL', 'GNIS'], 
                    how='inner')
print(f"Size after joining: {final_df.shape[0]}")

# delete the extra columns (they duplicate the join keys)
final_df = final_df.drop(columns=['MUN_LABEL', 'GNIS'])

# rename columns
final_df = final_df.rename(columns ={'bart_count':'number_of_articles', 'domain': 'domains'})

# validation: municipalities that were lost in the join because no boundary matched
# (the message used to say "domains", although municipality names are listed)
municipalities_without_boundary = (set(normalized_df['municipality_name'].unique())
                                   - set(final_df['municipality_name'].unique()))
print("Municipalities without boundary data are:")
# sorted + joined so the output is stable and an empty result prints an empty line
print(', '.join(repr(name) for name in sorted(municipalities_without_boundary)))

Size before joining: 5320
Size after joining: 5290
Number of domains without location data are:
'Pine Valley Borough', 'Ocean Grove'


In [20]:
municipality_keys = ["municipality_name", "municipality_gnis", "geometry"]
# aggregate the per-topic rows only, so that re-running this cell does not fold previously
# added 'overall' rows into the totals or append them a second time
per_topic_df = final_df[final_df['topic'] != 'overall']

# get count of articles in each municipality
overall_df = per_topic_df.groupby(municipality_keys)['number_of_articles']\
                         .agg('sum').reset_index()
overall_df['topic'] = 'overall'
# get domains list; dict.fromkeys removes duplicates while keeping first-seen order
# (set() would make the order vary between runs)
overall_domains_df = per_topic_df.groupby(municipality_keys)['domains']\
                                 .apply(lambda x : ','.join(dict.fromkeys(x))).reset_index()


# join the dataframes to get the final dataframe with all the 'overall' information
print(f"Size before joining: {overall_df.shape[0]}")
overall_df = pd.merge(overall_df, overall_domains_df, on=municipality_keys, how='inner')
print(f"Size after joining: {overall_df.shape[0]}")

# add the records to original dataframe
final_df = pd.concat([per_topic_df, overall_df]).sort_values(by=['municipality_name', 'topic'], ascending=True)

Size before joining: 529
Size after joining: 529


In [19]:
# save this data
# (the map-drawing section reads this file back and parses the 'geometry' column
# with GeoSeries.from_wkt)
final_df.to_csv("map_data.csv", index=False)

### Draw the Maps

In [2]:
# load the table written by the data-preparation step
map_records = pd.read_csv("map_data.csv")

# the geometry column comes back from CSV as text: parse it, wrap the table in a GeoDataFrame,
# declare the CRS the coordinates are in (EPSG:4326) and reproject to New Jersey coordinates
map_records['geometry'] = gpd.GeoSeries.from_wkt(map_records['geometry'])
map_df = (gpd.GeoDataFrame(map_records, geometry='geometry')
          .set_crs("EPSG:4326")
          .to_crs("EPSG:3424"))
print(map_df.dtypes)

municipality_name       object
municipality_gnis        int64
topic                   object
number_of_articles       int64
domains                 object
geometry              geometry
dtype: object


In [3]:
# draw the maps: one interactive HTML map per (colormap, topic) pair,
# saved as maps/<colormap>/<Topic>.html
import os

cmaps = ['Accent','Blues','BrBG','BuGn','BuPu','CMRmap','Dark2','GnBu','Greens','Greys','OrRd','Oranges','PRGn','Paired',
         'Pastel1','Pastel2','PiYG','PuBu','PuBuGn_r','PuOr','PuRd','Purples','RdBu','RdGy','RdPu','RdYlBu','RdYlGn','Reds',
         'Set1','Set2','Set3','Spectral','Wistia','YlGn','YlGnBu_r','YlOrBr','YlOrRd','afmhot','autumn','binary','bone',
         'brg','bwr','cividis','cool','coolwarm','copper','cubehelix','flag','gist_earth','gist_gray','gist_heat',
         'gist_ncar','gist_rainbow','gist_stern','gist_yarg','gnuplot','gnuplot2_r','gray','hot','hsv','inferno','jet',
         'magma','nipy_spectral','ocean','pink','plasma','prism','rainbow','seismic','spring','summer','tab10','tab20',
         'tab20b','tab20c','terrain','turbo','twilight','twilight_shifted','viridis','winter']

# the per-colormap output folders must exist before the maps are saved into them
for cmap_color in cmaps:
    os.makedirs(f'maps/{cmap_color}', exist_ok=True)

for topic in map_df['topic'].unique().tolist():
    # the topic subset does not depend on the colormap, so select it once per topic
    topic_map_df = map_df[map_df['topic']==topic]
    for cmap_color in cmaps:
        topic_map_df.explore(column='number_of_articles',
                             cmap=cmap_color,
                             legend=True,
                             tooltip=['municipality_name', 'municipality_gnis', 
                                      'domains', 'number_of_articles'],
                             style_kwds=dict(color='black')) \
                    .save(f'maps/{cmap_color}/{topic.capitalize()}.html')

In [None]:
%%capture
# archive the whole maps/ folder (recursively) into maps.zip; %%capture hides zip's output
!zip -r maps.zip maps