In [None]:

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Introduction:
___
Nextdoor requires users to register their address and allows them to view posts of others based on geographic location. Those seeking to replicate this project will need a Nextdoor account in the region they wish to study and will need to provide their own account information in the code below.

Nextdoor is intended as a community social network, and much of the content pertains to city- or neighborhood-wide news, community events, or pleas to pick up pet droppings from someone's favorite walking trail. However, the site's user-based geo-restrictions create a unique virtual environment where users can feel safe that their posts won't travel far outside of their communities. This has the capacity to affect the tone and types of conversations users have, whether for better or worse, and presents itself as a social space ripe for study.

Use of Nextdoor content also allows us to cut through much of the noise that we would typically obtain when collecting data for the study of sentiments by geographic region. On sites like Twitter or Facebook, users' geographic info is often incorrect or intentionally misleading, and cases such as users from one area posting in groups from another can quickly obscure meaningful results.

Collecting data from Nextdoor has its own challenges, though. Without an API we were left with web scraping. I used Selenium for its ability to interact with Nextdoor.com's JS-heavy webpages. The reduced scraping speed of Selenium actually comes with the benefit of allowing us to lightly monitor the incoming data, giving us time to get familiar with it.

This study was conducted on media content originating from over a dozen neighborhoods in the east Denver Area.

|posts|comments|
|---|---|
|1,100|16,000|

# Building a Web Scraper
___
___
Below are the functions used to scrape nextdoor.com for content using the site's search function. I had difficulties putting this into a package; if you are reading this and know how to solve that, I would welcome a pull request. Nextdoor offers a unique insight into neighborhood sentiment.

In [None]:

def nxtdr_logout():
    """Sign the current browser session out of Nextdoor.

    Points the module-level ``driver`` at Nextdoor's logout URL.
    """
    logout_url = 'https://nextdoor.com/logout/?ucl=1'
    driver.get(logout_url)




def nxtdr_login(usrnm, pwd, timeout=10):
    """Log in to Nextdoor through the login form of the current page.

    Works on the module-level ``driver``, which must already be showing a
    page that contains the "Log in" button.

    Args:
        usrnm: Account e-mail address typed into the e-mail field.
        pwd: Account password typed into the password field.
        timeout: Seconds to wait for each form field to appear after the
            "Log in" button has been clicked.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the "Log in"
            button is not on the page.
        selenium.common.exceptions.TimeoutException: If a form field does not
            appear within ``timeout`` seconds.
    """
    # Give the page a moment to render before looking for the button.
    time.sleep(2)

    # click log in button
    # find_element(By...) replaces the find_element_by_* helpers, which were
    # removed in Selenium 4.3; this form works on Selenium 3 as well.
    driver.find_element(By.CLASS_NAME, 'css-1d8yfou').click()

    # The form is rendered after the click, so wait for each field instead of
    # reading it immediately.
    wait = WebDriverWait(driver, timeout)

    # enter email
    email = wait.until(EC.presence_of_element_located((By.ID, 'id_email')))
    email.send_keys(usrnm)
    email.send_keys(Keys.TAB)

    # enter pwd
    enter_pwd = wait.until(
        EC.presence_of_element_located((By.ID, 'id_password'))
    )
    enter_pwd.send_keys(pwd)
    # log in
    enter_pwd.send_keys(Keys.RETURN)
    






def query(keywords):
    """Open the Nextdoor search results page for ``keywords``.

    Must be logged in to run.

    Args:
        keywords: Search term. It is URL-encoded before being placed in the
            query string, so spaces and characters such as ``&``, ``#`` or
            ``+`` are searched for literally instead of truncating or
            altering the query.
    """
    # Imported here so the function works on its own in a notebook cell.
    from urllib.parse import quote_plus

    url = 'https://nextdoor.com/'
    driver.get(f'{url}search/?query={quote_plus(str(keywords))}')


    
    
def search(keywords, reps):
    """Run a keyword search and collect the links on the results page.

    Args:
        keywords: Search term handed to ``query``.
        reps: Number of "load more" clicks handed to ``load_more``.

    Returns:
        Whatever ``get_links`` returns once the results have been expanded.
    """
    query(keywords)
    # Pause so the results page can render before it is expanded.
    time.sleep(10)
    load_more(reps)
    return get_links()
# Notebook-export form of a bare `%time` line magic, invoked with an empty statement.
get_ipython().run_line_magic('time', '')








def load_more(reps):
    """Click the "load more" control under the search results.

    On every pass the two known class names of the button are tried in order
    and the first one that can be clicked is used. When neither can be
    clicked the results are taken to be fully expanded and the function
    returns early instead of raising, so asking for more clicks than the page
    offers no longer aborts the search.

    Args:
        reps: Maximum number of clicks to perform.
    """
    # Imported here so the function works on its own in a notebook cell.
    from selenium.common.exceptions import WebDriverException

    button_classes = (
        'css-1on4yel',
        'content-results-list-item-see-more-link',
    )

    for _ in range(reps):
        for class_name in button_classes:
            try:
                driver.find_element(By.CLASS_NAME, class_name).click()
            except WebDriverException:
                # Not present or not clickable: try the other class name.
                continue
            # allowing time for page to load
            time.sleep(3)
            break
        else:
            # Neither button could be clicked: nothing more to load.
            break



In [None]:



def scrape(usrnm, pwd, keywords, reps=100):
    """Log in, search for ``keywords`` and return the links to user posts.

    Args:
        usrnm: Account e-mail address passed to ``nxtdr_login``.
        pwd: Account password passed to ``nxtdr_login``.
        keywords: Search term passed to ``search``.
        reps: Number of "load more" clicks passed to ``search``.

    Returns:
        A DataFrame with a single ``href`` column holding one de-duplicated
        link per user post. Each link is cut at its first ``&`` and kept only
        if it contains ``?post=``.
    """
    nxtdr_login(usrnm, pwd)
    time.sleep(3)
    links = search(keywords, reps)
    time.sleep(2)

    # Anchors without an href come back as None; drop them so the string
    # operations below cannot fail on missing values.
    hrefs = pd.Series(links, dtype=object).dropna()
    # clean href: keep everything before the first '&'
    hrefs = hrefs.str.split('&').str[0]
    # drop duplicate posts
    links_df = hrefs.to_frame('href').drop_duplicates(keep='first')
    # only take user posts, not ads etc.; '?post=' is matched literally
    # (as a regex a leading '?' is not a valid pattern).
    usr_post = links_df['href'].str.contains('?post=', regex=False, na=False)
    return links_df[usr_post.astype(bool)]




def get_links():
    """Collect the link targets of the anchors on the current page.

    Intended to be called once the search results are fully loaded.

    Returns:
        A list of href strings, in page order. The first five anchors are
        skipped (advertisements), and anchors that carry no href are left out
        so callers only ever receive strings.
    """
    # all links on webpage (web elements)
    feed = driver.find_elements(By.TAG_NAME, 'a')

    # the first five links are advertisements
    hrefs = (post.get_attribute('href') for post in feed[5:])
    return [href for href in hrefs if href]


def get_post(href):
    """Open a post in the browser and scrape its text and meta information.

    Args:
        href: Link to the post. The text between the first and second ``=``
            (or the end of the string) is used as the post id.

    Returns:
        A dict with the keys ``post_id``, ``author``, ``date``, ``location``
        and ``post``.

    Raises:
        IndexError: If ``href`` contains no ``=`` or the post header has
            fewer links than expected.
        selenium.common.exceptions.NoSuchElementException: If an expected
            element is missing from the page.
    """
    # open post in browser
    driver.get(href)
    # give time to load
    time.sleep(3)

    # find_element(By...) replaces the find_element_by_* helpers, which were
    # removed in Selenium 4.3; this form works on Selenium 3 as well.
    # container for post and comments
    post_container = driver.find_element(By.CLASS_NAME, 'css-1dkvlfs')
    # container for post
    main_post_container = post_container.find_element(
        By.CLASS_NAME, 'cee-media-body'
    )
    # actual text of main post
    main_post = main_post_container.find_element(By.CLASS_NAME, 'Linkify').text
    # post id
    post_id = href.split('=')[1]

    # meta info
    # location
    main_post_location = main_post_container.find_element(
        By.TAG_NAME, 'button'
    ).text
    # author name
    meta = main_post_container.find_elements(By.TAG_NAME, 'a')
    main_post_author = meta[0].text

    # post date
    # entities that mess up the date pull: for these authors the date is read
    # from the third link instead of the second
    date_fix_list = ('City of Denver', 'News', 'Denver Police Department')
    date_index = 2 if main_post_author in date_fix_list else 1
    main_post_date = meta[date_index].text

    # post to append to post_df
    return {
        'post_id': post_id,
        'author': main_post_author,
        'date': main_post_date,
        'location': main_post_location,
        'post': main_post,
    }




def get_comments(href):
    """Scrape the comments of the post currently open in the browser.

    Does not navigate; it reads whatever page ``driver`` is showing.

    Args:
        href: Link to the open post. The text after its first ``=`` is used
            as the post id for every comment row.

    Returns:
        A DataFrame with the columns ``post_id``, ``location``, ``date``,
        ``author`` and ``comment``, one row per comment.

    Raises:
        IndexError: If ``href`` contains no ``=`` or a comment block has only
            one line of text.
        ValueError: If the number of dates found on the page differs from the
            number of comments.
        selenium.common.exceptions.NoSuchElementException: If an expected
            element is missing from the page.
    """
    # find_element(By...) replaces the find_element_by_* helpers, which were
    # removed in Selenium 4.3; this form works on Selenium 3 as well.
    # container for post and comments
    post_container = driver.find_element(By.CLASS_NAME, 'css-1dkvlfs')
    # the boxes around each of the comments on the post
    comment_windows = post_container.find_elements(
        By.CLASS_NAME, 'js-media-comment'
    )

    # post id, identical for every comment of this post
    post_id = href.split('=')[1]

    comments, locations, authors = [], [], []
    for window in comment_windows:
        block_text = window.find_element(By.CLASS_NAME, 'css-1srqc6z').text
        # the second line of the block is kept as the comment text
        comments.append(block_text.split('\n')[1])
        locations.append(
            window.find_element(By.CLASS_NAME, 'comment-detail-scopeline').text
        )
        authors.append(
            window.find_element(By.CLASS_NAME, 'author-menu-box-container').text
        )

    # NOTE(review): dates are collected page-wide rather than per comment
    # box; if the page shows this class anywhere else the lengths differ and
    # building the frame raises ValueError - confirm against the live markup.
    dates = [
        element.text
        for element in driver.find_elements(By.CLASS_NAME, 'css-9p9z55')
    ]

    return pd.DataFrame(
        {
            'post_id': [post_id] * len(comment_windows),
            'location': locations,
            'date': dates,
            'author': authors,
            'comment': comments,
        },
        columns=['post_id', 'location', 'date', 'author', 'comment'],
    )


# In[31]:


def get_content(df):
    """Scrape every post in ``df`` together with its comments.

    Args:
        df: DataFrame with an ``href`` column of post links, as returned by
            ``scrape``.

    Returns:
        A tuple ``(posts_df, comments_master_df)``. ``posts_df`` has the
        columns ``post_id, author, location, date, post`` with one row per
        scraped post. ``comments_master_df`` has the columns
        ``post_id, author, location, date, comment`` with the comments of all
        posts stacked; each post's rows keep their own 0-based index.

    Links that fail to scrape are reported and skipped. If the post itself was
    read but its comments failed, the post is still kept. Only ``Exception``
    is caught, so Ctrl-C / a kernel interrupt stops the run.
    """
    post_columns = ['post_id', 'author', 'location', 'date', 'post']
    comment_columns = ['post_id', 'author', 'location', 'date', 'comment']

    # Rows are collected in lists and assembled once at the end:
    # DataFrame.append was removed in pandas 2.0.
    posts = []
    comment_frames = []

    for href in df['href']:
        try:
            posts.append(get_post(href))
            comments_df = get_comments(href)
        except Exception as err:
            print(f'Skipping {href}: {err!r}')
            continue
        if not comments_df.empty:
            comment_frames.append(comments_df)

    posts_df = pd.DataFrame(posts, columns=post_columns)
    if comment_frames:
        comments_master_df = pd.concat(comment_frames).reindex(
            columns=comment_columns
        )
    else:
        comments_master_df = pd.DataFrame(columns=comment_columns)
    return posts_df, comments_master_df



def run_scrape():
    """Prompt for credentials and a search term, then run a full scrape.

    The password is read with ``getpass`` so it is not echoed to the terminal
    or stored in the notebook output.

    Returns:
        The ``(posts, comments)`` tuple produced by ``get_content``.
    """
    # Imported here so the function works on its own in a notebook cell.
    from getpass import getpass

    print('Ready To Scrape Nextdoor!')
    usr_email = input('What is your email:')
    pwd = getpass('Enter Pwd:')
    keyword = input('Input Search Term:')

    href_list = scrape(usr_email, pwd, keyword)
    posts, comments = get_content(href_list)

    return posts, comments




# Web Scraper V2
___
___
 
It appeared during my scrape that the nextdoor.com server noticed my activity and began responding with altered HTML and URL formats. This may have also been due to routine maintenance on the server. In either case, here is the code that was written to get around the changes when they were encountered. This will return a dataframe where the post_id is in a different format than the previous version.



In [None]:



def scrape_2(usrnm, pwd, keywords='homeless', reps=100):
    """Log in, search and return the post links in the ``/p/`` URL format.

    Args:
        usrnm: Account e-mail address passed to ``nxtdr_login``.
        pwd: Account password passed to ``nxtdr_login``.
        keywords: Search term passed to ``search``; defaults to the term the
            function used to have hard-coded.
        reps: Number of "load more" clicks passed to ``search``.

    Returns:
        A DataFrame with a single ``href`` column holding one de-duplicated
        link per user post. Each link is cut at its first ``?`` and kept only
        if it contains ``/p/``.
    """
    nxtdr_login(usrnm, pwd)
    time.sleep(3)
    links = search(keywords, reps)
    time.sleep(2)

    # Anchors without an href come back as None; drop them so the string
    # operations below cannot fail on missing values.
    hrefs = pd.Series(links, dtype=object).dropna()
    # clean href: keep everything before the query string
    hrefs = hrefs.str.split('?').str[0]
    # drop duplicate posts
    links_df = hrefs.to_frame('href').drop_duplicates(keep='first')
    # only take user posts, not ads etc.
    usr_post = links_df['href'].str.contains('/p/', regex=False, na=False)
    return links_df[usr_post.astype(bool)]



def get_post_2(href):
    """Open a ``/p/``-style post link and scrape its text and meta information.

    Args:
        href: Link to the post. The part after ``/p/``, without any query
            string, is used as the post id - the same rule ``get_comments_2``
            applies, so posts and comments share ids.

    Returns:
        A dict with the keys ``post_id``, ``author``, ``date``, ``location``
        and ``post``.

    Raises:
        IndexError: If ``href`` contains no ``/p/`` or the post header has
            fewer links than expected.
        selenium.common.exceptions.NoSuchElementException: If an expected
            element is missing from the page.
    """
    # open post in browser
    driver.get(href)
    # give time to load
    time.sleep(1)

    # find_element(By...) replaces the find_element_by_* helpers, which were
    # removed in Selenium 4.3; this form works on Selenium 3 as well.
    # container for post
    main_post_container = driver.find_element(By.CLASS_NAME, 'cee-media-body')
    # actual text of main post
    main_post = main_post_container.find_element(By.CLASS_NAME, 'Linkify').text
    # post id
    post_id = href.split('/p/')[1].split('?')[0]

    # meta info
    # location
    main_post_location = main_post_container.find_element(
        By.TAG_NAME, 'button'
    ).text
    # author name
    meta = main_post_container.find_elements(By.TAG_NAME, 'a')
    main_post_author = meta[0].text

    # post date
    # entities that mess up the date pull: for these authors the date is read
    # from the third link instead of the second
    date_fix_list = ('City of Denver', 'News', 'Denver Police Department')
    date_index = 2 if main_post_author in date_fix_list else 1
    main_post_date = meta[date_index].text

    # post to append to post_df
    return {
        'post_id': post_id,
        'author': main_post_author,
        'date': main_post_date,
        'location': main_post_location,
        'post': main_post,
    }



def get_comments_2(href):
    """Scrape the comments of the ``/p/``-style post open in the browser.

    Does not navigate; it reads whatever page ``driver`` is showing.

    Args:
        href: Link to the open post. The part after ``/p/``, without any
            query string, is used as the post id for every comment row.

    Returns:
        A DataFrame with the columns ``post_id``, ``location``, ``date``,
        ``author`` and ``comment``, one row per comment.

    Raises:
        IndexError: If ``href`` contains no ``/p/``.
        ValueError: If the numbers of dates, authors and comment texts found
            on the page differ.
        selenium.common.exceptions.NoSuchElementException: If the comment
            container is missing from the page.
    """
    # Imported here so the function works on its own in a notebook cell.
    from selenium.common.exceptions import NoSuchElementException

    time.sleep(1)

    # find_element(By...) replaces the find_element_by_* helpers, which were
    # removed in Selenium 4.3; this form works on Selenium 3 as well.
    # Click the "see previous comments" button when the page has one; a
    # missing button no longer discards the comments that are already shown.
    try:
        see_more_comments = driver.find_element(
            By.CLASS_NAME, 'see-previous-comments-button-paged'
        )
    except NoSuchElementException:
        see_more_comments = None
    if see_more_comments is not None:
        see_more_comments.click()

    # the box around the comments on the post
    comment_window = driver.find_element(By.CLASS_NAME, 'css-1cefqj0')
    # tags containing authors and locations
    comments_meta = comment_window.find_elements(By.CLASS_NAME, 'css-15h9wih')
    # actual text of comments
    comments_text = comment_window.find_elements(By.CLASS_NAME, '_1aEnMjGe')
    # the dates of the comments
    comments_dates = comment_window.find_elements(By.CLASS_NAME, 'css-9p9z55')

    comments = [element.text for element in comments_text]
    dates = [element.text for element in comments_dates]

    authors, locations = [], []
    for element in comments_meta:
        # meta text is split on ' • ': author first, location second
        parts = element.text.split(' • ')
        authors.append(parts[0])
        # keep the row even when no location is shown
        locations.append(parts[1] if len(parts) > 1 else '')

    # post id, identical for every comment of this post
    post_id = href.split('/p/')[1].split('?')[0]

    return pd.DataFrame(
        {
            'post_id': [post_id] * len(comments_dates),
            'location': locations,
            'date': dates,
            'author': authors,
            'comment': comments,
        },
        columns=['post_id', 'location', 'date', 'author', 'comment'],
    )



def get_content_2(df):
    """Scrape every ``/p/``-style post in ``df`` together with its comments.

    Args:
        df: DataFrame with an ``href`` column of post links, as returned by
            ``scrape_2``.

    Returns:
        A tuple ``(posts_df, comments_master_df)``. ``posts_df`` has the
        columns ``post_id, author, location, date, post`` with one row per
        scraped post. ``comments_master_df`` has the columns
        ``post_id, author, location, date, comment`` with the comments of all
        posts stacked; each post's rows keep their own 0-based index.

    Links that fail to scrape are reported and skipped. If the post itself was
    read but its comments failed, the post is still kept. Only ``Exception``
    is caught, so Ctrl-C / a kernel interrupt stops the run.
    """
    post_columns = ['post_id', 'author', 'location', 'date', 'post']
    comment_columns = ['post_id', 'author', 'location', 'date', 'comment']

    # Rows are collected in lists and assembled once at the end:
    # DataFrame.append was removed in pandas 2.0.
    posts = []
    comment_frames = []

    for href in df['href']:
        try:
            post = get_post_2(href)
            # pause between reading the post and reading its comments
            time.sleep(2)
            posts.append(post)
            comments_df = get_comments_2(href)
        except Exception as err:
            print(f'Skipping {href}: {err!r}')
            continue
        if not comments_df.empty:
            comment_frames.append(comments_df)

    posts_df = pd.DataFrame(posts, columns=post_columns)
    if comment_frames:
        comments_master_df = pd.concat(comment_frames).reindex(
            columns=comment_columns
        )
    else:
        comments_master_df = pd.DataFrame(columns=comment_columns)
    return posts_df, comments_master_df

___
___
# Scraping For Data w/ Selenium 
___
___

I created a package for scraping posts, comments, and related metadata by search term using Selenium. The code below was used to execute the Selenium functions from scrape.py in order to create our dataset. You will need to provide your own username and password. An up-to-date version of Chrome and a compatible driver are also needed.

This code can take up to a few hours to run once started, and will not respond to a UserAbort.

In [None]:
# Base address of the site being scraped.
url = 'https://nextdoor.com/'

# set driver
# `driver` is the module-level browser handle the scraping functions read as a global.
# NOTE(review): nothing visible here navigates `driver` to `url` before
# `nxtdr_login` looks for the log-in button - confirm the landing page is opened first.
PATH = '../chromedriver'
driver = webdriver.Chrome(PATH)

In [None]:
# Log in, collect the post links for the search term 'homeless', then scrape
# each post and its comments.
# 'user-email' and 'password' are placeholders - supply real account details.
user1 = scrape('user-email','password','homeless')

p_df, c_df = get_content(user1)

# Save the first volume of posts and comments.
p_df.to_csv('../data/user_data/vol_1_posts.csv')
c_df.to_csv('../data/user_data/vol_1_comments.csv')

In [None]:
# End the current session; the next cell logs in again through scrape().
nxtdr_logout()

In [None]:
# Repeat the scrape after logging in again (presumably with a second account -
# the credentials shown are placeholders) for the same search term.
user2 = scrape('user-email','password','homeless')

p1_df, c1_df = get_content(user2)

# Save the second volume of posts and comments.
p1_df.to_csv('../data/user_data/vol_2_posts.csv')
c1_df.to_csv('../data/user_data/vol_2_comments.csv')