Team Member 1 Name : Emmanuel Johnson<br>
Team Member 1 UBID : egnanach<br>
Team Member 1 PersonNumber : 50290792<br>
Team Member 2 Name : Venktesh Kaviarasan<br>
Team Member 2 UBID : venktesh<br>
Team Member 2 PersonNumber : 50289400

# Packages

Common Packages

In [78]:
import sys
import os
import glob
import csv
import json
import pandas as pd

Packages for Crawling Articles from New York Times and Common Crawl

In [79]:
import requests
import codecs
import time
import gzip
from io import BytesIO
from bs4 import BeautifulSoup
from urllib.parse import urlparse

Packages for getting data from Twitter

In [80]:
import tweepy
import jsonpickle
from tqdm import tqdm

Packages for Processing the crawled data

In [81]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer as ss
from nltk.stem import WordNetLemmatizer as wn
#from nltk.stem import LancasterStemmer as ls

# Common Helper Functions

In [14]:
def saveFile(fileName, content):
    """Write ``content`` to ``fileName``, replacing any existing file.

    Args:
        fileName: Path of the file to create or overwrite.
        content: Text to write.
    """
    # The context manager closes the handle even when write() raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(fileName, "w") as out:
        out.write(content)

In [15]:
def saveUrlIndex(query, result_urls, ctype):
    """Save the crawled URLs as a one-column CSV under ``../urls/``.

    The file is named ``<query>-<ctype>-links.csv`` and holds a ``URL``
    header row followed by one row per entry of ``result_urls``.
    """
    target = "%s-links.csv" % ("../urls/" + query + "-" + ctype)
    header = ["URL"]
    with codecs.open(target, "wb", encoding="utf-8") as sink:
        rows = csv.DictWriter(sink, fieldnames=header)
        rows.writeheader()
        rows.writerows({"URL": link} for link in result_urls)

In [16]:
# Sport name -> short code used when building output file names
# (tweet CSVs and processed article files).
fnConfig = dict(
    golf="g",
    baseball="bsb",
    basketball="bkb",
    football="fb",
)

In [82]:
# Common Crawl query term -> sport name; the sport name is the output
# sub-folder used when saving Common Crawl articles.
ccConfig = dict(
    mlb="baseball",
    nfl="football",
    nba="basketball",
    golf="golf",
)

# New York Times Crawling

In [17]:
# New York Times API credentials.
# The NYT_APP_KEY / NYT_APP_SECRET environment variables take precedence so
# the key can be rotated without editing the notebook. The literals below are
# kept only as a backward-compatible fallback.
# NOTE(review): these values are committed in plain text; they should be
# rotated and the fallbacks removed.
NYT_APP_KEY = os.environ.get("NYT_APP_KEY", "WTjnKw4UrXjAY7E7FrIGX9HKYj1lbbY8")
NYT_APP_SECRET = os.environ.get("NYT_APP_SECRET", "e28wPvKPBTf2GcAd")

In [18]:
#New York Times API search request URL
def formUrl(query, fromDate, toDate, fl, page=0):
    """Build the NYT article-search request URL.

    Args:
        query: Search term; percent-encoded before being placed in the URL.
        fromDate: Value for the ``begin_date`` parameter (e.g. "20190101").
        toDate: Value for the ``end_date`` parameter.
        fl: Comma-separated field list for the ``fl`` parameter.
        page: Result page number (defaults to 0).

    Returns:
        The request URL as a string. ``page`` is always the LAST parameter
        because getNYTUrls replaces the final ``&``-separated component when
        it moves to the next page.
    """
    # Local import: only urlparse is imported at file level.
    from urllib.parse import quote

    # Escape the query so characters such as "&", "#" or "=" cannot break the
    # parameter list; plain words like "golf" are left unchanged.
    return (
        "https://api.nytimes.com/svc/search/v2/articlesearch.json"
        + "?begin_date=" + fromDate
        + "&end_date=" + toDate
        + "&fl=" + fl
        + "&q=" + quote(query, safe="")
        + "&api-key=" + NYT_APP_KEY
        + "&page=" + str(page)
    )

In [19]:
def getNYTUrls(url):
    """Collect the unique article URLs for an NYT search, page by page.

    Args:
        url: Request URL for page 0 whose last ``&``-separated parameter is
            ``page=...`` (as produced by formUrl).

    Returns:
        List of ``web_url`` values in first-seen order, without duplicates.

    Paging stops when a page returns no documents. A non-200 response no
    longer advances to the next page forever: a 429 (rate limit) is retried
    on the same page a bounded number of times, and any other error status
    ends the crawl with whatever has been collected so far.
    """
    max_retries = 3
    retries = 0
    page = 0
    seen = set()          # O(1) duplicate check
    result_url = list()   # preserves first-seen order
    while True:
        response = requests.get(url)
        if response.status_code != 200:
            print("#### HTTP ", response.status_code, " on page ", page, " ####")
            if response.status_code == 429 and retries < max_retries:
                # Rate limited: wait out the window and retry the same page.
                retries += 1
                time.sleep(61)
                continue
            break
        retries = 0
        docs = json.loads(response.content)["response"]["docs"]
        print("#### ", len(docs), " ####")
        if not docs:
            break
        for d in docs:
            link = d["web_url"]
            if link not in seen:
                seen.add(link)
                result_url.append(link)
        page += 1
        # Swap the trailing "page=N" parameter for the next page number.
        url = "&".join(url.split("&")[:-1])+"&page="+str(page)
        print(page, url)
        # Pause after every block of ten requests to stay under the rate limit.
        if ((page+1) % 10) == 0:
            time.sleep(61)
    return result_url

In [20]:
def nytMain(query, folder):
    """Crawl New York Times articles for ``query`` and save their text.

    Args:
        query: Search term ("golf", "baseball", "basketball" or "football").
        folder: Output folder name; articles are written to
            ``../<folder>/nyt/<query>/<n>.txt`` where ``n`` is the position
            of the URL in the search result list.

    The URL list itself is saved through saveUrlIndex. Pages that do not
    return HTTP 200 or have no ``articleBody`` section are skipped, so the
    file numbering may contain gaps.
    """
    url = formUrl(query, "20190101", "20190331", "web_url,document_type")

    #Get all the related urls for the given query
    result_urls = getNYTUrls(url)
    print("Total New York Times URLs : "+str(len(result_urls)))
    #Save the resulting url in a csv file
    saveUrlIndex(query, result_urls, 'nyt')

    #Make sure the destination exists before the first article is written
    outDir = "../"+folder+"/nyt/"+query+"/"
    os.makedirs(outDir, exist_ok=True)

    #Scrape the content of the urls received using nyt api
    for counter, url in enumerate(result_urls):
        response = requests.get(url)
        if response.status_code != 200:
            continue
        parser = BeautifulSoup(response.content, 'html.parser')
        #Search for section tag with name attribute as articleBody
        article = parser.find("section", {"name":"articleBody"})
        if not article:
            continue
        #Get all the p tag texts
        paras = article.find_all("p")
        if not paras:
            continue
        # One paragraph per line: concatenating them with no separator fused
        # the last word of a paragraph with the first word of the next one.
        content = "\n".join(p.text.strip() for p in paras)
        saveFile(outDir+str(counter)+".txt", content)

# Common Crawl Crawling

In [21]:
#https://www.bellingcat.com/resources/2015/08/13/using-python-to-mine-common-crawl/
def getCCUrls(domain, index_list):
    """Query the Common Crawl index for ``domain`` in each listed index.

    Args:
        domain: URL pattern passed as the ``url`` query parameter.
        index_list: Index identifiers (e.g. "2019-04") appended to "CC-MAIN-".

    Returns:
        List of records, one per line of every HTTP 200 response, each
        decoded from JSON. Indexes answering with another status are skipped.
    """
    found = []
    for idx in index_list:
        print("Current Index : " + idx)
        endpoint = "http://index.commoncrawl.org/CC-MAIN-"+idx+"-index?url="+domain+"&output=json"
        print("Current URL : " + endpoint)
        reply = requests.get(endpoint)
        if reply.status_code != 200:
            continue
        # The response body holds one JSON document per line.
        found.extend(json.loads(line) for line in reply.content.splitlines())
    print("# Records Found : " + str(len(found)))
    return found

In [52]:
def getHtmlDoc(record):
    """Download one Common Crawl capture and return its HTTP response body.

    Args:
        record: Index record with ``offset``, ``length`` and ``filename``
            keys (offset/length are converted with ``int``).

    Returns:
        The part of the decompressed record after the second blank line
        (WARC header, HTTP header, then body), or "" when the download
        fails, the payload is not valid gzip, is not valid UTF-8, or does
        not split into those three parts.
    """
    # Local import: zlib is not imported at file level.
    import zlib

    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1
    # NOTE(review): Common Crawl may no longer serve public downloads from
    # this S3 host — confirm the current download prefix before a new crawl.
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    # Fetch only this record's byte range out of the large archive file.
    resp = requests.get(prefix + record['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
    if not resp.ok:
        # An error page is not gzip data; skip the record instead of crashing.
        return ""
    try:
        data = gzip.GzipFile(fileobj=BytesIO(resp.content)).read()
    except (OSError, EOFError, zlib.error):
        return ""
    if not data:
        return ""
    try:
        _warc, _header, body = data.strip().decode(encoding='utf-8', errors='strict').split('\r\n\r\n', 2)
    except (UnicodeDecodeError, ValueError):
        # Not UTF-8, or fewer than three sections: treat as "no document".
        return ""
    return body

In [83]:
def ccMain(query, folder, domains):
    """Crawl Common Crawl captures for ``domains`` and save article text.

    Args:
        query: Key of ccConfig ("golf", "mlb", "nfl" or "nba"); the mapped
            sport name is the output sub-folder.
        folder: Output folder name; articles are written to
            ``../<folder>/cc/<sport>/<n>.txt``.
        domains: URL patterns looked up in the Common Crawl index.

    Only https captures are used, and each URL path is scraped once. The
    de-duplicated URLs (query string removed) are saved via saveUrlIndex.
    """
    #mlb - baseball || nfl - football || nba - basketball || golf - golf

    #Common crawl index list : 2019-04 for Jan, 2019-09 for Feb and 2019-13 for Mar (YYYY-Week)
    index_list = ["2019-04","2019-09","2019-13"]

    #For the given domains, collect all records present for the given date range (index_list)
    record_list = list()
    for domain in domains:
        record_list += getCCUrls(domain, index_list)
    print("Total Common Crawl URLs : ", len(record_list))

    #Make sure the destination exists before the first article is written
    outDir = "../"+folder+"/cc/"+ccConfig[query]+"/"
    os.makedirs(outDir, exist_ok=True)

    #Scrape the content using the url data from the records received from common crawl
    result_urls = list()
    seenPaths = set()   # O(1) duplicate check on the URL path
    counter = 0
    for record in record_list:
        url = urlparse(record['url'])
        #Ignore http urls because it's a duplicate of a https url
        if url.scheme == "http":
            continue
        urlPath = url.path
        #Only scrape if the url is not scraped before (removes duplicates)
        if urlPath not in seenPaths:
            seenPaths.add(urlPath)
            #Remove query strings from the url. split() leaves the URL intact
            #when there is no "?", whereas slicing at find()'s -1 dropped the
            #last character.
            result_urls.append(url.geturl().split('?', 1)[0])
            html_content = getHtmlDoc(record)
            #Name the parser explicitly, as nytMain does
            parser = BeautifulSoup(html_content, 'html.parser')
            #Search for div tag with itemprop attribute as articleBody
            article = parser.find("div", {"itemprop":"articleBody"})
            if article:
                #Get the text from all the p tag with class name as p-text
                paras = article.find_all("p", {"class":"p-text"})
                if paras:
                    # One paragraph per line so words are not fused across
                    # paragraph boundaries.
                    content = "\n".join(p.text.strip() for p in paras)
                    saveFile(outDir+str(counter)+".txt", content)
        counter += 1
    print(len(result_urls))
    saveUrlIndex(query, result_urls, 'cc')

# Twitter Data Collection

In [65]:
def credentials_checker():
    """Create the Tweepy API client used for tweet collection.

    The credentials are written to ``twitter_credentials.json`` in the
    current directory, read back, and used to build an OAuth handler.

    Returns:
        A ``tweepy.API`` instance created with ``wait_on_rate_limit=True``.
    """
    # NOTE(review): these credentials are committed in plain text and are
    # written to disk on every call; they should be rotated and loaded from
    # outside the notebook.
    twitter_cred = dict()
    twitter_cred['CONSUMER_KEY'] = "GlQmi0BC7y1jspiPu3xWtcUon"
    twitter_cred['CONSUMER_SECRET'] = "KAnScl27IGPom4nVYw44126ursS7c1TiKlfG6TRkGBFieeK0Sm"
    twitter_cred['ACCESS_KEY'] = "1357145954-ZIsxSkVYa8ixkMVMwA0Wm6pFhoF4MYsM1whvU8c"
    twitter_cred['ACCESS_SECRET'] = "QuJ6qR8bfThPuAjjHnyQz0gjknLd6o8BJW1Iqh7NO00M3"

    with open('twitter_credentials.json', 'w') as secret_info:
        json.dump(twitter_cred, secret_info, indent=4, sort_keys=True)

    with open("twitter_credentials.json") as cred_data:
        info = json.load(cred_data)
        consumer_key = info['CONSUMER_KEY']
        consumer_secret = info['CONSUMER_SECRET']
        access_key = info['ACCESS_KEY']
        access_secret = info['ACCESS_SECRET']

    #Create API Endpoint
    auth = tweepy.OAuthHandler(consumer_key,consumer_secret)
    # The access token pair was loaded but never applied to the handler.
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth,wait_on_rate_limit=True)
    if(not api):
        print("Problem Connecting to API")
    return api

In [66]:
def find_tweets(sport,search_query,api):
    """Fetch up to 12000 English, non-retweet tweets matching a search query.

    Args:
        sport: Sport name. Not used for the search itself; kept so existing
            callers keep working.
        search_query: Search expression (terms joined with OR).
        api: Tweepy API client whose ``search`` method is paged through.

    Returns:
        Dict mapping tweet id to tweet text, one entry per distinct id.
    """
    # The exclusion operator must be its own token; without the leading
    # space it was glued onto the last search term.
    query_term = search_query + " -filter:retweets"
    max_tweets = 12000
    dict_store = {}

    #Using Tweepy Cursor to fetch Tweets
    print("Collecting Tweets")
    for tweet in tqdm(tweepy.Cursor(api.search, q=query_term, geocode="39.7837304,-100.4458825,2000mi",lang='en').items(max_tweets)):
        # Keep the first text seen for each tweet id.
        if tweet.id not in dict_store:
            dict_store[tweet.id] = tweet.text
    print("Downloaded {0} tweets".format(len(dict_store)))
    return dict_store

In [67]:
def tweet_writer(sport,myDict):
    """Write collected tweets to ``../new_data/tw/tw_<code>.csv``.

    Args:
        sport: Key of fnConfig; its short code forms the file name.
        myDict: Mapping of tweet id to tweet text.

    The CSV has a ``tweet_Id,tweet_text`` header and one row per entry.
    """
    print("Writing the tweets into a file")
    # newline="" is what the csv module expects (no blank rows on Windows),
    # and UTF-8 keeps emoji / non-ASCII tweet text writable on any platform.
    with open("../new_data/tw/tw_"+fnConfig[sport]+".csv", "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["tweet_Id","tweet_text"])
        writer.writerows(myDict.items())

In [68]:
def tweetMain(sport):
    """Collect tweets for one sport and save them as a CSV file.

    Args:
        sport: One of "basketball", "football", "golf" or "baseball". Any
            other value prints an error message and collects nothing.
    """
    # Search expression per sport. The basketball query previously contained
    # a doubled "OR OR".
    queries = {
        "basketball": 'NBA OR "NCAA MBB" OR NCAA OR Basketball OR "NCAA WBB" OR #NBAPlayoffs OR #NCAA OR #USBasketball OR #BasketballYouth',
        "football": '"USA Football" OR NFL OR #USAFootball OR #NFL OR #NFLDraftNews OR #Quarterbacks OR #AmericanFootball',
        "golf": 'LPGA OR "Golf USA" OR "Masters Tournament" OR "USC Men Golf" OR #golfUSA OR #PGATour OR #golfnews OR #ANWAgolf OR #golf OR #TeamTitleist OR #golfchannel OR #AggieGolf OR #golfweek',
        "baseball": 'MLB OR "Minor League Baseball" OR baseball OR #BlueJays OR #Padres OR #Rockies OR #Mariners OR #GeauxTigers OR #IUBase OR #HailState OR #Pirates OR #ClawsUp',
    }

    print("Checking Credentials & Creating end point")
    API=credentials_checker()

    print("Verifying Input......")
    search_query = queries.get(sport)
    if search_query is None:
        print("Wrong Search Term Cannot Process Request")
        return

    tweets = find_tweets(sport,search_query,API)
    tweet_writer(sport,tweets)

# Data Processing

In [57]:
# Single lowercase letters; tokens equal to one of these are dropped during
# text processing.
letters = list("abcdefghijklmnopqrstuvwxyz")

In [32]:
def stemSentence(sentence):
    """Return ``sentence`` with every token replaced by its Snowball stem.

    The sentence is tokenized with ``word_tokenize``; each stem is followed
    by a single space, so a non-empty result ends with a trailing space.
    """
    stemmer = ss("english")
    return "".join(stemmer.stem(token) + " " for token in word_tokenize(sentence))

In [49]:
def processArticles(files, dest, fn):
    """Clean and stem each article file and write the result to ``dest``.

    Args:
        files: Paths of the raw article text files.
        dest: Output location prefix; concatenated directly with the file
            name, so it must end with a path separator.
        fn: File-name prefix; output is ``<dest><fn><index>.txt`` where
            ``index`` is the file's position in ``files``.

    Each input line is lower-cased, stripped of URLs, underscores, digits
    and punctuation, filtered of stop words and single letters, stemmed,
    and written as one output line.
    """
    # Build the lookup sets once; they were previously rebuilt / scanned for
    # every single token.
    stop_words = set(stopwords.words('english'))
    single_letters = set(letters)
    for counter, f in enumerate(files):
        print("Processing File # {}".format(counter), end="\r", flush=True)
        refinedLines = []
        # "with" closes the input even if processing a line raises.
        with open(f,"r") as sample:
            for line in sample:
                lowerLine = line.lower()
                noUrls = re.sub(r"http\S+", "", lowerLine)
                noUnderscore = noUrls.replace("_", "")
                #remove digits
                noNoLine = re.sub(r'\d+', '', noUnderscore)
                #remove punctuations
                words = re.findall(r'\w+', noNoLine, flags = re.UNICODE)
                #remove stop words
                important_words = [
                    w for w in words
                    if w not in stop_words and not w.isdigit() and w not in single_letters
                ]
                #Get root words for the given words
                refinedLines.append(stemSentence(" ".join(important_words)) + "\n")
        saveFile(dest+fn+str(counter)+".txt", "".join(refinedLines))

In [39]:
def processTweets(csvFile, dest):
    """Clean, stem and de-duplicate the tweets stored in a CSV file.

    Args:
        csvFile: Path to a CSV with a ``tweet_text`` column.
        dest: Output location prefix; the result is written to
            ``<dest><csv base name>.txt``, one processed tweet per line.

    Tweets whose processed form was already seen are skipped. Rows with a
    missing ``tweet_text`` are ignored.
    """
    fn = csvFile.split(".csv")[0].split("/")[-1]
    df = pd.read_csv(csvFile)
    # Empty text cells are read as NaN (a float), which has no .lower().
    tweets = df.tweet_text.dropna()
    # Build the lookup sets once; they were previously rebuilt / scanned for
    # every single token.
    stop_words = set(stopwords.words('english'))
    single_letters = set(letters)
    seen = set()          # O(1) duplicate check on the processed text
    refinedLines = []
    count = 0
    for tweet in tweets:
        print("Processing Tweet # {}".format(count), end="\r", flush=True)
        lowerLine = str(tweet).lower()
        noUrls = re.sub(r"http\S+", "", lowerLine)
        noUnderscore = noUrls.replace("_", "")
        #remove digits
        noNoLine = re.sub(r'\d+', '', noUnderscore)
        #remove punctuations
        words = re.findall(r'\w+', noNoLine, flags = re.UNICODE)
        #remove stop words
        important_words = [
            w for w in words
            if w not in stop_words and not w.isdigit() and w not in single_letters
        ]
        #Get root words for the given words
        refinedStemmed = stemSentence(" ".join(important_words))
        if refinedStemmed in seen:
            continue
        seen.add(refinedStemmed)
        refinedLines.append(refinedStemmed + "\n")
        count += 1
    saveFile(dest+fn+".txt", "".join(refinedLines))

In [27]:
def processDataMain(directory, dest, ctype):
    """Run the processing step for tweets or for a folder of articles.

    Args:
        directory: For ``ctype == "twitter"``, the path of the tweet CSV.
            Otherwise a folder of ``*.txt`` articles whose last path
            component is a sport name present in fnConfig. A trailing
            slash is accepted but no longer required.
        dest: Output location prefix passed on to the processing function.
        ctype: "twitter", or a source tag (e.g. "nyt") used as the prefix
            of the processed article file names.
    """
    if ctype == "twitter":
        processTweets(directory, dest)
        return
    # normpath drops any trailing separator so the sport folder is always
    # the last component, with or without a trailing slash.
    sport = os.path.basename(os.path.normpath(directory))
    files = glob.glob(os.path.join(directory, "*.txt"))
    fn = ctype+"_"+fnConfig[sport]+"_"
    processArticles(files, dest, fn)

# Main Pipeline

### Crawling
-------------------------------------------------------------

New York Times<br>
Common Crawl<br>
Twitter

In [28]:
#Crawl New York Times articles for the given query
#The crawl functions prepend "../" to the folder themselves, so pass the bare
#folder name. "../new_data" made the crawlers write to ../../new_data while
#the processing cells read from ../new_data.
destinationDir = "new_data"
nQuery = "golf"
nytMain(nQuery, destinationDir)

In [29]:
#Crawl Common Crawl Index for the given query
#We use usatoday.com as a source for our sports news from USA
#One URL pattern per month (January to March 2019); destinationDir comes from
#the New York Times crawl cell.
cQuery = "golf"
domains = ["usatoday.com/story/sports/"+cQuery+"/2019/"+month+"/*" for month in ("01", "02", "03")]
ccMain(cQuery, destinationDir, domains)

In [60]:
#Collect tweets for the given sport
#The collection entry point is tweetMain; the previous call to twMain raised
#a NameError because no function of that name is defined.
tQuery = "golf"
tweetMain(tQuery)

### Processing Data
-------------------------------------------------------------

In [43]:
#Process the crawled data
#Twitter
#csvFile is a tweet CSV in the layout written by tweet_writer
#(tw_<sport code>.csv). tDest keeps its trailing slash because the output
#file name is appended to it directly.
csvFile = "../new_data/tw/tw_fb.csv"
tDest = "../data/tw/"
processDataMain(csvFile, tDest, "twitter")

Processing Tweet # 15858

In [55]:
#New York Times or Common Crawl
#The last folder of articleDir ("golf") selects the file-name code from
#fnConfig. aDest keeps its trailing slash because the output file name is
#appended to it directly.
articleDir = "../new_data/nyt/golf/"
aDest = "../data/nyt/"
processDataMain(articleDir, aDest, "nyt")

Processing File # 295