In [2]:
import pandas as pd
import numpy as np
import requests
import itertools
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt

#read the file newsCorpora.csv which stores list of urls with corresponding categories
trustlist = pd.read_csv("/home/xiaotianzhou/Downloads/newsCorpora.csv", sep='\t', names = ["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP" ])

In [3]:
trustlist.shape

(422419, 8)

In [4]:
trustlist.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [5]:
# This function will integrate the elements in an array into a single element
def concatenate_list_data(list):
    result = ''
    for element in list:
        result += element
    newlist = [result]
    return newlist

#url_list = trustlist['URL']
#url = url_list[1]
#url

#page1 = requests.get(url)
#page1

#soup1 = BeautifulSoup(page1.content, 'html.parser')

#paragraph = soup1.select("p")

#content1 = [con.get_text() for con in paragraph]
#print(content1)

# remove empty elements
#while '' in content1:
   #content1.remove('')

#content1 = concatenate_list_data(content1)
#print(content1)

#data1 = pd.DataFrame({
    #"content": content1,
    #"label": "Trust"
#})
#data1

In [7]:
#Definition of function which scrape contents from each web page and return a list of instances 
def generate_instance(list):
    content = []
    for url in list:
        content1 = []
        # make a GET request to a web and download html contents, ignore the exception
        try:
            page = requests.get(url)
        except:
            continue
        # parse the document with BeautifulSoup library
        soup = BeautifulSoup(page.content, 'html.parser')
        # get all paragraph tags <p>
        paragraph = soup.select("p")
        # extract all of the text and push into the list of instances
        content1 = [con.get_text() for con in paragraph]
        content1 = concatenate_list_data(content1)
        content.extend(content1)
        # remove empty elements
        while '' in content:
            content.remove('')
        # generate a list with size 1000
        if len(content) < 1000:
            continue
        else:
            break
    return content
                

In [5]:
# split the list of urls based on their category
test = trustlist[trustlist.CATEGORY == "m"]
#test.shape
test

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
4207,4208,Grown-ups: Put down the smartphones at mealtime,http://www.ketknbc.com/news/grown-ups-put-down...,KETK,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,www.ketknbc.com,1394547690684
4208,4209,Cellphone addiction may damage parent-child bo...,http://gadgets.ndtv.com/mobiles/news/cellphone...,NDTV,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,gadgets.ndtv.com,1394547690827
4209,4210,Parents won't stop using smartphones even whil...,http://www.mnn.com/family/family-activities/st...,Mother Nature Network,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,www.mnn.com,1394547691052
4210,4211,Smartphones making parents ignore their kids,http://timesofindia.indiatimes.com/life-style/...,Times of India,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,timesofindia.indiatimes.com,1394547691288
4211,4212,Smartphones can loosen emotional bonding with ...,http://www.utahpeoplespost.com/2014/03/smartph...,The Utah People's Post,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,www.utahpeoplespost.com,1394547691482
4212,4213,Parents Distracted By Smartphones Ignore Their...,http://www.physiciansnews.com/2014/03/10/smart...,Physicians News Digest,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,www.physiciansnews.com,1394547691650
4213,4214,Smartphones may threaten parent-child emotiona...,http://economictimes.indiatimes.com/news/news-...,Economic Times,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,economictimes.indiatimes.com,1394547691868
4214,4215,Cell phone addiction may kill parent-child bond,http://gulfnews.com/news/world/usa/cell-phone-...,gulfnews.com,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,gulfnews.com,1394547692102
4215,4216,Parents often glued to cellphone while kids ea...,http://www.foxnews.com/health/2014/03/11/paren...,Fox News,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,www.foxnews.com,1394547692320
4216,4217,Mobile Phones Affecting Parenting Skills: Study,http://topnews.ae/content/220508-mobile-phones...,TopNews Arab Emirates,m,dZphfxYr-HPiv2MLDkthjDyocSQCM,topnews.ae,1394547692497


In [7]:
url_list = test['URL']
#url_list
#page = requests.get("http://main.omanobserver.om/\?p=63376")

In [204]:
#for loop
content = []
for url in url_list:
    content1 = []
    try:
        page = requests.get(url)
    except:
        continue
    soup = BeautifulSoup(page.content, 'html.parser')
    soup.
    paragraph = soup.select("p")
    content1 = [con.get_text() for con in paragraph]
    content1 = concatenate_list_data(content1)
    content.extend(content1)
    # remove empty elements
    while '' in content:
        content.remove('')
    if len(content) < 1000:
        continue
    else:
        break

len(content)


1000

In [205]:
# Create dataframe and read into a csv file
data1 = pd.DataFrame({
    "content": content,
    "label": "Health"
})
data1.to_csv('Class_Health.csv', sep = '\t')

In [6]:
test2 = trustlist[trustlist.CATEGORY == "b"]
test2

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027
5,6,Plosser: Fed May Have to Accelerate Tapering Pace,http://www.nasdaq.com/article/plosser-fed-may-...,NASDAQ,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.nasdaq.com,1394470372212
6,7,Fed's Plosser: Taper pace may be too slow,http://www.marketwatch.com/story/feds-plosser-...,MarketWatch,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.marketwatch.com,1394470372405
7,8,Fed's Plosser expects US unemployment to fall ...,http://www.fxstreet.com/news/forex-news/articl...,FXstreet.com,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.fxstreet.com,1394470372615
8,9,US jobs growth last month hit by weather:Fed P...,http://economictimes.indiatimes.com/news/inter...,Economic Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,economictimes.indiatimes.com,1394470372792
9,10,ECB unlikely to end sterilisation of SMP purch...,http://www.iii.co.uk/news-opinion/reuters/news...,Interactive Investor,b,dPhGU51DcrolUIMxbRm0InaHGA2XM,www.iii.co.uk,1394470501265


In [9]:
url_list2 = test2['URL']
#url_list2

In [12]:
#for loop
content2 = []
for url in url_list2:
    content1 = []
    try:
        page = requests.get(url)
    except:
        continue
    soup = BeautifulSoup(page.content, 'html.parser')
    paragraph = soup.select("p")
    content1 = [con.get_text() for con in paragraph]
    content1 = concatenate_list_data(content1)
    content2.extend(content1)
    # remove empty elements
    while '' in content2:
        content2.remove('')
    if len(content2) < 1000:
        continue
    else:
        break

len(content2)

1000

In [13]:
data2 = pd.DataFrame({
    "content": content2,
    "label": "Business"
})
data2.to_csv('Class_Business.csv', sep = '\t')

In [8]:
test3 = trustlist[trustlist.CATEGORY == "t"]
test4 = trustlist[trustlist.CATEGORY == "e"]

In [9]:
url_list3 = test3['URL']
#url_list3
url_list4 = test4['URL']
#url_list4

In [10]:
content3 = generate_instance(url_list3)
data3 = pd.DataFrame({
    "content": content3,
    "label": "Science"
})

content4 = generate_instance(url_list4)
data4 = pd.DataFrame({
    "content": content4,
    "label": "Entertainment"
})

In [13]:
data3.to_csv('Class_Science.csv', sep = '\t')
data4.to_csv('Class_Entertainment.csv', sep = '\t')