In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np
from datetime import date, datetime, timedelta
import os
import re

In [None]:

# URL under the Eenadu archives section (presumably its landing page — TODO confirm).
# NOTE(review): nothing in the visible notebook reads this variable.
eenadu_archives = 'https://www.eenadu.net/archivesdet/home/'

In [None]:
# Category label -> URL prefix of that section's archive listing.
# The crawl loop appends a 'dd-mm-YYYY' date string to each prefix.
category_links = {
    'eenadu_cinema': 'https://www.eenadu.net/archivespage/cinemamore/',
    'eenadu_sports': 'https://www.eenadu.net/archivespage/sportsmore/',
    'eenadu_crime': 'https://www.eenadu.net/archivespage/crimemore/',
    'eenadu_business': 'https://www.eenadu.net/archivespage/businessmore/',
    'eenadu_national': 'https://www.eenadu.net/archivespage/nationalmore/',
}

# The crawl looks for article links inside the <ul> element carrying
# class="article-box-list no-space-t no-space-b"

In [None]:
def readout_buffer(response):
    """Drain ``response`` and return its body decoded as UTF-8 text.

    The raw bytes returned by ``response.read()`` are also stored on the
    object as ``response.text`` before decoding.

    Raises ``UnicodeDecodeError`` if the payload is not valid UTF-8.
    """
    raw_bytes = response.read()
    response.text = raw_bytes
    return raw_bytes.decode('utf-8')

def get_data_from_url(url, timeout=30):
    """Download ``url`` and return the response body as text.

    This is a best-effort fetch: any failure (malformed URL, network or HTTP
    error, timeout, undecodable body) yields an empty string instead of an
    exception, so a long crawl is not aborted by a single bad page.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float, optional
        Seconds to wait on blocking network operations before giving up.
        Without a timeout a single stalled connection would hang the crawl.

    Returns
    -------
    str
        The decoded page, or ``""`` when the fetch failed.
    """
    try:
        # The context manager closes the connection even if reading/decoding fails.
        with urlopen(url, timeout=timeout) as response:
            return readout_buffer(response)
    except Exception:
        # Deliberately broad: callers treat "" as "page unavailable".
        return ""

In [None]:
def save_json(d, path):
    """Serialise ``d`` as JSON into the file at ``path``, overwriting it if it exists."""
    with open(path, 'w') as out_file:
        json.dump(d, out_file)
    
def read_json(path):
    """Load and return the JSON document stored in the file at ``path``.

    The file is decoded as UTF-8 explicitly, rather than with the
    platform-dependent default encoding, so files containing non-ASCII text
    load the same way on every system.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)

In [None]:
# Collect article links per category by walking the dated archive listing
# pages backwards from today: one request per (day, category) pair.
#
# Each category needs its own list; dict.fromkeys(keys, []) would make every
# category share a single list object.
news_urls = {k: [] for k in category_links}
N_days = 365
# Resolve "today" once so a crawl that runs past midnight keeps a fixed date window.
today = date.today()
for i in range(N_days):
    current_date = (today - timedelta(days=i)).strftime('%d-%m-%Y')
    for key, base_url in category_links.items():
        # Listing page for this category on this date.
        doc = get_data_from_url(base_url + current_date)
        if not doc:
            # get_data_from_url returns "" on failure; there is nothing to parse.
            continue
        soup = BeautifulSoup(doc, 'html.parser')
        # The article links live inside this specific <ul>.
        link_list = soup.find('ul', {'class': "article-box-list no-space-t no-space-b"})
        if link_list is not None:
            # href=True skips anchors without an href attribute, which would
            # otherwise raise KeyError and abort the whole crawl.
            news_urls[key].extend(
                anchor['href'] for anchor in link_list.find_all('a', href=True)
            )

In [9]:

# Report how many distinct article links were gathered for each category.
for category, links in news_urls.items():
    unique_count = len(set(links))
    print(category, '--->', unique_count)

eenadu_cinema ---> 3122
eenadu_sports ---> 5693
eenadu_crime ---> 3058
eenadu_business ---> 4058
eenadu_national ---> 4596


In [None]:
# Persist the collected links so the article scrape can be run later without repeating the listing crawl.
save_json(news_urls, "links.json")

In [None]:
# Copy the saved link index into the 'drive/My Drive/nlp-telugu/' folder
# (presumably a mounted Google Drive — TODO confirm the mount exists before running).
!cp -r 'links.json' 'drive/My Drive/nlp-telugu/'

In [6]:
# Install selectolax into the runtime before importing its HTML parser.
# NOTE(review): the install is unpinned; the recorded output below shows 0.2.3 was resolved.
!pip install selectolax
from selectolax.parser import HTMLParser
def get_details(url):
    """Fetch one article page and extract its title and body text.

    Parameters
    ----------
    url : str
        Address of the article page.

    Returns
    -------
    list
        ``[title, article]``. The title is the text of the page's ``<title>``
        element(s) and the article is the text of every ``span.text-justify``
        element, each joined with ``'\\n '``. Both are ``""`` when the page
        could not be downloaded or parsed. A list (not a tuple) is returned
        because the scraping loop appends the category label to it.
    """
    doc = get_data_from_url(url)
    if not doc:
        # Download failed (get_data_from_url returns "" on error); skip parsing.
        return ["", ""]
    try:
        html_doc = HTMLParser(doc)
        t = '\n '.join(node.text() for node in html_doc.css("title"))
        a = '\n '.join(node.text() for node in html_doc.css("span.text-justify"))
    except Exception:
        # Best effort: a page that cannot be parsed yields an empty row.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        t = ""
        a = ""
    return [t, a]

Collecting selectolax
[?25l  Downloading https://files.pythonhosted.org/packages/b9/6d/ad7ae4b4be8d43799019d5d4312b82cddf2540bc4334be6c327d8d7dc6c4/selectolax-0.2.3-cp36-cp36m-manylinux2010_x86_64.whl (1.7MB)
[K     |▏                               | 10kB 27.1MB/s eta 0:00:01[K     |▍                               | 20kB 6.3MB/s eta 0:00:01[K     |▋                               | 30kB 8.9MB/s eta 0:00:01[K     |▊                               | 40kB 5.8MB/s eta 0:00:01[K     |█                               | 51kB 7.1MB/s eta 0:00:01[K     |█▏                              | 61kB 8.4MB/s eta 0:00:01[K     |█▍                              | 71kB 9.6MB/s eta 0:00:01[K     |█▌                              | 81kB 7.6MB/s eta 0:00:01[K     |█▊                              | 92kB 8.4MB/s eta 0:00:01[K     |██                              | 102kB 9.3MB/s eta 0:00:01[K     |██                              | 112kB 9.3MB/s eta 0:00:01[K     |██▎                          

In [None]:
import json

# Reload the previously saved category -> article-links mapping.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
with open('drive/My Drive/nlp-telugu/links.json', encoding='utf-8') as f:
    news_urls = json.load(f)



In [8]:

from datetime import datetime
import multiprocessing as mp
import multiprocessing.dummy as mpd
import time

start = datetime.now()
cpu_cores = mp.cpu_count()
print('parallelising the task on {} cpu cores'.format(cpu_cores))

# One [title, article, category] row per scraped URL.
data_rows = []

# multiprocessing.dummy.Pool is a thread pool with the multiprocessing API.
# Using it as a context manager guarantees the workers are torn down even if
# the loop raises or the run is interrupted part-way through.
with mpd.Pool(processes=cpu_cores) as pool:
    for key, url_list in news_urls.items():
        # imap yields results in the same order as url_list.
        for row in pool.imap(get_details, url_list):
            row.append(key)
            data_rows.append(row)
        # Progress report: cumulative row count and elapsed time so far.
        print("Done for {} ({}) ---> {}".format(key, len(data_rows), datetime.now() - start))

parallelising the task on 2 cpu cores
Done for eenadu_cinema (3122) ---> 0:23:47.677009
Done for eenadu_sports (8815) ---> 0:58:31.849614
Done for eenadu_crime (11873) ---> 1:20:23.420670
Done for eenadu_business (15931) ---> 1:49:35.672724
Done for eenadu_national (20527) ---> 2:20:27.336229


In [9]:
# Assemble the scraped rows into a table, write it to a Parquet file, and
# display the table's shape and column index as the cell output.
# NOTE(review): index=None looks like pandas' default for to_parquet — confirm whether index=False was intended.
df = pd.DataFrame(data_rows, columns = ['title', 'text', 'category'])
df.to_parquet('telugu_news_dataset.parquet', index = None)
df.shape, df.columns

((20527, 3), Index(['title', 'text', 'category'], dtype='object'))

In [10]:
# Number of rows per category label.
df['category'].value_counts()

eenadu_sports      5693
eenadu_national    4596
eenadu_business    4058
eenadu_cinema      3122
eenadu_crime       3058
Name: category, dtype: int64

In [None]:
# Copy the Parquet dataset into the 'drive/My Drive/nlp-telugu/' folder
# (presumably a mounted Google Drive — TODO confirm the mount exists before running).
!cp -r 'telugu_news_dataset.parquet' 'drive/My Drive/nlp-telugu/'

In [12]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,title,text,category
0,పెళ్లి వార్తలను ఖండించిన కీర్తి సురేష్‌,\n \n\nచెన్నై : తనకు త్వరలో పెళ...,eenadu_cinema
1,ప్రభాస్‌ గురించి ఆ హీరోయిన్‌తో ఫ్యాన్స్‌ చర్చ,\n \n\nహైదరాబాద్‌: అభిమాన స్టార...,eenadu_cinema
2,250 మురికివాడ కుటుంబాలకు రకుల్‌ సాయం,\n లాక్‌డౌన్‌ ముగిసేంత వరకు ఫుడ...,eenadu_cinema
3,యువతలో మరో కోణాన్ని చూశా,\n \n\nయుద్ధం జరుగుతున్నప్పుడు ...,eenadu_cinema
4,మోదీ అభినందనలు,\n \n\nకరోనాపై పాటతో సందేశం ఇచ్...,eenadu_cinema
