In [44]:
import selenium.webdriver as webdriver
import time
import sqlite3

In [45]:
# Raw string literal: "\P" and "\c" are not valid escape sequences, so the
# plain-string form triggers an invalid-escape warning on recent Python
# versions. The resulting path value is unchanged.
PATH = r"C:\Program Files (x86)\chromedriver.exe"
# Single browser session shared by the scraping helpers in this notebook.
driver = webdriver.Chrome(PATH)

In [46]:
def insertVaribleIntoArticleTable(article_id, article, label):
    """Insert one row into ``articles_tb`` of ``ArticlesDatabase.db``.

    :param article_id: value stored in the ``article_id`` column
    :param article: value stored in the ``article`` column
    :param label: value stored in the ``label`` column
    :return: None. sqlite3 errors are caught and printed, not raised.
    """
    # Bound before the try block so the finally clause can always test it,
    # even when sqlite3.connect() itself raises.
    sqliteConnection = None
    try:
        sqliteConnection = sqlite3.connect('ArticlesDatabase.db')
        cursor = sqliteConnection.cursor()
        print("Connected to SQLite")

        # Placeholders keep the values out of the SQL text.
        sqlite_insert_with_param = """insert into articles_tb(article_id,article, label) values (?,?,?);"""

        data_tuple = (article_id, article, label)
        cursor.execute(sqlite_insert_with_param, data_tuple)
        sqliteConnection.commit()
        print("Python Variables inserted successfully in articles_tb")

        cursor.close()

    except sqlite3.Error as error:
        print("Failed to insert Python variable into sqlite table", error)
    finally:
        if sqliteConnection:
            sqliteConnection.close()
            print("The SQLite connection is closed")

In [47]:
### code for extracting required article text given its link
def get_article_text(link):
    """Open ``link`` in the module-level Selenium ``driver`` and return its article text.

    The text is taken from every ``<p>`` element inside the first element whose
    ``id`` starts with ``content-body``; each paragraph is prefixed with one space.

    :param link: URL of the article page to load
    :return: the concatenated paragraph text, or "" when the content element
             (or its paragraphs) cannot be located
    """
    driver.get(link)
    time.sleep(1)  # fixed pause after navigation; presumably lets the page finish rendering
    try:
        # Generic find_element(by, value) form: the find_element_by_* helpers
        # were removed in Selenium 4.3, whereas this form is available in both
        # Selenium 3 and 4. "css selector" / "tag name" are the By.* values.
        content = driver.find_element("css selector", '[id^=content-body]')
        para = content.find_elements("tag name", 'p')
    except Exception:
        # Best-effort scrape: any lookup failure yields an empty result.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be stopped.
        return ""
    return "".join(" " + paragraph.text for paragraph in para)

In [48]:
### if article text is extracted successfully then update column used = 1 for link of article extracted in articles_link_tb table 
def update_finished_article(art_id):
    """Set ``used = 1`` in ``articles_link_tb`` for the row whose ``art_id`` matches.

    :param art_id: identifier of the link row to mark as used
    :return: None. sqlite3 errors are caught and printed, not raised.
    """
    # Bound before the try block so the finally clause can always test it,
    # even when sqlite3.connect() itself raises.
    sqliteConnection = None
    try:
        sqliteConnection = sqlite3.connect('ArticlesDatabase.db')
        cursor = sqliteConnection.cursor()
        print("Connected to SQLite")

        # Bound parameter instead of string concatenation, so the id can never
        # be interpreted as SQL text.
        query = "UPDATE articles_link_tb SET used = 1 WHERE art_id = ?"

        cursor.execute(query, (art_id,))
        sqliteConnection.commit()
        print("Python Variables updated successfully in articles_link_tb")

        cursor.close()

    except sqlite3.Error as error:
        print("Failed to update Python variable into sqlite table", error)
    finally:
        if sqliteConnection:
            sqliteConnection.close()
            print("The SQLite connection is closed")

In [49]:
import nltk
from nltk.corpus import stopwords 
import re
import string
import contractions
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Shared WordNetLemmatizer instance; process_text uses it to lemmatize tokens.
lemmatizer = WordNetLemmatizer()

In [50]:
### preprocessing of article text extracted from link
def process_text(text):
    """Normalise raw article text into a space-separated string of lemmas.

    Steps, in order: expand contractions word by word, lowercase, turn
    newlines and hyphens into spaces (carriage returns are dropped), collapse
    runs of spaces, strip punctuation and digits, drop English stop words,
    and lemmatize each remaining token as a verb.

    :param text: article text as extracted from the page
    :return: the cleaned text, tokens joined by single spaces
    """
    # Expand shortened forms one whitespace-separated word at a time.
    text = ' '.join(contractions.fix(piece) for piece in text.split())

    text = text.lower()
    for old, new in (('\n', ' '), ('\r', ''), ('-', ' ')):
        text = text.replace(old, new)
    text = text.strip()

    text = re.sub(' +', ' ', text)       # collapse repeated spaces
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)      # remove numbers

    # Filter out English stop words.
    stop_words = set(stopwords.words('english'))
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]

    # Lemmatization by verb.
    return " ".join(lemmatizer.lemmatize(token, pos='v') for token in kept_tokens)

In [51]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object or None
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # Qualified name: a bare ``Error`` is not defined in this notebook, so
        # a failed connect would raise NameError instead of returning None.
        print(e)

    return conn

In [54]:
### list of SQL queries; each one selects the unused article links to extract for one topic (label) and caps how many are fetched (LIMIT)

QUERY = []

In [55]:
### add as many topic labels as you want; one query is built per label
LABELS = ["Society", "Cricket", "Football", "International"]

# The label is a single-quoted SQL string literal. Double-quoted literals are
# only accepted by SQLite as a compatibility quirk and are otherwise parsed as
# identifiers.
QUERY_TEMPLATE = """SELECT art_id,link,label FROM articles_link_tb WHERE used = 0 AND label = '{label}' ORDER BY RANDOM() LIMIT 1000 """

for label in LABELS:
    query = QUERY_TEMPLATE.format(label=label)
    # Re-running this cell must not queue the same topic twice.
    if query not in QUERY:
        QUERY.append(query)

In [56]:
def extract_article(query):
    """Run ``query`` against ArticlesDatabase.db, then scrape and store each returned article.

    Each result row is read as ``(article id, link, label)``. For every row the
    page text is fetched with ``get_article_text``, cleaned with
    ``process_text`` and stored with ``insertVaribleIntoArticleTable``; the link
    is then marked as used with ``update_finished_article``. Rows whose text is
    empty, or whose processing raises, are skipped and not marked as used.

    :param query: SQL SELECT statement returning rows of (art_id, link, label)
    :return: None
    """
    rows = []
    # Bound before the try block so the finally clause can always test it.
    conn = None

    ### select the article links to process
    try:
        conn = create_connection('ArticlesDatabase.db')
        if conn is None:
            # create_connection signals failure by returning None.
            print("Failed to select Python variable from sqlite table",
                  "could not open ArticlesDatabase.db")
        else:
            curr = conn.cursor()
            curr.execute(query)
            rows = curr.fetchall()
            print("Python Variables successfully selected from articles_link_tb " )
            curr.close()

    except sqlite3.Error as error:
        print("Failed to select Python variable from sqlite table", error)
    finally:
        if conn:
            conn.close()
            print("The SQLite connection is closed")

    ### extract each article from its link, preprocess it and store it
    for row in rows:
        try:
            article_id = row[0]
            text = get_article_text(row[1])    ### getting text using link
            if text == "":
                # Nothing could be extracted: leave the link unused and move on.
                continue
            article = process_text(text)
            label = row[2]

            insertVaribleIntoArticleTable(article_id, article, label)
        except Exception as error:
            # Best-effort batch: report the failure and continue with the next
            # link. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt able to stop a long run.
            print("Failed to extract article for row", row, error)
            continue

        ### the row was processed without raising: mark its link as used
        # NOTE(review): insertVaribleIntoArticleTable handles sqlite3 errors
        # internally, so a failed insert is not visible here and the link is
        # still marked as used.
        update_finished_article(article_id)

In [None]:
### run this cell to automate article text extraction: each queued query is processed in turn

for pending_query in QUERY:
    extract_article(pending_query)