## Data Acquisition: Scraping Text From a Quote-Based Website

### Importing Packages

In [1]:
%%time

import math
import nltk
import spacy
import re
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from matplotlib.mlab import PCA as mlabPCA
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import neighbors
from sklearn.utils import resample
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
import textwrap

from datetime import datetime
from dateutil.parser import parse
from nltk.stem.porter import PorterStemmer
from nltk.corpus import gutenberg, stopwords



# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format
pd.options.mode.chained_assignment = None

# Suppress Warnings
warnings.filterwarnings(
    action="ignore",
    module="sklearn"  
    )

# Set Plot Style
sns.set_style('darkgrid')

CPU times: user 2.13 s, sys: 383 ms, total: 2.52 s
Wall time: 2.49 s


### Scraping Quotes

In [2]:
# Imports are repeated in this cell because re-running the crawler requires a
# kernel restart (CrawlerProcess.start() can only be run once per process).
import scrapy
from scrapy.crawler import CrawlerProcess

class QTSpider(scrapy.Spider):
    """Spider that collects the author name and text of each quote.

    Only the page(s) listed in ``start_urls`` are parsed; ``parse`` does not
    schedule any follow-up requests.
    """

    # Spider identifier; it matters when several spiders run at the same time.
    name = "QTS"

    # URL(s) the crawl starts from.
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        """Yield one ``{'name': ..., 'text': ...}`` dict per quote on the page.

        Each quote is a ``<div class="quote">`` element. The author name is
        read from the ``<small>`` nested in a child ``<span>`` and the quote
        body from the child ``<span class="text">``. A field is ``None`` when
        its XPath matches nothing.
        """
        quote_blocks = response.xpath('//div[@class="quote"]')
        for quote in quote_blocks:
            # Adjust these XPath expressions to extract other fields.
            author = quote.xpath('span/small/text()').extract_first()
            body = quote.xpath('span[@class="text"]/text()').extract_first()
            yield {'name': author, 'text': body}

# Tell the script how to run the crawler by passing in settings.
import os

# File the scraped items are written to.
FEED_PATH = 'page.json'

# Scrapy appends to an existing local feed file. A leftover file from a
# previous run would therefore end up holding two concatenated JSON arrays,
# which is not valid JSON and makes the later pd.read_json() call fail.
# Start every crawl from a clean slate.
if os.path.exists(FEED_PATH):
    os.remove(FEED_PATH)

process = CrawlerProcess({
    'FEED_FORMAT': 'json',   # Store data in JSON format.
    'FEED_URI': FEED_PATH,   # Name our storage file.
    'LOG_ENABLED': False     # Turn off logging for now.
})

# Start the crawler with our spider. start() blocks until the crawl finishes.
process.crawl(QTSpider)
process.start()

# NOTE: with logging disabled this message is printed even when the crawl
# scraped nothing; check the contents of page.json to confirm the result.
print('Success!')



Success!


In [3]:
import pandas as pd

# Load the scraped records (one JSON object per quote) into a DataFrame.
firstpage = pd.read_json('page.json', orient='records')

# End the cell with the expression so Jupyter renders the frame with its rich
# table display, matching how df_text.head() is shown later in the notebook.
firstpage.head(10)

                name                                               text
0    Albert Einstein  “The world as we have created it is a process ...
1       J.K. Rowling  “It is our choices, Harry, that show what we t...
2    Albert Einstein  “There are only two ways to live your life. On...
3        Jane Austen  “The person, be it gentleman or lady, who has ...
4     Marilyn Monroe  “Imperfection is beauty, madness is genius and...
5    Albert Einstein  “Try not to become a man of success. Rather be...
6         André Gide  “It is better to be hated for what you are tha...
7   Thomas A. Edison  “I have not failed. I've just found 10,000 way...
8  Eleanor Roosevelt  “A woman is like a tea bag; you never know how...
9       Steve Martin  “A day without sunshine is like, you know, nig...


### Viewing Most Used Words

In [4]:
%%time

## Vectorizing Text Data 

# Single stemmer instance shared by every call to stemming_tokenizer.
porter_stemmer = PorterStemmer()

def stemming_tokenizer(str_input):
    """Split ``str_input`` into lowercase, stemmed tokens.

    Every character other than an ASCII letter, digit, or hyphen is replaced
    by a space; the result is lowercased, split on whitespace, and each token
    is passed through the module-level ``porter_stemmer``.

    Returns a list of stemmed tokens (empty when no tokens remain).
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    tokens = cleaned.lower().split()
    return list(map(porter_stemmer.stem, tokens))
  
# Using Bag Of Words

# Count stemmed tokens per quote, keeping only the 5 most frequent terms
# (max_features=5) after the built-in English stop-word list is applied.
count_vectorizer = CountVectorizer(stop_words='english', tokenizer=stemming_tokenizer, max_features=5)
X = count_vectorizer.fit_transform(firstpage.text)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases so the
# cell runs on either.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# One row per quote, one column per retained term.
df_text = pd.DataFrame(X.toarray(), columns=feature_names)

CPU times: user 13.1 ms, sys: 3.61 ms, total: 16.7 ms
Wall time: 15.4 ms


In [5]:
# Preview the first rows of the per-quote term-count table.
df_text.head()

Unnamed: 0,better,know,man,miracl,think
0,0,0,0,0,2
1,0,0,0,0,0
2,0,0,0,2,0
3,0,0,0,0,0
4,1,0,0,0,0
