In [39]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# web scraping

In [26]:
link = "https://www.caranddriver.com/features/a35172303/driven-google-uber-autonomous-car/"

In [27]:
page = requests.get(link)

In [28]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [29]:
soup


<!DOCTYPE html>

<html class="no-js" lang="en-US">
<head>
<title>Driven: Google, Uber, and the Battle to Build an Autonomous Car</title>
<meta charset="utf-8" name="charset"/>
<meta content="IE=edge,chrome=1" http_equiv="X-UA-Compatible" name="x-ua-compatible"/>
<meta content="width=device-width,initial-scale=1,maximum-scale=6" name="viewport"/>
<meta content="no" name="msapplication-tap-highlight"/>
<meta content="#222222" name="theme-color"/>
<meta content="article" property="og:type"/>
<meta content="457407421359595" property="fb:app_id"/>
<meta content="https://www.facebook.com/caranddriver" property="article:publisher"/>
<meta content="@CARandDRIVER" name="twitter:site"/>
<meta content="USl9XYbHtypi4CBcqx2s5oWwZlp2UltKW8sN1DQzzO8" name="google-site-verification"/>
<meta content="Car and Driver" property="og:site_name"/>
<meta content="Driven: Google, Uber, and the Battle to Build an Autonomous Car" name="title"/>
<meta content="Author ﻿Alex Davies talks about his new book, Driven

In [30]:
contents = soup.find_all('p',class_='body-text')

In [32]:
contents[0]

<p class="body-text">Anthony Levandowski stood at the center of the race between Google and Uber <a class="body-link" data-vars-ga-outbound-link="https://www.caranddriver.com/features/a29587219/autonomous-vehicle-enemies/" href="https://www.caranddriver.com/features/a29587219/autonomous-vehicle-enemies/">to build self-driving cars</a>. He was there at the beginning of Google's program in 2009. By the time it became Waymo in 2016, he'd left, founded a self-driving truck company called Otto, and sold that (in 2016) to Uber for $600 million. He was eventually indicted on dozens of federal charges, accused of stealing 14,000 files containing Google's self-driving trade secrets. In August, <a class="body-link" data-vars-ga-outbound-link="https://www.nytimes.com/2020/08/04/technology/levandowski-google-uber-sentencing-trade-secrets.html" href="https://www.nytimes.com/2020/08/04/technology/levandowski-google-uber-sentencing-trade-secrets.html">Levandowski was sentenced to 18 months in prison<

In [34]:
contents[1].get_text()

'“I think there’s a certain…egotism to the tech industry,” says Alex Davies, transportation editor for Business Insider and formerly for WIRED, and the author of Driven, a new book on the subject of the self-driving rivalry between Google and Uber. “The way the tech industry works today, it puts enormous value on the person who says, ‘No, no, no, no, no. You’ve all been doing it wrong. And I have the better way.’ And it rewards that with adulation, and incredible amounts of money.”'

In [6]:
news_contents = []
list_paragraphs = []
for p in np.arange(0, len(contents)):
    paragraph = contents[p].get_text()
    list_paragraphs.append(paragraph)
    final_article = " ".join(list_paragraphs)
news_contents.append(final_article)


In [7]:
news_contents[0]

"Anthony Levandowski stood at the center of the race between Google and Uber to build self-driving cars. He was there at the beginning of Google's program in 2009. By the time it became Waymo in 2016, he'd left, founded a self-driving truck company called Otto, and sold that (in 2016) to Uber for $600 million. He was eventually indicted on dozens of federal charges, accused of stealing 14,000 files containing Google's self-driving trade secrets. In August, Levandowski was sentenced to 18 months in prison.  “I think there’s a certain…egotism to the tech industry,” says Alex Davies, transportation editor for Business Insider and formerly for WIRED, and the author of Driven, a new book on the subject of the self-driving rivalry between Google and Uber. “The way the tech industry works today, it puts enormous value on the person who says, ‘No, no, no, no, no. You’ve all been doing it wrong. And I have the better way.’ And it rewards that with adulation, and incredible amounts of money.” Th

# Text Processing

In [37]:
df = pd.DataFrame()
df['Content'] = news_contents

In [9]:
# \r and \n
df['Content_Parsed_1'] = df['Content'].str.replace("\r", " ")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\n", " ")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("    ", " ")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace('"', '')

In [10]:
df['Content_Parsed_1']

0    Anthony Levandowski stood at the center of the...
Name: Content_Parsed_1, dtype: object

In [11]:
df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()
punctuation_signs = list("?:!.,;“”‘’'")
df['Content_Parsed_3'] = df['Content_Parsed_2']

for punct_sign in punctuation_signs:
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '')
df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "")


In [12]:
df

Unnamed: 0,Content,Content_Parsed_1,Content_Parsed_2,Content_Parsed_3,Content_Parsed_4
0,Anthony Levandowski stood at the center of the...,Anthony Levandowski stood at the center of the...,anthony levandowski stood at the center of the...,anthony levandowski stood at the center of the...,anthony levandowski stood at the center of the...


##  Stemming and Lemmatization

In [13]:
wordnet_lemmatizer = WordNetLemmatizer()
nrows = len(df)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = df.loc[row]['Content_Parsed_4']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)
df['Content_Parsed_5'] = lemmatized_text_list

In [14]:
# Downloading the stop words list
nltk.download('stopwords')
# Loading the stop words in english
stop_words = list(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tanakitpamornrattanakul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
df['Content_Parsed_6'] = df['Content_Parsed_5']

for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '')

In [16]:
df.loc[0]['Content_Parsed_6']

'anthony levandowski stand   center   race  google  uber  build self-driving cars      begin  google program  2009   time  become waymo  2016 hed leave found  self-driving truck company call otto  sell  ( 2016)  uber  $600 million   eventually indict  dozens  federal charge accuse  steal 14000 file contain google self-driving trade secrets  august levandowski  sentence  18 months  prison   think theres  certain…egotism   tech industry say alex davies transportation editor  business insider  formerly  wire   author  drive  new book   subject   self-driving rivalry  google  uber  way  tech industry work today  put enormous value   person  say      youve     wrong     better way   reward   adulation  incredible amount  money  turn     toxic recipe  success especially   category  complex  autonomous cars  one persons vision cannot possibly triumph   myriad  technological social cultural political economic  logistical challenge   internal battle  dominance  team end  waste  huge amount  tim

In [17]:
df

Unnamed: 0,Content,Content_Parsed_1,Content_Parsed_2,Content_Parsed_3,Content_Parsed_4,Content_Parsed_5,Content_Parsed_6
0,Anthony Levandowski stood at the center of the...,Anthony Levandowski stood at the center of the...,anthony levandowski stood at the center of the...,anthony levandowski stood at the center of the...,anthony levandowski stood at the center of the...,anthony levandowski stand at the center of the...,anthony levandowski stand center race goo...


In [18]:
list_columns = ["Content_Parsed_6"]
df = df[list_columns]

df = df.rename(columns={'Content_Parsed_6': 'Content_Parsed'})

In [19]:
X_new = df['Content_Parsed']

In [36]:
X_new

0    anthony levandowski stand   center   race  goo...
Name: Content_Parsed, dtype: object

In [20]:
loaded_model = pickle.load(open('./Latest-News-Classifier/0. Latest News Classifier/04. Model Training/Models/best_rfc.pickle', 'rb'))
X_train = pickle.load(open('./Latest-News-Classifier/0. Latest News Classifier/03. Feature Engineering/Pickles/X_train.pickle', 'rb'))
y_train = pickle.load(open('./Latest-News-Classifier/0. Latest News Classifier/03. Feature Engineering/Pickles/y_train.pickle', 'rb'))

In [21]:
# Parameter election
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

In [22]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_new = tfidf.transform(X_new).toarray()
print(features_new.shape)

(1891, 300)
(1, 300)


In [23]:
features_new

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.2068787 ,
        0.        , 0.        , 0.        , 0.        , 0.10607945,
        0.12131201, 0.        , 0.        , 0.11864119, 0.        ,
        0.        , 0.        , 0.        , 0.29879076, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.12748157,
        0.12113957, 0.        , 0.10075025, 0.        , 0.        ,
        0.        , 0.        , 0.12830089, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.09482641,
        0.        , 0.        , 0.10786921, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.11309078, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [24]:
loaded_model.predict(features_new)

array([0])