# Web Scraping, Data Cleaning, and Sentiment Analysis:

### Importing our Packages

In [1]:
# we will use pandas create our data frame
import pandas as pd
# we will use Beautiful Soup to pull data from the HTML
from bs4 import BeautifulSoup
# requests will allow us to pull data from Yelp
import requests
# use to perform numerical operations 
import numpy as np

###### The *Requests* function will allow us to make request to yelp.

In [2]:
r = requests.get('https://www.yelp.com/biz/roost-italian-dayton')

In [3]:
print(r.status_code)

200


In [4]:
r.text



In [5]:
soup = BeautifulSoup(r.text, 'html.parser')

In [6]:
result = soup.findAll(class_='comment__373c0__1M-px css-n6i4z7')
print(results)

NameError: name 'results' is not defined

In [None]:
reviews = []
for result in results:
    reviews.append(result.find('span').text)

In [None]:
for review in reviews:
    print(review,'\n')

## Making our DataFrame using Pandas
 **Pandas**- pandas provides data structures and operations for manipulating numerical tables and time series.
 
**Array**- an array is a collection of data stored in a spacific location.

In [None]:
df = pd.DataFrame(np.array(reviews), columns =['Reviews'])
df.tail()

# Creating some Metrics

##### .apply is used to add data to our DataFrame
##### .split serperates each word in a string into an item


#### 1.) How many word are in each review?

In [None]:
df['word_count'] = df['Reviews'].apply(lambda x: len(str(x).split ()))

In [None]:
df.head()

#### 2.) How many charerters are in each review?

In [None]:
df['char_count'] = df['Reviews'].str.len()

### What is the average amount of words lenght in each review?  

#### def = Defining a Function 

In [None]:
def average_words(x):
    words = x.split()
    return sum(len(word) for word in words) / len(words)    

In [None]:
df['Reviews'].apply(lambda x: average_words(x))

In [None]:
df['Average_Word_Lenght'] = df['Reviews'].apply(lambda x: average_words(x))

### How Many Stop Word are in Each Review?

**Stopwords** - Stopwords are the words in any language which does not add much meaning to a sentence. They can safely be ignored without sacrificing the meaning of the sentence. 

In [None]:
## importing a stop words list from the "Natural Language Toolkit"
from nltk.corpus import stopwords

In [None]:
## creating our variable
stop_words = stopwords.words('english')

In [None]:
## seeing what is stofred in the stopwords list
stop_words

In [None]:
## How many word are stored in the list
len(stop_words)

In [None]:
df['Reviews'].apply(lambda x: len([word for word in x.split() if word.lower() in stop_words]))

In [None]:
df['stop_word_count'] = df['Reviews'].apply(lambda x: len([word for word in x.split() if word.lower() in stop_words]))

In [None]:
df['stop_word_count'] / df['word_count']

In [None]:
df['stop_word_rate'] = df['stop_word_count'] / df['word_count']

In [None]:
df.head(11)

In [None]:
df.sort_values(by='word_count')

In [None]:
df.describe()

# Staged Based Data Cleaning

In [None]:
## making all the letters lowercase in each review
df['Reviews'].apply(lambda x: ''.join(word.lower() for word in x.split()))

In [None]:
df['lowercase'] = df['Reviews'].apply(lambda x: ''.join(word.lower() for word in x.split()))

In [None]:
# removing exclamations and spaces
df['lowercase'].str.replace('[^\w\s]', '')

Tokenization = blocks of text are divided into sperate words and picies of punctuation

In [None]:
# removing punctuation
df['Punctuation'] = df['lowercase'].str.replace('[^\w\s]', '')

In [None]:
df.head()

In [None]:
# removing stop_words 
df['Punctuation'].apply(lambda x: " ".join( word for word in x.split() if word not in stop_words))

In [None]:
#adding to our df 
df['stopwords'] = df['Punctuation'].apply(lambda x: ''.join (x for x in x.split() if x not in stop_words))

In [None]:
df.head()

In [None]:
# joining our reviews back together
" ".join(df['stopwords'])

In [None]:
# creating an array
pd.Series(" ".join(df['stopwords']).split())

In [None]:
stop_words

In [None]:
freq = pd.Series(''.join(df['review_nopunc_nostop']).split()).value_counts() [:30]