# Task
Find the attached HTML pages and perform interesting tasks on them
- Extract all links from all pages and present them in a structured format (e.g. print a JSON document with all links)
- Download all images to a folder and print interesting statistics to get some insights (e.g. image width/height and size)
- Try to find the frequency of different word categories (nouns, adjectives,...) across all pages.
- Analyze the content in any other interesting way. Skillful visualization of your analysis is a plus!

## 1. Link extraction
### Extract all links from all pages and present them in a structured format (e.g. print a JSON document with all links)

In [49]:
#List of required packages and installation
!pip install bs4
!pip install nltk
!pip install plotly
from bs4 import BeautifulSoup
import json
import os
import html2text
import plotly.plotly as py
import numpy as np
import plotly.tools as tls
import nltk
# NOTE(review): the plotly API key is hard-coded in the notebook source, so anyone
# who can read this file can see it. Consider rotating the key and loading it from
# an environment variable or a local, untracked config file - TODO confirm with the
# account owner.
tls.set_credentials_file(username= 'snehar', api_key = "e96QTBepc3ihn8qB3Ry4")
import plotly.graph_objs as go



In [50]:
def find_links(webpage_path):
    '''
    Return the unique hyperlinks found in a local HTML document.

    The file at webpage_path is parsed with BeautifulSoup ('html.parser') and
    the href value of every 'a' and 'link' tag that carries one is collected.
    A summary line with the number of unique links is printed.

    Parameters:
        webpage_path: path of the HTML document to read.

    Returns:
        A set of href values (duplicates removed, no defined order).
    '''
    # Parse inside a context manager so the file handle is always closed, even
    # if parsing fails. The encoding is explicit so the result does not depend
    # on the platform's default encoding.
    with open(webpage_path, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    # Hyperlinks can live in 'a' tags as well as 'link' tags; href=True skips
    # tags without the attribute. Building a set drops repeated links.
    webpage_hyperlinks = {
        tag.get('href') for tag in soup.find_all(['a', 'link'], href=True)
    }
    print("Total Links in " +  webpage_path + ' is : ' + str(len(webpage_hyperlinks)))
    return webpage_hyperlinks

In [51]:
def webcontent_sentiment_analysis(address):
    '''
    Count positive, neutral and negative sentences of an HTML page.

    The HTML file is converted to plain text with html2text, every text line
    is split into sentences with nltk, and each sentence is classified by the
    sign of its VADER "compound" polarity score.

    Parameters:
        address: path of the HTML document to analyse.

    Returns:
        A dict with the keys "positive", "neutral" and "negative" (inserted in
        that order) mapping to the number of sentences in each class.
    '''
    # One-time setup, not executed here: the analyser and tokenizer need nltk
    # data, e.g. nltk.download('vader_lexicon') and nltk.download('popular').
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    # Read through a context manager so the file handle is closed; the
    # encoding is explicit to avoid depending on the platform default.
    with open(address, encoding='utf-8') as html_file:
        text = html2text.html2text(html_file.read())

    sid = SentimentIntensityAnalyzer()
    sentiment_summary = {"positive": 0, "neutral": 0, "negative": 0}
    for line in text.split('\n'):
        for sentence in nltk.tokenize.sent_tokenize(line):
            compound = sid.polarity_scores(sentence)["compound"]
            # A compound score of exactly 0.0 counts as neutral.
            if compound > 0.0:
                sentiment_summary["positive"] += 1
            elif compound < 0.0:
                sentiment_summary["negative"] += 1
            else:
                sentiment_summary["neutral"] += 1
    return sentiment_summary

In [52]:
# This is the main part of the link extraction, where code execution starts.
# Requirements: please place the folder in the same structure as provided.
# If you want to change the structure, update the path in `wikipedia_dir` below.
wikipedia_dir = "./wikipedia/wikipedia"
webpage_links = []
webpage_data = {}
file_names = []
webpages_sentiment = []
# Walk over all .html files of the folder and call 'find_links' for each of them.
# The links of every page are stored under the page's file name.
for file in os.listdir(wikipedia_dir):
    if file.endswith(".html"):
        file_names.append(file)
        address_name = os.path.join(wikipedia_dir, file)
        # find_links returns a set, which has no defined order; sorting makes
        # the generated JSON document identical from run to run.
        webpage_links = sorted(find_links(address_name))
        page = {'webpage_address':address_name,'links':webpage_links}
        webpage_data[file] = page

# Write the Python dictionary to disk as a JSON document
with open('webpages_link_data.json', 'w', encoding='utf-8') as outfile:
    json.dump(webpage_data, outfile, ensure_ascii=False, indent=4)

Total Links in ./wikipedia/wikipedia/Richard_Stallman.html is : 1429
Total Links in ./wikipedia/wikipedia/Konrad_Zuse.html is : 546
Total Links in ./wikipedia/wikipedia/Blaise_Pascal.html is : 1849
Total Links in ./wikipedia/wikipedia/Alan_Turing.html is : 1870
Total Links in ./wikipedia/wikipedia/Donal_Knuth.html is : 1662
Total Links in ./wikipedia/wikipedia/Grace_Hopper.html is : 1388
Total Links in ./wikipedia/wikipedia/Ada_Lovelace.html is : 1025


## 2. Sentiment Analysis
### Sentiment Analysis of website content

In [53]:
# This code runs the sentiment analysis for every webpage and plots the
# normalized results as a grouped bar chart (one group of bars per page).
wikipedia_dir = "./wikipedia/wikipedia"
file_names = []
webpages_sentiment = []
# Walk over all .html files of the folder and call 'webcontent_sentiment_analysis' for each of them.
for file in os.listdir(wikipedia_dir):
    if file.endswith(".html"):
        file_names.append(file)
        address_name = os.path.join(wikipedia_dir, file)
        # Sentiment Analysis
        summary = webcontent_sentiment_analysis(address_name)
        # Read the counts by key so their order always matches the chart
        # labels used below, instead of relying on the dict's value order.
        val = [summary["positive"], summary["neutral"], summary["negative"]]
        # Normalize the counts to fractions of all sentences. A page without
        # any sentence keeps its zeros instead of raising ZeroDivisionError.
        total_val = sum(val)
        if total_val:
            val = [count / total_val for count in val]
        webpages_sentiment.append(val)

# One bar trace per page; file_names and webpages_sentiment are filled in lockstep.
traces = [
    go.Bar(
        x=['Positive', 'Neutral', 'Negative'],
        y=page_sentiment,
        name=page_name)
    for page_name, page_sentiment in zip(file_names, webpages_sentiment)
]

plot_data = traces
layout = go.Layout(
    barmode='group'
)
fig = go.Figure(data=plot_data, layout=layout)
py.iplot(fig, filename='grouped-bar')


High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~snehar/0 or inside your plot.ly account where it is named 'grouped-bar'


#  Sentiment Analysis Result:

### 1. Grace Hopper has the highest positive sentiment content.
### 2. Alan Turing has the highest negative sentiment content.