In [4]:
import requests
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import nltk
from nltk import word_tokenize
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')
import bokeh
import bokeh.plotting as bkh
from bokeh.models import HoverTool
bkh.output_notebook()

def preprocessing(sentence):
    
    sentence=word_tokenize(sentence)
    wnl = nltk.WordNetLemmatizer()
    sentence=[wnl.lemmatize(t) for t in sentence]
    sentence=' '.join(sentence)
    return sentence


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
URL_critics=["https://www.metacritic.com/tv/the-office/season-1/critic-reviews",
      "https://www.metacritic.com/tv/the-office/season-2/critic-reviews",
     "https://www.metacritic.com/tv/the-office/season-3/critic-reviews",
     "https://www.metacritic.com/tv/the-office/season-4/critic-reviews",
     "https://www.metacritic.com/tv/the-office/season-5/critic-reviews",
     "https://www.metacritic.com/tv/the-office/season-6/critic-reviews",
     "https://www.metacritic.com/tv/the-office/season-7/critic-reviews",
     "https://www.metacritic.com/tv/the-office/season-8/critic-reviews",
     "https://www.metacritic.com/tv/the-office/season-9/critic-reviews" 
    ]

URL_users=["https://www.metacritic.com/tv/the-office/season-1/user-reviews",
      "https://www.metacritic.com/tv/the-office/season-2/user-reviews",
     "https://www.metacritic.com/tv/the-office/season-3/user-reviews",
     "https://www.metacritic.com/tv/the-office/season-4/user-reviews",
     "https://www.metacritic.com/tv/the-office/season-5/user-reviews",
     "https://www.metacritic.com/tv/the-office/season-6/user-reviews",
     "https://www.metacritic.com/tv/the-office/season-7/user-reviews",
     "https://www.metacritic.com/tv/the-office/season-8/user-reviews",
     "https://www.metacritic.com/tv/the-office/season-9/user-reviews" 
    ]
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent}


#Season wise data
Raw_data_critics=[]
Raw_data_users=[]

for url in URL_critics:
    print(url)
    request=urllib.request.Request(url,None,headers)
    response=urllib.request.urlopen(request)
    data=response.read()
    soup=BeautifulSoup(data,"html.parser")

    
    critics_comments=( soup.find_all('div',attrs={'class':'review_body'}))
    scores=(soup.find_all('div',attrs={'class':('metascore_w medium tvshow positive indiv','metascore_w medium tvshow mixed indiv','metascore_w medium tvshow negative indiv')}))
#     print(len(critics_comments[:len(scores)]),len(scores))
    Raw_data_critics.append(pd.DataFrame([ [ i.text for i in critics_comments[:len(scores)] ] ,  [j.text for j in  scores]  ] ).T)
    
for url in URL_users:
    
    request=urllib.request.Request(url,None,headers)
    response=urllib.request.urlopen(request)
    data=response.read()
    soup=BeautifulSoup(data,"html.parser")
    
    users_comments=( soup.find_all('div',attrs={'class':'review_body'}))
    scores=(soup.find_all('div',attrs={'class':("metascore_w user medium tvshow positive indiv",'metascore_w user medium tvshow positive indiv perfect','metascore_w user medium tvshow mixed indiv','metascore_w user medium tvshow negative indiv' )}))
#     print(len(users_comments[:len(scores)]),len(scores))
    Raw_data_users.append(pd.DataFrame( [ [ i.text for i in users_comments[:len(scores)]  ],[j.text for j in  scores] ]).T)
    

https://www.metacritic.com/tv/the-office/season-1/critic-reviews
https://www.metacritic.com/tv/the-office/season-2/critic-reviews
https://www.metacritic.com/tv/the-office/season-3/critic-reviews
https://www.metacritic.com/tv/the-office/season-4/critic-reviews
https://www.metacritic.com/tv/the-office/season-5/critic-reviews
https://www.metacritic.com/tv/the-office/season-6/critic-reviews
https://www.metacritic.com/tv/the-office/season-7/critic-reviews
https://www.metacritic.com/tv/the-office/season-8/critic-reviews
https://www.metacritic.com/tv/the-office/season-9/critic-reviews


In [9]:
sid = SentimentIntensityAnalyzer()

Season_scores_users=[]
Season_scores_critics=[]

for season_number in range(9):
    total_sum=0
    
    for i in Raw_data_users[season_number][0]:
        ss=sid.polarity_scores(i)
        total_sum=total_sum+ss['compound']
    
    Season_scores_users.append(total_sum*(1/Raw_data_users[season_number].shape[0]))
    
for season_number in range(9):
    total_sum=0
    
    for i in Raw_data_critics[season_number][0]:
        ss=sid.polarity_scores(i)
        total_sum=total_sum+ss['compound']
    
    Season_scores_critics.append(total_sum*(1/Raw_data_users[season_number].shape[0]))
       
    

In [10]:
f = bkh.figure(title="User Reviews",width=600, height=400, tools='box_zoom,reset,save')


f.line([1,2,3,4,5,6,7,8,9], Season_scores_critics)

bkh.show(f)

In [84]:
characters={
    
    "jim":['jim halpert','jim','halpert','john krasinski','john','krasinski'],
    "micheal": ['micheal','scott','steve carell','steve','carell'],
    "dwight": ["dwight schrute","schrute","dwight","rainn wilson","rainn","wilson"],
    "pam": ["pam","beesly","jenna fischer","jenna","fischer"],
    "andy": ["andy bernard","andy","bernard","ed helms",'ed','helms'],
    "angela": ["angela","martin","kinsey"],
    "kevin": ["kevin", "malone" , "brian","baumgartner"],
    "toby":["toby", "flenderson", "paul","lieberstein"],
    "stanley":["stanley", "james", "hudson", "leslie", "david", "baker"],
    "ryan": ['ryan','howard','novak'],
    "kelly": ["kelly","kapoor","mindy", "kaling"],
    "creed": ["creed","bratton"],
    "erin": ["erin","hannon","ellie","kemper"],
    "karen": ["karen","fillipelli", "rashida","jones"],
    "meredith":["meredith","kate","flannery"],
    "phyllis":["phyllis","vance","smith"],
    "oscar":["oscar"]    
}

character_polarity={
    
    "jim":[],
    "micheal": [],
    "dwight": [],
    "pam": [],
    "andy": [],
    "angela": [],
    "kevin": [],
    "toby":[],
    "stanley":[],
    "ryan": [],
    "kelly": [],
    "creed": [],
    "erin": [],
    "karen": [],
    "meredith":[],
    "phyllis":[],
    "oscar":[] 
    
}
   
for season_number in range(9):

    #similar structure to character_polarity dictory ( copying the dictionary instead of referencing it)
    character_polarity_seasonwise={
    
    "jim":[],
    "micheal": [],
    "dwight": [],
    "pam": [],
    "andy": [],
    "angela": [],
    "kevin": [],
    "toby":[],
    "stanley":[],
    "ryan": [],
    "kelly": [],
    "creed": [],
    "erin": [],
    "karen": [],
    "meredith":[],
    "phyllis":[],
    "oscar":[] 
    
    }
 
      
    #iterate through the comments
    for i in Raw_data_critics[season_number][0]:

        for actor in characters.keys():
            if( len(set(characters[actor]) & set(word_tokenize(i.lower())))>1):         
                ss=sid.polarity_scores(i)
                character_polarity_seasonwise[actor].append(ss['compound'])
                
    for i in Raw_data_users[season_number][0]:

        for actor in characters.keys():
            if( len(set(characters[actor]) & set(word_tokenize(i.lower())))>1):         
                ss=sid.polarity_scores(i)
                character_polarity_seasonwise[actor].append(ss['compound'])
                
    
    for a in characters.keys():

        if(len(character_polarity_seasonwise[a])>0):
            character_polarity[a].append(sum(character_polarity_seasonwise[a])/len(character_polarity_seasonwise[a]))
        else:
            character_polarity[a].append(0)


{'jim': [0.8097799999999999, 0, 0.9821, 0, 0, 0, 0, 0, 0], 'micheal': [0.59677, 0.9503666666666666, 0.9781, 0.9741, 0.7066, 0.3614, 0.638275, 0.57558, 0.8859250000000001], 'dwight': [0.6760333333333334, 0.0258, 0.9803999999999999, 0, 0.4391, -0.3939, 0, 0, 0], 'pam': [0.7144, 0, 0, 0, 0, 0, 0.1779, 0, 0], 'andy': [0.9785, 0, 0.9805, 0, 0, 0, 0, 0.8712, 0.9781], 'angela': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'kevin': [0.9837, 0, 0, 0, 0, 0, 0, 0, 0], 'toby': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'stanley': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'ryan': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'kelly': [0.9837, 0, 0, 0, 0, 0, 0, 0.9945, 0], 'creed': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'erin': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'karen': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'meredith': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'phyllis': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'oscar': [0, 0, 0, 0, 0, 0, 0, 0, 0]}
