In [2]:
import numpy as np
import pandas as pd
import requests
import regex as re

from xmltodict import parse
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

import nltk
import spacy

from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

from collections import Counter

# HW3

## Text Processing

### Q1

1. Modify the code I wrote in lecture 8 with what you have learnt in lecture 9 and correctly tokenize the text both on the word and sentence level, and by removing the stopwords. Rewrite the `getSummary` function and all the other functions that it depends by maing these corrections.

2. Rewrite the code I wrote for `getKeywords` function making the same corrections.

3. Test your code from parts 1 and 2 on random articles from the Guardian.

4. Rewrite the `getSubjectGuardian` function for another newspaper in English, and test your code from part 1 and 2 on random articles from this new newspaper.

In [4]:
def getSubjectGuardian(subject):
    with requests.get(f'https://www.theguardian.com/{subject}/rss') as link:
        raw = parse(link.text)
    return raw['rss']['channel']['item']
nba = getSubjectGuardian('sport/nba')
film = getSubjectGuardian('film')
fashion = getSubjectGuardian('fashion')
swEN = stopwords.words('english')

def getText(url):
    with requests.get(url) as link:
        raw = BeautifulSoup(link.content,'html.parser')
    return ' '.join([x.text for x in raw.find_all('p')])

Q1 - 1

Tokenize and clean sentences and remove stopwords

In [5]:
def processText(text):
    aa = {'sentences': sent_tokenize(text)}
    aa.update({'cleanedSentences': [re.sub(r'[^\p{Letter}\s]','',sentence.lower()) for sentence in aa['sentences']]})
    return [re.sub(r'[^\w\s]','',x.lower()) for x in aa['cleanedSentences']]

In [6]:
def getMatrix(sentences):
    swEN = stopwords.words('english')
    vectorizer = CountVectorizer(stop_words=swEN)
    return vectorizer.fit_transform(sentences)

def getSummary(text,k):
    sentences = processText(text)
    matrix = getMatrix(sentences)
    projection = PCA(n_components=1)
    weights = projection.fit_transform(matrix.toarray())
    res = list(zip(weights.transpose()[0],range(112),sentences))
    tmp = sorted(res,key=lambda x: x[0],reverse=True)[:k]
    return sorted(tmp, key=lambda x: x[1])

Q1 - 2

Tokenize and clean sentence and remove stopwords

In [7]:
def getKeywords(text,sw,k):
    aa = {'sentences': sent_tokenize(text)}
    aa.update({'cleanedSentences': [re.sub(r'[^\p{Letter}\s]','',sentence.lower()) for sentence in aa['sentences']]})
        
    vectorizer = CountVectorizer(stop_words=sw)
    matrix = vectorizer.fit_transform(aa['cleanedSentences'])
    words = vectorizer.get_feature_names_out()
    
    projection = PCA(n_components=1)
    tmp = projection.fit_transform(matrix.transpose().toarray())
    weights = tmp.transpose()[0]
    
    return sorted(zip(weights,words),key=lambda x: x[0], reverse=True)[:k]


Q1 - 3

Test functions from guardian newspaper

In [132]:
m = np.random.randint(0,4)
text=getText(film[m]['link'])

print(getSummary(text,3))
getKeywords(text,swEN,3)

[(1.9244743946362999, 23, 'this was a hollywood remake of the far superior spanish mystery thriller open your eyes see below in which a wealthy man gets cosmetic surgery to repair his face which has been ruined in a car crash and falls in love with a beautiful woman cruz'), (2.007925972298687, 51, 'noriega plays césar a rich young guy who gets cosmetic surgery after being horribly disfigured in a car crash and then experiences ultrareal hallucinations indistinguishable from reality involving an affair with the fascinating young woman silvia cruz whose dalliance with him led to the crash'), (2.2771688961341443, 53, 'cruz won her best supporting actress oscar for this barcelonaset film from woody allen  a moderate comedy from allens luxurytourist euro period that also included his italian romp to rome with love  in which cruz played a stereotypically conceived italian call girl')]


[(5.578149859879137, 'cruz'),
 (1.3394502996794968, 'spanish'),
 (1.3187815036374662, 'role')]

Q1 -4 

Test functions with another newspaper

In [10]:
def getSubjectNews(subject):
    with requests.get(f'https://news.un.org/feed/subscribe/en/news/topic/{subject}/feed/rss.xml') as link:
        raw = parse(link.text)
    return raw['rss']['channel']['item']
health = getSubjectNews('health')
women = getSubjectNews('women')

In [127]:
m = np.random.randint(0,4)
text=getText(women[m]['link'])

print(getSummary(text,3))
getKeywords(text,swEN,3)

[(1.5527553254319468, 0, 'subscribe audio hub womens rights are human rights and universal in times of war and peace a senior un official told the security council on wednesday urging ambassadors to ensure accountability for conflictrelated sexual violence'), (1.5882317075943726, 35, 'ms karkoutly cofounder of an organization for women lawyers called huquqyat outlined a list of actions for the council that included referring the situation in syria to the international criminal court adopting a resolution on detainees and missing persons investigating and prosecuting perpetrators of sexual violence and ensuring womens rights are at the heart of accountability efforts'), (5.57464828547666, 46, 'these countries were also asked to take a harder look at the prevailing view that supporting investigations of conflict elated sexual violence in ethiopia could somehow derail the proposed reform agenda of the current government\xa0 widespread sexual violence against women and girls in conflict is

[(3.5223668913287107, 'violence'),
 (3.3184825556090654, 'sexual'),
 (1.6612743317623202, 'conflict')]

### Q2

Write a function that returns all named entities (proper names, country names, corporation names only) from a URL. Function should take the URL as the input and must return the list of named entities from that URL. Test your code on random articles from the Guardian. Don't use the NLTK's NER that I demonstrated during the lecture. Use the SpaCY's NER function.

In [121]:
def func1(url):
    with requests.get(url) as link:
        raw = parse(link.text)
        raw2= raw['rss']['channel']['item']
    m = np.random.randint(0,4)
    with requests.get(raw2[m]['link']) as link:
        raw = BeautifulSoup(link.content,'html.parser')
        last=' '.join([x.text for x in raw.find_all('p')])
        NER = spacy.load("en_core_web_sm")
        res = NER(last)
        print([i.text for i in res.ents if i.label_ in ('GPE','ORG','PERSON')])
func1(f'https://www.theguardian.com/film/rss')

['Gavin Millar', 'Alan Bennett', 'Dennis Potter', 'Victoria Wood', 'Dreamchild', 'Alice Liddell', 'Charles Dodgson', 'AKA', 'Lewis Carroll', 'Alice’s Adventures', 'Wonderland', 'Browne', 'Ian Holm', 'Dodgson', 'Millar', 'Alice', 'Alice', 'Jim Henson’s Creature Workshop', 'Millar', 'John Tenniel', 'Browne', 'Millar', 'Browne', 'Dreamchild', 'Andrew Sarris', 'the Village Voice', 'Peggy Ashcroft', 'Lionel Jeffries', 'the Prix Italia', 'Bennett’s Intensive Care', 'Talking Heads', 'Julie Walters', 'Victoria Wood:', 'Pat', 'Margaret', 'Wood', 'Housewife', 'Clydebank', 'Glasgow', 'Gavin', 'Rita', 'Osborne', 'Tom Millar', 'Gavin', 'Birmingham', 'King Edward’s', 'RAF', 'Christ Church', 'Oxford', 'Melvyn Bragg', 'Tempest', 'London', 'Sylvia Lane', 'Oxford', 'BBC', 'Karel Reisz’s', 'Listener', 'Arena Cinema', 'Woody Allen', 'Federico Fellini', 'Jean Renoir', 'François Truffaut', 'Monty Python’s Flying Circus', 'John Cleese', 'Gavin Millaaarrrrr', 'Millar', 'Isabel', 'Oxford', 'Scot', 'Roald Dahl’

### Q3

1. Write a function that returns the most positive and the most negative sentences from a text. The function must take the text as the input and must return a 2-tuple: the first element as the most positive and the second as the most negative sentence with their polarity scores.

2. Test your function on random articles from the Guardian.

In [106]:
def polarity(text):
    analyser = SentimentIntensityAnalyzer()
    sentences = sent_tokenize(text)
    score_list = []
    
    for temp, sent in enumerate(sentences):
        score = analyser.polarity_scores(sent)
        score_list.append({'sentence': sent, 'polarityScore': score['compound']})

    df = pd.DataFrame(score_list).sort_values(by='polarityScore')
    df1 = df.iloc[[0, -1]]
    result = list(df1.itertuples(index=False))
    return result
m = np.random.randint(0,4)
polarity(getText(film[m]['link']))

[Pandas(sentence='That’s questionable but, judging by the photo of the NFT that was posted on Instagram last month, they are at least too Cronenbergian to be destroyed.', polarityScore=-0.7096),
 Pandas(sentence='There you’ll be, quite happy following them on social media when, bang, up pops a sub-Banksy cartoon of a monkey in a wig and a falsely jubilant message celebrating the fact that they’ve joined some sort of club.', polarityScore=0.9134)]