In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Flatten the directory walk into one lazy stream of full paths, then print
# them one per line in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# General Idea of this code - I want to compare how similar 3 documents are.
* Clean the text by removing  punctuation and stop words
* Convert Text into a vector
* Compare the vectors of these documents.

The number of dimensions in this vector space is the number of distinct words across the texts. A document's vector has a larger component along a word's axis when that word occurs more often in the document.

This technique is called Term frequency–inverse document frequency or TF-IDF. I got most of the code from https://leantechblog.wordpress.com/2020/08/23/how-to-estimate-text-similarity-with-python/.

In [None]:
!pip3 install nltk
!pip3 install gensim
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# How to convert text into a vector

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two toy phrases: fitting the vectorizer learns one feature (column) per
# distinct token, and transforming yields one TF-IDF row per phrase.
phrase_one = 'This is Sparta'
phrase_two = 'This is New York'
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([phrase_one, phrase_two])

# get_feature_names() was deprecated and later removed from scikit-learn;
# get_feature_names_out() is its replacement. Fall back to the old name so the
# cell still runs on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Show the learned vocabulary as a plain list of Python strings.
[str(name) for name in feature_names]

# Cleaning the data (remove useless words and punctuation)

In [None]:
from string import punctuation
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words from the NLTK corpus (downloaded earlier with
# nltk.download('stopwords')).
language_stopwords = stopwords.words('english')
# Every character of string.punctuation as its own one-character string.
non_words = list(punctuation)

def remove_stop_words(dirty_text):
    """Drop stop words and stand-alone punctuation tokens from a text.

    The text is split on whitespace and every token found in
    ``language_stopwords`` or ``non_words`` is discarded. Matching is exact
    (case-sensitive); the callers in this file lower-case the text first.

    Args:
        dirty_text: The string to filter.

    Returns:
        The surviving tokens in their original order, each followed by a
        single space (so a non-empty result ends with a trailing space), or
        ``''`` when no token survives.
    """
    # Build the lookup once per call: set membership is O(1), whereas the
    # original scanned both lists for every token.
    rejected = set(language_stopwords) | set(non_words)
    return ''.join(token + ' ' for token in dirty_text.split() if token not in rejected)

def remove_punctuation(dirty_string):
    """Return ``dirty_string`` with every entry of ``non_words`` deleted.

    Each entry is removed wherever it occurs and nothing is inserted in its
    place, so the characters on either side of it become adjacent.
    """
    stripped = dirty_string
    for symbol in non_words:
        stripped = stripped.replace(symbol, '')
    return stripped

def process_file(file_name):
    """Read a text file and return its cleaned content.

    The content goes through the same pipeline as ``process_text``:
    lower-casing, punctuation removal, then English stop-word removal.

    Args:
        file_name: Path of the text file to read (opened in text mode with
            the platform's default encoding).

    Returns:
        The cleaned text as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises; the
    # original left the file open until garbage collection.
    with open(file_name, "r") as source:
        file_content = source.read()
    # Delegate so the file and in-memory paths cannot drift apart.
    return process_text(file_content)

def process_text(text):
    """Clean a piece of text ready for vectorisation.

    Steps, in order: lower-case the text, delete punctuation characters,
    then drop English stop words.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string as produced by ``remove_stop_words``.
    """
    lowered = text.lower()
    without_punctuation = remove_punctuation(lowered)
    return remove_stop_words(without_punctuation)


# Generate a similarity matrix
Vectorise the documents (together they now form a matrix), then compute the similarity matrix. Each entry is the cosine similarity of two document vectors, i.e. their dot product divided by the product of their lengths.
Here article title 1,2 and 3 are all about the same thing according to the dataset labelling.
Article 4 is about a different subject. 

In [None]:
# Four hand-picked headlines used as a tiny test corpus (presumably copied
# from ../input/uci-news-aggregator.csv, which is not loaded here).
article1, article2, article3, article4 = (
    "Fed official says weak data caused by weather, should not slow taper",
    "Fed's Charles Plosser sees high bar for change in pace of tapering",
    "US open: Stocks fall after Fed official hints at accelerated tapering",
    "Euro Anxieties Wane as Bunds Top Treasuries, Spain Debt Rallies",
)

In [None]:
# TF-IDF: one row per headline, one column per vocabulary term.
headlines = [article1, article2, article3, article4]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(headlines)
# Comparing the matrix with itself gives every pairwise cosine similarity.
similarity_matrix = cosine_similarity(X, X)

print(similarity_matrix)


The similarity matrix shows that the documents are identical to themselves (diagonal elements). It shows that article 4 does not correlate with any of the other articles - THIS IS CORRECT. It shows correlation between articles 1, 2 and 3. (Probably because they all have the word "Fed" and some of them have the word "taper" in them)


In [None]:
!pip3 install newspaper3k
import newspaper
from newspaper import Article

In [None]:
# Fetch one BBC Sport article and print the keywords newspaper extracts from it.
url = "https://www.bbc.co.uk/sport/cricket/57651883"
article = Article(url)
# NOTE(review): newspaper3k presumably requires this exact order
# (download -> parse -> nlp) — confirm against the library docs.
article.download()
article.parse()
article.nlp()
print(article.keywords)

In [None]:
#print(article.text)

In [None]:
nRowsRead = 10000  # number of rows to load; use None to read the whole file
# Only the first nRowsRead rows of articles1.csv are loaded: the pairwise
# similarity matrix built later has (rows x rows) entries, so it grows
# quadratically with this value.
news_df = pd.read_csv('../input/all-the-news/articles1.csv', delimiter=',', nrows=nRowsRead)
articles = news_df['content']
# Keep the preview as the cell's last expression; anywhere else it is
# evaluated and silently discarded.
news_df.head(3)

In [None]:
# Clean every article body. The comprehension keeps its loop variable local,
# so the notebook-level `article` object created by an earlier cell is no
# longer overwritten with a plain string.
text_list = [process_text(body) for body in articles]

# One TF-IDF row per article, then every pairwise cosine similarity.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_list)
similarity_matrix = cosine_similarity(X, X)
print(similarity_matrix)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Take the size from the data rather than from nRowsRead, which may be None
# or larger than the number of rows actually loaded.
n_docs = similarity_matrix.shape[0]
# Remove each document's similarity with itself (the diagonal), then shift
# every entry up by 1 so that the logarithm below is defined for zero
# similarities.
matrix = similarity_matrix - np.identity(n_docs) + 1.0
# Take the log of the shifted matrix; applying it to similarity_matrix itself
# yields -inf wherever two documents share no terms.
log_matrix = np.log(matrix)
plt.imshow(log_matrix)
plt.colorbar()
plt.title("log(1 + pairwise TF-IDF cosine similarity), self-matches removed")
plt.xlabel("article index")
plt.ylabel("article index")
plt.show()
    

In [None]:
# Find the best article match: locate every cell of `matrix` that holds the
# highest score.
best_score = np.amax(matrix)
is_best = matrix == best_score
result = np.where(is_best)
# result[0] holds the row indices of those cells.
print(result[0])

In [None]:
# result[0] comes from np.where, so it holds row *positions*. .iloc selects by
# position, whereas .loc would look the numbers up as index labels and only
# agrees with it while news_df keeps its default 0..n-1 index.
match = news_df.iloc[result[0]]
print(match['content'])
# Save the matched rows, index included, next to the notebook.
match.to_csv(r'./Match.csv', index = True)

This algorithm doesn't work very well for similarity grouping.
Instead, try to determine a vector for the whole article in terms of category. The aim is to try to make people's vectors become more aligned.

In [None]:
# Build a newspaper source for CNN and list what it discovered.
cnn_paper = newspaper.build('http://cnn.com')

# A dedicated loop name avoids overwriting the notebook-level `article`.
for listed_article in cnn_paper.articles:
    print(listed_article.url)

for category in cnn_paper.category_urls():
    print(category)

# The article list can be empty, in which case indexing [0] raises IndexError.
# NOTE(review): newspaper presumably caches previously seen URLs between
# runs, which would empty the list on a re-run — confirm.
if cnn_paper.articles:
    cnn_article = cnn_paper.articles[0]
    cnn_article.download()
    cnn_article.parse()
    cnn_article.nlp()
else:
    cnn_article = None
    print("No articles found for http://cnn.com")

# Sentiment analysis

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def fetch_article(url):
    """Download, parse and run newspaper's NLP step on the article at ``url``.

    Args:
        url: Address of the article to fetch.

    Returns:
        The ``Article`` object after ``download()``, ``parse()`` and ``nlp()``
        have been called on it, so its ``text`` attribute is available.
    """
    fetched = Article(url)
    fetched.download()
    fetched.parse()
    fetched.nlp()
    return fetched


sid = SentimentIntensityAnalyzer()
sentence = "I am Charlie and I like cheese."
print(sid.polarity_scores(sentence))

# Each article gets its own descriptive name. The original reused `article2`
# and `article3`, overwriting the headline strings the TF-IDF cell relies on.
happy_url = "https://edition.cnn.com/2021/07/01/health/science-of-laughter-scn-wellness/index.html"
happy_article = fetch_article(happy_url)
print("The happy article has scores:")
print(sid.polarity_scores(happy_article.text)['compound'])

sad_url = "https://edition.cnn.com/2021/07/02/us/miami-dade-building-collapse-friday/index.html"
sad_article = fetch_article(sad_url)
print("The Sad article has scores:")
print(sid.polarity_scores(sad_article.text)['compound'])

physics_url = "https://edition.cnn.com/2021/07/02/world/ocean-twilight-zone-whoi-c2e-scn-spc-intl/index.html"
physics_article = fetch_article(physics_url)
print("The physics article has scores:")
# Print the full score dictionary here, not only the compound value.
print(sid.polarity_scores(physics_article.text))
# Author's observation: this article has a high neutrality score, but the
# compound value is too polarised — it is forced to pick between positive and
# negative.

# The problem
This code won't be very good. For example, it doesn't take synonyms into account. This is something that could be tackled with word2vec models. Loosely, what we do is say that each word contains a small amount of other, related words. So the vector for the word "King" will also contain a small amount of the word "Queen" because they are related.

This is a really interesting article on a free version of GPT-3
https://medium.com/mlearning-ai/text-generation-using-gpt-neo-41877ef586c7
https://medium.com/mlearning-ai/a-graph-based-text-similarity-method-with-named-entity-information-in-nlp-abc7f1201d96