# A Million News Headlines

## Import Module

In [67]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup
import re
from wordcloud import WordCloud
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tqdm import tqdm
import os
from sklearn import metrics

# **Import data**

In [68]:
# Load the ABC News headlines and keep only the first 500,000 rows.
data = pd.read_csv("../input/million-headlines/abcnews-date-text.csv").head(500000)

# Chronological view of the sample. `sort_values` already defaults to axis=0,
# ascending=True, kind='quicksort' and na_position='last'.
data_sort = data.sort_values("publish_date")
data_sort.head()

# **Data Processing**

**Remove HTML tags and URLs from the headlines**

In [69]:
def html_tag(phrase):
    """Strip URLs and HTML markup from ``phrase``.

    Every run of non-whitespace characters that starts with ``http`` is
    deleted first; the remainder is parsed with BeautifulSoup's ``lxml``
    parser and only its text content is returned.
    """
    without_urls = re.sub(r"http\S+", "", phrase)
    soup = BeautifulSoup(without_urls, 'lxml')
    return soup.get_text()

**Expand English contractions** (words containing numbers and special characters are removed later, in the processing loop)

https://stackoverflow.com/a/18082370/4084039

https://stackoverflow.com/a/5843547/4084039

In [70]:
def deleteWords(phrase):
    """Expand common English contractions in ``phrase``.

    Despite its name, this function deletes nothing: it rewrites contractions
    into their long form (``won't`` -> ``will not``, ``it's`` -> ``it is``,
    ``they're`` -> ``they are`` ...).

    Matching is case-insensitive, so capitalised contractions such as
    ``Won't`` or ``Can't`` hit the specific rules instead of falling through
    to the generic ``n't`` rule (which would yield ``Wo not`` / ``Ca not``).

    Parameters
    ----------
    phrase : str
        Text to normalise.

    Returns
    -------
    str
        ``phrase`` with the contractions expanded.
    """
    # Order matters: the irregular forms must run before the generic "n't"
    # rule, and "n't" must run before the bare "'t" rule.
    expansions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in expansions:
        phrase = re.sub(pattern, replacement, phrase, flags=re.IGNORECASE)
    return phrase

**Removing the words from the stop words list**

https://gist.github.com/sebleier/554280

In [71]:
# Stop words removed from every headline during cleaning. The negations
# "no", "nor" and "not" are deliberately absent, so they survive cleaning.
stopwords = set(
    """
    br the
    i me my myself we our ours ourselves
    you you're you've you'll you'd your yours yourself yourselves
    he him his himself she she's her hers herself
    it it's its itself they them their theirs themselves
    what which who whom this that that'll these those
    am is are was were be been being have has had having do does did doing
    a an and but if or because as until while of
    at by for with about against between into through during before after
    above below to from up down in out on off over under again further
    then once here there when where why how
    all any both each few more most other some such only own same so than too very
    s t can will just don don't should should've now d ll m o re ve y
    ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't
    hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't
    needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't
    won won't wouldn wouldn't
    """.split()
)

Process the data.

`processed_text` holds the headlines after cleaning.

In [72]:
# Clean every headline: strip URLs/HTML, expand contractions, drop tokens that
# contain a digit, keep letters only, lower-case, and remove stop words.
# Raw-string patterns are required here: "\S" and "\d" inside a plain string
# literal are invalid escape sequences. Compiling once hoists the work out of
# the loop.
token_with_digit = re.compile(r"\S*\d\S*")
non_letters = re.compile(r"[^A-Za-z]+")

processed_text = []
for headline in tqdm(data["headline_text"].values):
    sentance = html_tag(headline)
    sentance = deleteWords(sentance)
    sentance = token_with_digit.sub("", sentance)
    sentance = non_letters.sub(" ", sentance)
    sentance = " ".join(word for word in sentance.lower().split() if word not in stopwords)
    processed_text.append(sentance)

Calculate the Calinski-Harabasz score and the silhouette score of a clustering

In [73]:
def evaluateScore(vectorized, labels):
    """Score a clustering with two internal validity metrics.

    Parameters
    ----------
    vectorized : sparse matrix or array-like
        Feature matrix with one row per sample. Objects exposing ``toarray``
        (SciPy sparse matrices) are densified; anything else is converted
        with ``np.asarray``.
    labels : array-like
        Cluster label of every row of ``vectorized``.

    Returns
    -------
    tuple
        ``(calScore, silScore)``: the Calinski-Harabasz score and the
        silhouette score, in that order.
    """
    print("evaluateScore")
    # Only sparse matrices have `toarray`; the Word2Vec features built in this
    # notebook are plain lists of arrays and must not go through it.
    Y = vectorized.toarray() if hasattr(vectorized, "toarray") else np.asarray(vectorized)
    calScore = metrics.calinski_harabasz_score(Y, labels)
    # NOTE(review): silhouette_score works on pairwise distances, which is
    # expensive for large inputs - consider its `sample_size` argument.
    silScore = metrics.silhouette_score(Y, labels)
    return calScore, silScore

Attach the cleaned text to the DataFrame and preview it

In [74]:
# Store the cleaned headlines next to the raw ones and preview the result.
data = data.assign(Clean_text=processed_text)
data.head()

# Word Cloud

A word cloud uses words as its basic unit to display a text in a more intuitive and artistic way. It is a visual display of the "keywords" that appear most frequently in the text. By filtering out the large number of low-frequency, low-quality words, it lets the viewer grasp the main idea of the text at a glance.

https://github.com/amueller/word_cloud

In [75]:
def word_cloud(cluster_num):
    """Draw a word cloud for the headlines assigned to one cluster.

    Reads the module-level ``final_data`` frame, which must contain the
    columns ``Clean_text`` and ``labels``.

    Parameters
    ----------
    cluster_num : int
        Value of ``final_data["labels"]`` selecting the cluster to draw.
    """
    in_cluster = final_data["labels"] == cluster_num
    # Join with a space: gluing headlines together with '' would fuse the last
    # word of one headline with the first word of the next.
    text = " ".join(final_data.loc[in_cluster, "Clean_text"])
    print(f"Cluster Number: {cluster_num}")
    if not text.strip():
        # WordCloud.generate raises ValueError when there is nothing to plot.
        print("No words to display for this cluster.")
        return
    wordcloud = WordCloud(background_color="black").generate(text)
    plt.figure(figsize=(12,9))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

# Bag-of-Words
In information retrieval, the BOW model assumes that for a document, its word order, grammar, syntax and other elements are ignored, and it is only regarded as a collection of several words. The appearance of each word in the document is independent and does not depend on whether other words appear. (in no order)

In [76]:
# Work on an independent copy of the first 100,000 rows, so that the `labels`
# column added later is not written into a slice of `data`.
final_data = data[0:100000].copy()
bow = CountVectorizer(ngram_range=(1,2))
bow_vector = bow.fit_transform(final_data["Clean_text"])

# Elbow sweep: record the K-Means inertia for k = 2..9.
clusters = [2,3,4,5,6,7,8,9]
inertia = []
for i in tqdm(clusters):
    # Fixed seed so a re-run reproduces the same curve.
    k_mean = KMeans(n_clusters=i, n_init=10, random_state=42)
    k_mean.fit(bow_vector)
    inertia.append(k_mean.inertia_)

In [77]:
def printEroor(x,y,z):
    """Plot K-Means inertia against the number of clusters (elbow plot).

    Reads the module-level ``clusters`` and ``inertia`` lists, so it always
    shows the most recently executed sweep.

    Parameters
    ----------
    x : number
        Figure width in inches; also used as the font size of both axis labels.
    y : number
        Font size of the title.
    z : number
        Figure height in inches.
    """
    plt.figure(figsize=(x,z))
    sns.set_style(style="darkgrid")
    # Pass x/y by keyword: seaborn >= 0.12 rejects positional data vectors.
    sns.lineplot(x=clusters, y=inertia)
    plt.xlabel("No of clusters",fontsize=x)
    plt.ylabel("Loss",fontsize=x)
    plt.title("Error for clusters",fontsize=y)
    plt.show()

In [78]:
# Elbow plot for the bag-of-words sweep (12 x 9 inch figure, title font size 14).
printEroor(x=12, y=14, z=9)

In [79]:
# Final bag-of-words clustering with six clusters; the fixed seed makes the
# label assignment reproducible across runs.
k_mean = KMeans(n_clusters=6, n_init=10, random_state=42)
final_data["labels"] = k_mean.fit_predict(bow_vector)

In [80]:
def NWordCloud(n_clusters):
    """Draw one word cloud per cluster label ``0 .. n_clusters - 1``.

    The clouds are laid out in a two-column grid. Reads the module-level
    ``final_data`` frame, which must contain the columns ``Clean_text`` and
    ``labels``.

    Parameters
    ----------
    n_clusters : int
        Number of cluster labels to draw, starting at label 0.
    """
    print("Number of clusters: {}".format(n_clusters))

    plt.figure(figsize=(15,15))

    # Ceiling division: exactly enough rows for a two-column grid.
    rows = (n_clusters + 1) // 2
    for i in range(n_clusters):
        # Join with a space so neighbouring headlines do not fuse into one word.
        text = " ".join(final_data.loc[final_data["labels"] == i, "Clean_text"])
        ax = plt.subplot(rows, 2, i + 1)
        ax.set_title(f"Cluster {i}")
        # Hide the axes of every panel, not only the last one.
        ax.axis("off")
        # WordCloud.generate raises ValueError on empty text; leave such a
        # panel blank instead of aborting the whole figure.
        if text.strip():
            ax.imshow(WordCloud(background_color="black").generate(text))
    plt.show()

In [81]:
# Word clouds for the six bag-of-words clusters.
NWordCloud(n_clusters=6)

## TFIDF

TF-IDF is a statistical method for assessing how important a word is to one document within a corpus. The importance of a word increases in proportion to the number of times it appears in the document, but is offset by how frequently the word appears across the corpus. Various forms of TF-IDF weighting are often applied by search engines as a measure or rating of the degree of relevance between documents and user queries. In addition to TF-IDF, search engines on the Internet use link-analysis-based ranking methods to determine the order in which documents appear in search results.

In [82]:
tfidf = TfidfVectorizer()
tfidf_vector = tfidf.fit_transform(final_data["Clean_text"])

# Elbow sweep on the TF-IDF features: K-Means inertia for k = 2..9.
clusters = [2,3,4,5,6,7,8,9]
inertia = []
for i in tqdm(clusters):
    # Fixed seed so a re-run reproduces the same curve.
    k_mean = KMeans(n_clusters=i, n_init=10, random_state=42)
    k_mean.fit(tfidf_vector)
    inertia.append(k_mean.inertia_)

In [83]:
# Elbow plot for the TF-IDF sweep (12 x 9 inch figure, title font size 14).
printEroor(x=12, y=14, z=9)

In [84]:
def K_mean(n_clusters, n_init, vectors=None, random_state=42):
    """Fit K-Means and store each row's cluster label in ``final_data["labels"]``.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to fit.
    n_init : int
        Number of K-Means initialisations to try.
    vectors : array-like or sparse matrix, optional
        Feature matrix with one row per row of ``final_data``. Defaults to
        the module-level ``tfidf_vector``.
    random_state : int or None, optional
        Seed passed to ``KMeans`` so the labelling is reproducible.

    Returns
    -------
    None
        The result is written to the module-level ``final_data`` frame.
    """
    if vectors is None:
        vectors = tfidf_vector
    k_mean = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=random_state)
    final_data["labels"] = k_mean.fit_predict(vectors)
    

In [85]:
# Reset the previous labelling before re-clustering. `K_mean` overwrites the
# `labels` column anyway; dropping without `inplace` and with errors="ignore"
# keeps this cell re-runnable even when the column is already gone.
final_data = final_data.drop(columns="labels", errors="ignore")
K_mean(8,10)

In [86]:
# Word clouds for the eight TF-IDF clusters.
NWordCloud(n_clusters=8)

## Average Word2Vec

In [87]:
# Tokenise the cleaned headlines; Word2Vec expects a list of token lists.
list_of_sentance = [sentance.split() for sentance in tqdm(final_data["Clean_text"])]
x = final_data["Clean_text"]
print(f"Shape of X Train : {x.shape}")

embedding_dim = 50  # length of every word vector and sentence vector
try:
    w2v_model = Word2Vec(list_of_sentance, min_count=5, vector_size=embedding_dim, workers=4)
except TypeError:
    # gensim < 4.0 names this parameter `size`.
    w2v_model = Word2Vec(list_of_sentance, min_count=5, size=embedding_dim, workers=4)
# gensim >= 4.0 exposes the vocabulary as `wv.key_to_index`, older releases as
# `wv.vocab`. A set gives O(1) membership tests in the loop below.
w2v_words = set(getattr(w2v_model.wv, "key_to_index", None) or w2v_model.wv.vocab)

# Sentence vector = mean of the vectors of its in-vocabulary words
# (all zeros when no word of the headline is in the vocabulary).
w2v_vector = []
for sent in tqdm(list_of_sentance):
    word_count = 0
    word_vector = np.zeros(embedding_dim)
    for words in sent:
        if words in w2v_words:
            word_count += 1
            word_vector += w2v_model.wv[words]
    if word_count != 0:
        word_vector /= word_count
    w2v_vector.append(word_vector)
print(f"Length of w2v_vector: {len(w2v_vector)}")

# Elbow sweep on the averaged vectors: K-Means inertia for k = 2..9.
clusters = [2,3,4,5,6,7,8,9]
inertia = []
for i in tqdm(clusters):
    # Fixed seed so a re-run reproduces the same curve.
    k_mean = KMeans(n_clusters=i, n_init=10, random_state=42)
    k_mean.fit(w2v_vector)
    inertia.append(k_mean.inertia_)

In [88]:
# Elbow plot for the average-Word2Vec sweep (12 x 9 inch figure, title font size 14).
printEroor(x=12, y=14, z=9)

In [89]:
# Cluster the averaged Word2Vec vectors built in this section into five groups.
# Fit directly on `w2v_vector`: calling `K_mean(5, 10)` here would cluster
# `tfidf_vector` instead. The assignment overwrites any existing `labels`
# column, so no separate drop is needed.
final_data["labels"] = KMeans(n_clusters=5, n_init=10, random_state=42).fit_predict(w2v_vector)

In [90]:
# Word clouds for cluster labels 0-4.
NWordCloud(n_clusters=5)

## TFIDF Weighted W2V

In [91]:
# Tokenise the cleaned headlines; Word2Vec expects a list of token lists.
list_of_sentance = [sentance.split() for sentance in tqdm(final_data["Clean_text"])]
x = final_data["Clean_text"]
print(f"Shape of final_data: {x.shape}")

model = TfidfVectorizer()
model.fit(final_data["Clean_text"])
# term -> IDF weight, built from the fitted `vocabulary_` (term -> column
# index). This works on every scikit-learn release, whereas
# `get_feature_names()` has been removed from recent ones.
dictionary = {term: model.idf_[column] for term, column in model.vocabulary_.items()}
# Feature names in column order, kept for any cell that expects this list.
tfidf_feat = sorted(model.vocabulary_, key=model.vocabulary_.get)

embedding_dim = 50  # length of every word vector and sentence vector
try:
    w2v_model = Word2Vec(list_of_sentance, min_count=5, vector_size=embedding_dim, workers=4)
except TypeError:
    # gensim < 4.0 names this parameter `size`.
    w2v_model = Word2Vec(list_of_sentance, min_count=5, size=embedding_dim, workers=4)
# gensim >= 4.0 exposes the vocabulary as `wv.key_to_index`, older releases as
# `wv.vocab`. A set gives O(1) membership tests in the loop below.
w2v_words = set(getattr(w2v_model.wv, "key_to_index", None) or w2v_model.wv.vocab)

# Sentence vector = TF-IDF-weighted mean of its word vectors
# (all zeros when no word has both an embedding and an IDF weight).
tfidf_w2v_vector = []
for sent in tqdm(list_of_sentance):
    sent_vec = np.zeros(embedding_dim)
    weight_sum = 0
    for word in sent:
        # Dict/set membership is O(1); a list lookup would scan the whole
        # vocabulary for every word.
        if word in w2v_words and word in dictionary:
            vec = w2v_model.wv[word]
            # weight = IDF * term frequency within this headline
            tf_idf = dictionary[word] * (sent.count(word) / len(sent))
            sent_vec += vec * tf_idf
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_w2v_vector.append(sent_vec)
print(f"Length of tfidf_w2v_vector {len(tfidf_w2v_vector)}")

# Elbow sweep on the weighted vectors: K-Means inertia for k = 2..9.
clusters = [2,3,4,5,6,7,8,9]
inertia = []
for i in tqdm(clusters):
    # Fixed seed so a re-run reproduces the same curve.
    k_mean = KMeans(n_clusters=i, n_init=10, random_state=42)
    k_mean.fit(tfidf_w2v_vector)
    inertia.append(k_mean.inertia_)

In [92]:
# Elbow plot for the TF-IDF-weighted Word2Vec sweep (12 x 9 inch figure, title font size 14).
printEroor(x=12, y=14, z=9)

In [93]:
def evaluateScore(vectorized, labels):
    """Score a clustering with two internal validity metrics.

    NOTE: a function with this same name is also defined in an earlier cell;
    this later definition is the one in effect once this cell has run.

    Parameters
    ----------
    vectorized : sparse matrix or array-like
        Feature matrix with one row per sample. Objects exposing ``toarray``
        (SciPy sparse matrices) are densified; anything else is converted
        with ``np.asarray``.
    labels : array-like
        Cluster label of every row of ``vectorized``.

    Returns
    -------
    tuple
        ``(calScore, silScore)``: the Calinski-Harabasz score and the
        silhouette score, in that order.
    """
    print("evaluateScore")
    # Only sparse matrices have `toarray`; the Word2Vec features built in this
    # notebook are plain lists of arrays and must not go through it.
    Y = vectorized.toarray() if hasattr(vectorized, "toarray") else np.asarray(vectorized)
    calScore = metrics.calinski_harabasz_score(Y, labels)
    # NOTE(review): silhouette_score works on pairwise distances, which is
    # expensive for large inputs - consider its `sample_size` argument.
    silScore = metrics.silhouette_score(Y, labels)
    return calScore, silScore

In [94]:
# Cluster the TF-IDF-weighted Word2Vec vectors built in this section into five
# groups. Fit directly on `tfidf_w2v_vector`: calling `K_mean(5, 10)` here
# would cluster `tfidf_vector` instead.
final_data["labels"] = KMeans(n_clusters=5, n_init=10, random_state=42).fit_predict(tfidf_w2v_vector)

In [95]:
# Word clouds for cluster labels 0-4.
NWordCloud(n_clusters=5)