In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [8]:
# Load only the headline column of the ABC news dataset.
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0; malformed rows are still dropped with a
# warning rather than aborting the load.
data = pd.read_csv(
    "data/abcnews-date-text.csv",
    usecols=["headline_text"],
    on_bad_lines="warn",
)
data.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [6]:
# Write the current frame back out as a UTF-8 CSV without the index column.
data.to_csv(
    "data/abcnews.csv",
    encoding="utf-8",
    index=False,
)

In [9]:
# Print the frame summary: row count, column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1226258 entries, 0 to 1226257
Data columns (total 1 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   headline_text  1226258 non-null  object
dtypes: object(1)
memory usage: 9.4+ MB


#### Delete duplicate headlines, if any

In [10]:
# Preview repeated headlines; keep=False flags every copy of a duplicate,
# not just the later occurrences.
is_repeated = data['headline_text'].duplicated(keep=False)
repeated_headlines = data[is_repeated]
repeated_headlines.sort_values('headline_text').head(5)

Unnamed: 0,headline_text
57967,10 killed in pakistan bus crash
116298,10 killed in pakistan bus crash
676423,110 with barry nicholls
672958,110 with barry nicholls
911080,110 with barry nicholls


In [11]:
# Keep one row (the first occurrence) per distinct headline text.
data = data.drop_duplicates(subset='headline_text', keep='first')

#### TF-IDF

In [14]:
# Stop words = scikit-learn's built-in English list plus a few punctuation marks.
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
# TfidfVectorizer only accepts 'english', a list, or None for `stop_words`;
# scikit-learn >= 1.2 rejects the frozenset returned by `.union(...)`, so the
# set is materialised as a (sorted, hence deterministic) list.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(punc))
# Raw headline strings as an array: the corpus for every vectorizer below.
desc = data['headline_text'].values
# Baseline TF-IDF matrix using the vectorizer's default tokenization.
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(desc)

In [16]:
# Inspect the baseline vocabulary: its size, then a 50-term slice from the middle.
word_features = vectorizer.get_feature_names_out()
vocabulary_size = len(word_features)
print(vocabulary_size)
sample_window = slice(5000, 5050)
print(word_features[sample_window])

104399
['afpa' 'afpla' 'afps' 'afr' 'afraid' 'afrcia' 'afresh' 'afri' 'afric'
 'africa' 'african' 'africans' 'africanus' 'africas' 'africaspotted'
 'africulture' 'afrid' 'afridi' 'afrikaners' 'afrin' 'afro' 'aft' 'aftaid'
 'afte' 'after15' 'afterabseiling' 'afterbirth' 'aftercompany' 'afterdark'
 'afterfavourites' 'afterglow' 'afterlife' 'aftermarket' 'aftermath'
 'afternoon' 'afternoons' 'afterparty' 'afterpay' 'afterschool'
 'aftershock' 'aftershocks' 'aftertaste' 'afterthought' 'afterwoman'
 'afterwork' 'aftr' 'aftrica' 'afw' 'afwl' 'ag']


#### Tokenization

In [19]:
# English Snowball stemmer, plus a tokenizer whose tokens are the runs of
# ASCII letters and apostrophes matched by the pattern below.
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')


def tokenize(text):
    """Lower-case ``text``, split it with ``tokenizer`` and stem each token.

    Returns the stems as a list, in the order the tokens were matched.
    """
    stems = []
    for token in tokenizer.tokenize(text.lower()):
        stems.append(stemmer.stem(token))
    return stems

In [20]:
# Second TF-IDF pass: same stop words, but tokens now come from the stemming
# `tokenize` helper instead of the vectorizer's default pattern.
vectorizer2 = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words=stop_words,
)
X2 = vectorizer2.fit_transform(desc)
word_features2 = vectorizer2.get_feature_names_out()
# Report the stemmed vocabulary size, then its first 50 entries.
for summary in (len(word_features2), word_features2[:50]):
    print(summary)



72395
["'a" "'i" "'s" "'x" 'aa' 'aaa' 'aaahhh' 'aac' 'aacc' 'aaco' 'aacta' 'aad'
 'aadhaar' 'aadmi' 'aag' 'aagaard' 'aagard' 'aah' 'aalto' 'aam' 'aamer'
 'aami' 'aamodt' 'aandahl' 'aant' 'aap' 'aapa' 'aapt' 'aar' 'aaradhna'
 'aardman' 'aardvark' 'aargau' 'aaron' 'aaronpaul' 'aarwun' 'aat' 'ab'
 'aba' 'abaaoud' 'ababa' 'aback' 'abacus' 'abadi' 'abadon' 'abal' 'abalon'
 'abalonv' 'abama' 'abandon']


In [21]:
# Third TF-IDF pass: stemmed tokens, vocabulary capped at 1000 features
# (max_features) to keep the matrix small enough for k-means.
vectorizer3 = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize, max_features=1000)
X3 = vectorizer3.fit_transform(desc)
# `get_feature_names()` was removed in scikit-learn 1.2; use the `_out` variant,
# as the earlier cells already do. It returns an array indexable by column id.
words = vectorizer3.get_feature_names_out()



#### KMeans

In [None]:
# Elbow method: fit k-means for k = 1..10 on X3 and record each model's
# within-cluster sum of squares (`inertia_`).
# KMeans is already imported in the notebook's import cell, so it is not
# re-imported here.
cluster_counts = range(1, 11)

wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X3)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k on an explicit figure/axes pair and save it before showing.
fig, ax = plt.subplots()
ax.plot(cluster_counts, wcss)
ax.set_title('The Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
fig.savefig('elbow.png')
plt.show()

In [25]:
# Final model with k=3. `n_init` is the number of independent k-means runs
# started from different centroid seeds; the run with the lowest inertia is kept.
# `random_state` is pinned (as in the elbow cell) so the clusters and their
# numbering are reproducible after a kernel restart.
kmeans = KMeans(n_clusters=3, n_init=20, random_state=0)
kmeans.fit(X3)

# For each centroid, list its 25 highest-weighted terms, heaviest first:
# argsort ascending, reverse each row, then keep the leading columns.
TOP_TERMS = 25
common_words = kmeans.cluster_centers_.argsort()[:, ::-1][:, :TOP_TERMS]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))

0 : home, nurs, return, new, man, destroy, polic, resid, care, sydney, bushfir, woman, threaten, famili, head, bodi, welcom, power, storm, damag, arriv, owner, age, bring, melbourn
1 : polic, new, say, plan, win, australia, council, govt, australian, report, kill, nsw, warn, court, water, fund, urg, death, open, year, chang, crash, wa, sydney, attack
2 : man, charg, murder, polic, court, face, jail, assault, stab, death, die, drug, guilti, arrest, child, woman, sex, accus, attack, car, miss, crash, kill, alleg, shoot
