In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# For data visualization
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# For NLP(text cleaning)
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# For NLP(feature extraction)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# For dimension reduction
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

# For clustering
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN, AgglomerativeClustering

# For file handling operations
import os
from glob import glob
from tqdm import tqdm

# To suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Collect every article path; sorting makes the article order (and therefore the
# row order of everything derived from it) the same on every run.
abs_filepaths = sorted(glob("../input/bbc-news-summary/BBC News Summary/News Articles/*/*.txt"))

# Read each file and store its text in a list
news_articles = []

for abs_filepath in tqdm(abs_filepaths, colour='yellow'):
    try:
        # `with` closes the file handle even if reading fails
        with open(abs_filepath, "r", encoding="utf-8") as f:
            news_article = f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: decode as Latin-1 (which accepts every byte value) so the
        # article is stored as real text rather than as the repr of a bytes object
        # ("b'...'" with literal "\n" escapes).
        with open(abs_filepath, "r", encoding="latin-1") as f:
            news_article = f.read()

    # Append the article text to the list
    news_articles.append(news_article)

In [4]:
# Show the first loaded article as a quick check of the raw text
news_articles[0]

In [5]:
# Create a Porter stemmer; the cleaning loop below calls ps.stem() on every kept word
ps = PorterStemmer()

# Data cleaning

In [6]:
# Empty list to store the clean text
clean_articles = []

# Build the stop-word lookup once, as a set. Evaluating stopwords.words("english")
# inside the comprehension repeats the call and a linear list scan for every word.
english_stopwords = set(stopwords.words("english"))

for article in tqdm(news_articles, colour='yellow'):
    # Turn literal "\n" escape sequences (backslash + n) into a space. Replacing
    # them with an empty string would glue the last word of one line to the first
    # word of the next. Real newline characters are handled by the regex below.
    article = article.replace("\\n", ' ')

    # Replace everything except ASCII letters with a space
    article = re.sub("[^a-zA-Z]", ' ', article)

    # Lower-case all letters
    article = article.lower()

    # Split the article on whitespace, returning a list of words
    words = article.split()

    # Drop stop words and stem the remaining words
    clean_article = [ps.stem(word) for word in words if word not in english_stopwords]

    # Join the clean words back into one string
    clean_article = " ".join(clean_article)

    # Append the cleaned article
    clean_articles.append(clean_article)

In [7]:
# Show the first cleaned article to check the result of the cleaning loop
clean_articles[0]

# Text to vectors

In [8]:
# Initialize a TF-IDF vectorizer with its default settings
tfidf = TfidfVectorizer()

# Fit the vectorizer on the cleaned articles and transform them into TF-IDF vectors
# (one row per article)
article_vectors = tfidf.fit_transform(clean_articles)

In [9]:
# Display the TF-IDF matrix produced by the vectorizer
article_vectors

# Dimensionality reduction

In [10]:
# Initialize an SVD object that keeps 2000 components.
# random_state is fixed so the reduced vectors (and every clustering built on
# them) are the same on each run.
svd = TruncatedSVD(n_components=2000, random_state=42)

# Fit the SVD on the TF-IDF vectors and project them onto the components
reduced_articles = svd.fit_transform(article_vectors)

In [11]:
# Cumulative explained-variance ratio after 1, 2, ..., n components
cumulative_variance = np.cumsum(svd.explained_variance_ratio_)

plt.figure(figsize=(10,8))
plt.title("Explained Variance VS Number Of Features")
# Take the x range from the fitted SVD instead of repeating the component count
# by hand; start at 1 so that x is the number of components actually used.
sns.lineplot(x=np.arange(1, len(cumulative_variance) + 1), y=cumulative_variance)
plt.xlabel("Number of SVD components")
plt.ylabel("Cumulative explained variance ratio")
plt.show()

In [12]:
# The last element of the cumulative sum is the total explained-variance ratio
# over all fitted SVD components
print("Total Explained Variance is ---> ",np.cumsum(svd.explained_variance_ratio_)[-1])

# Using Different clustering algorithms.

In [13]:
# To store the sum of squared distances (inertia) for each number of clusters
SSD = []

# For each number of clusters k
for k in tqdm(range(2,10), colour='yellow'):
    # Initialize a model. random_state makes the curve reproducible; n_init is set
    # explicitly because its default differs between scikit-learn versions.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    # Fit the model
    km = km.fit(reduced_articles)
    # Append the sum of squared distances
    SSD.append(km.inertia_)

In [14]:
# Elbow plot: number of clusters on the x-axis against the sum of squared
# distances collected in SSD on the y-axis
_, elbow_ax = plt.subplots(figsize=(10,8))
elbow_ax.plot(range(2,10), SSD, 'bx-')
elbow_ax.set_title("Elbow Plot To Visually Select The Optimal K For Clustering")
elbow_ax.set_xlabel("Number Of Cluster")
elbow_ax.set_ylabel("SSD")
plt.show()

In [16]:
# Initialize the model with 5 clusters. random_state makes the cluster assignment
# reproducible; n_init is set explicitly because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# Fit on the data
kmeans.fit(reduced_articles)

# Get the cluster label of every article
labels = kmeans.labels_

# Creating a dataframe with two columns:
* News Articles
* Labels

In [17]:
# Column name -> values: the raw article texts and their K-Means labels
df_dict = dict(news=news_articles, labels_km=labels)

# Build the dataframe from the mapping
df = pd.DataFrame(data=df_dict)

# Show the first rows
df.head()

# Applying t-SNE to visualize the data in 2D

In [18]:
# Initialize the t-SNE object for a 2-D embedding.
# random_state is fixed so the layout is the same on every run and the cluster
# plots stay comparable.
tsne = TSNE(n_components=2, random_state=42)

# Embed the SVD-reduced articles in two dimensions
tsne_data = tsne.fit_transform(reduced_articles)

# Convert to a DataFrame with one column per t-SNE component
tsne_df = pd.DataFrame(tsne_data, columns=['comp1','comp2'])

In [19]:
def tsne_viz(tsne_df,labels,label_col='',ax=False):
    """Scatter-plot the two t-SNE components, coloured by cluster label.

    Parameters
    ----------
    tsne_df : DataFrame with the columns 'comp1' and 'comp2'.
    labels : values passed to seaborn as ``hue`` (one per row of ``tsne_df``).
    label_col : name inserted into the subplot title; only used when ``ax`` is given.
    ax : axes to draw on. When falsy (the default), a new 15x9 figure is created
        and shown immediately; otherwise the plot is drawn on ``ax``, the title is
        set, and showing the figure is left to the caller.
    """
    scatter_args = dict(x=tsne_df['comp1'], y=tsne_df['comp2'], hue=labels, palette='Set2')

    # Caller supplied an axes: draw into it and let the caller show the figure
    if ax:
        ax.set_title(f"Visualising the clusters of {label_col} using TSNE")
        sns.scatterplot(ax=ax, **scatter_args)
        return

    # Stand-alone plot: own figure, shown right away
    plt.figure(figsize=(15,9))
    sns.scatterplot(**scatter_args)
    plt.show()


# Visualize cluster

In [20]:
# Plot the t-SNE embedding coloured by the K-Means labels
tsne_viz(tsne_df,df['labels_km'])

# Gaussian mixture model

In [21]:
# Initialize the GMM object with 5 mixture components.
# random_state is fixed so the fitted mixture and its labels are reproducible.
gmm = GaussianMixture(n_components=5, random_state=42)

# Fit the model
gmm.fit(reduced_articles)

# Get the component label of every article
labels_gmm = gmm.predict(reduced_articles)

In [22]:
# Store the GMM labels next to the articles
df['labels_gmm'] = labels_gmm

In [23]:
# Plot the t-SNE embedding coloured by the GMM labels
tsne_viz(tsne_df,df['labels_gmm'])

# Hierarchical clustering

In [24]:
# Agglomerative clustering into 5 clusters with single linkage.
# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in 1.4;
# Euclidean distance is the default, so leaving it out keeps the same distance
# measure and works on both older and newer versions.
h_single = AgglomerativeClustering(n_clusters=5, linkage='single')
h_single.fit(reduced_articles)

In [25]:
# Store the single-linkage labels next to the articles
df['labels_hier_single'] =  h_single.labels_

In [26]:
# Plot the t-SNE embedding coloured by the single-linkage labels
tsne_viz(tsne_df,df['labels_hier_single'])

# Average linkage method

In [27]:
# Agglomerative clustering into 5 clusters with average linkage.
# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in 1.4;
# Euclidean distance is the default, so leaving it out keeps the same distance
# measure and works on both older and newer versions.
h_average = AgglomerativeClustering(n_clusters=5, linkage='average')
h_average.fit(reduced_articles)

In [29]:
# Store the average-linkage labels next to the articles
df['labels_hier_average'] =  h_average.labels_

In [30]:
# Plot the t-SNE embedding coloured by the average-linkage labels
tsne_viz(tsne_df,df['labels_hier_average'])

# Complete linkage method.

In [31]:
# Agglomerative clustering into 5 clusters with complete linkage.
# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in 1.4;
# Euclidean distance is the default, so leaving it out keeps the same distance
# measure and works on both older and newer versions.
h_complete = AgglomerativeClustering(n_clusters=5, linkage='complete')
h_complete.fit(reduced_articles)

In [33]:
# Store the complete-linkage labels next to the articles
df['labels_hier_complete'] =  h_complete.labels_

In [34]:
# Plot the t-SNE embedding coloured by the complete-linkage labels
tsne_viz(tsne_df,df['labels_hier_complete'])

# DBScan clustering

In [35]:
# Set up DBSCAN with a neighbourhood radius (eps) of 1.25 and a minimum of
# 25 samples per neighbourhood
dbs = DBSCAN(min_samples=25, eps=1.25)

# Fit the model, then read the label assigned to every article
dbs.fit(reduced_articles)
labels_dbs = dbs.labels_

In [36]:
# Store the DBSCAN labels next to the articles
df['labels_dbs'] = labels_dbs

In [37]:
# Plot the t-SNE embedding coloured by the DBSCAN labels
tsne_viz(tsne_df,df['labels_dbs'])

# All clustering together.

In [38]:
# Every column after the first one ("news") holds the labels of one clustering method
label_columns = list(df.columns[1:])

# Size the grid from the number of label columns instead of hard-coding 2x3, so
# adding or removing a clustering method neither overruns the grid nor needs
# manual row/column bookkeeping. Six columns give the same 2x3, 30x15 figure.
n_cols = 3
n_rows = max(1, -(-len(label_columns) // n_cols))  # ceiling division
fig,ax = plt.subplots(n_rows,n_cols,figsize=(30,7.5*n_rows),squeeze=False)

# ax.flat walks the grid row by row, matching the original fill order
for panel, column in zip(ax.flat, label_columns):
    tsne_viz(tsne_df,df[column],ax=panel,label_col=column)

# Hide any panels left over when the last row is not full
for panel in list(ax.flat)[len(label_columns):]:
    panel.set_visible(False)

plt.tight_layout(pad=3)
plt.show()