# <div align="center">CP421 Data Mining - Assignment 2</div>
### <div align="center">Due Date: Nov 6, 2023 at 11:59 PM</div>

#### Imports:

In [92]:
import numpy as np
import pandas as pd
import csv

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\twbm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\twbm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\twbm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Text Preprocessing

#### 1. Tokenization 
Tokenization refers to the process of breaking down a piece of text into smaller units, commonly known as tokens. Typically, tokens are words, but they can also be phrases, sentences, or any other unit that makes sense for the specific analysis. Tokenization is essential because it helps convert the unstructured form of textual data into a form that can be utilized in various Natural Language Processing (NLP) tasks.

In [93]:
filename = 'bbc-news-data-modified.csv'

df = pd.read_csv(filename)

df['Tokens'] = df['title'] + ' ' + df['content']

tokens = df['Tokens'].str.split()
print(tokens)

0       [Ad, sales, boost, Time, Warner, profit, Quart...
1       [Dollar, gains, on, Greenspan, speech, The, do...
2       [Yukos, unit, buyer, faces, loan, claim, The, ...
3       [High, fuel, prices, hit, BA's, profits, Briti...
4       [Pernod, takeover, talk, lifts, Domecq, Shares...
                              ...                        
2220    [BT, program, to, beat, dialler, scams, BT, is...
2221    [Spam, e-mails, tempt, net, shoppers, Computer...
2222    [Be, careful, how, you, code, A, new, European...
2223    [US, cyber, security, chief, resigns, The, man...
2224    [Losing, yourself, in, online, gaming, Online,...
Name: Tokens, Length: 2225, dtype: object


#### 2. Stop Words Removal 
Stop words removal are commonly used words in any language which don’t add much meaning to a sentence. Words like ‘and’, ‘the’, ‘is’, and ‘in’ are examples of stop words. In text mining and search engines, these words are eliminated from the text to expedite the processing.

In [94]:
stop_words = set(stopwords.words('english'))
tokens = tokens.apply(lambda words: [word for word in words if word.lower() not in stop_words])
# Now, df contains the 'Words' column with stop words removed
print(tokens)

0       [Ad, sales, boost, Time, Warner, profit, Quart...
1       [Dollar, gains, Greenspan, speech, dollar, hit...
2       [Yukos, unit, buyer, faces, loan, claim, owner...
3       [High, fuel, prices, hit, BA's, profits, Briti...
4       [Pernod, takeover, talk, lifts, Domecq, Shares...
                              ...                        
2220    [BT, program, beat, dialler, scams, BT, introd...
2221    [Spam, e-mails, tempt, net, shoppers, Computer...
2222    [careful, code, new, European, directive, coul...
2223    [US, cyber, security, chief, resigns, man, mak...
2224    [Losing, online, gaming, Online, role, playing...
Name: Tokens, Length: 2225, dtype: object


#### 3. Lemmatization and Stemming 
Lemmatization and stemming involves converting a word to its base or dictionary form. For instance, ‘running’ becomes ‘run’, ‘better’ becomes ‘good’.

In [95]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

tokens = tokens.apply(lambda words: [stemmer.stem(lemmatizer.lemmatize(word)) for word in words])
print(tokens)

0       [ad, sale, boost, time, warner, profit, quarte...
1       [dollar, gain, greenspan, speech, dollar, hit,...
2       [yuko, unit, buyer, face, loan, claim, owner, ...
3       [high, fuel, price, hit, ba', profit, british,...
4       [pernod, takeov, talk, lift, domecq, share, uk...
                              ...                        
2220    [bt, program, beat, dialler, scam, bt, introdu...
2221    [spam, e-mail, tempt, net, shopper, comput, us...
2222    [care, code, new, european, direct, could, put...
2223    [us, cyber, secur, chief, resign, man, make, s...
2224    [lose, onlin, game, onlin, role, play, game, t...
Name: Tokens, Length: 2225, dtype: object


#### 4. Vectorizerization
Vectorizerization can be used to transform the text data into a matrix of TF-IDF features. This matrix serves as the input for the clustering algorithms, enabling them to cluster the documents based on the significance and occurrence of terms within them

In [96]:
documents = [' '.join(doc) for doc in tokens]

tfidf_vectorizer = TfidfVectorizer()
# Fit and transform the 'Words' column
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print(tfidf_matrix.shape)

# Convert the TF-IDF matrix to a DataFrame
text = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Now, print your DataFrame
print(text.head())



(2225, 27259)
    00       000  0001  000bn  000m  000th  001  001and  001st  004  ...  \
0  0.0  0.022250   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
1  0.0  0.000000   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
2  0.0  0.000000   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
3  0.0  0.020255   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
4  0.0  0.000000   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   

   zooms  zooropa  zornotza  zorro  zubair  zuluaga  zurich  zuton  zvonareva  \
0    0.0      0.0       0.0    0.0     0.0      0.0     0.0    0.0        0.0   
1    0.0      0.0       0.0    0.0     0.0      0.0     0.0    0.0        0.0   
2    0.0      0.0       0.0    0.0     0.0      0.0     0.0    0.0        0.0   
3    0.0      0.0       0.0    0.0     0.0      0.0     0.0    0.0        0.0   
4    0.0      0.0       0.0    0.0     0.0      0.0     0.0    0.0        0.0   

   zvyagintsev  
0          0.0  
1       

### Models

#### 1. K-means Clustering: 
With the processed text, segregate the news articles into 5 distinct clusters using the K-means algorithm.

In [97]:
kmeans = KMeans(init="random", n_clusters=5, n_init=10, random_state=1)

#fit k-means algorithm to data
kmeans.fit(text)
kmeans.labels_

text['cluster'] = kmeans.labels_
print(text)

       00       000  0001  000bn  000m  000th  001  001and  001st  004  ...  \
0     0.0  0.022250   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
1     0.0  0.000000   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
2     0.0  0.000000   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
3     0.0  0.020255   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
4     0.0  0.000000   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
...   ...       ...   ...    ...   ...    ...  ...     ...    ...  ...  ...   
2220  0.0  0.046149   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
2221  0.0  0.026533   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
2222  0.0  0.000000   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
2223  0.0  0.000000   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   
2224  0.0  0.006765   0.0    0.0   0.0    0.0  0.0     0.0    0.0  0.0  ...   

      zooropa  zornotza  zorro  zubair  zuluaga  zu

In [101]:
unique_values_counts = text['cluster'].value_counts()
# Print the result
print(unique_values_counts)

2    787
1    517
3    351
4    343
0    227
Name: cluster, dtype: int64


#### 2. DBSCAN Clustering: 
Utilize the preprocessed text and implement the DBSCAN clustering algorithm.

In [114]:
dbs = DBSCAN(eps=0.03, min_samples=2).fit(text)
labels = dbs.labels_
print(labels)

[-1 -1 -1 ... -1 -1 -1]


In [113]:
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 112
Estimated number of noise points: 2001


#### 3. Gaussian Mixture Model (GMM) Clustering: 
Allocate the news articles into 5 clusters leveraging the Gaussian Mixture Model.

In [112]:
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture


# Initialize and fit a Gaussian Mixture Model
n_components = 1  # Set the number of components (clusters) you want
gmm = GaussianMixture(n_components=n_components, random_state=0)

# Extract the feature matrix from the DataFrame
X = text.values  # Assuming all columns are your features

# Fit the GMM to the feature matrix
gmm.fit(X)

MemoryError: Unable to allocate 5.54 GiB for an array with shape (27260, 27260) and data type float64

#### 4. Agglomerative Clustering: 
Administer Agglomerative Clustering on the processed text, aiming to form 5 Coherent Clusters

In [123]:
Agg_clustering = AgglomerativeClustering(n_clusters = 5).fit(text)

In [126]:
print(clustering.labels_)
n_clus_ = len(set(clustering.labels_)) - (1 if -1 in clustering.labels_ else 0)
n_noise_ = list(clustering.labels_).count(-1)

print(n_clus_)
print(n_noise_)

[0 0 0 ... 0 1 2]
5
0
