In [None]:
import re
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim import corpora
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import Birch
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import silhouette_score

from utils import load_and_describe_raw_data

In [None]:
from nltk.stem import WordNetLemmatizer

# Download every NLTK resource the preprocessing in this notebook uses, so the
# notebook also runs on a machine with an empty nltk_data directory:
#   wordnet            -> WordNetLemmatizer
#   stopwords          -> stopwords.words('english')
#   punkt / punkt_tab  -> nltk.word_tokenize (the resource name depends on the
#                         installed NLTK release, so both are requested)
# A resource that is unknown to the installed NLTK version is reported as a
# failed download (return value False) rather than raising.
for _resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)
class ProcessData:
    """Prepare a raw question dataframe for text clustering.

    The wrapped frame must provide the ``Title`` and ``Body`` text columns.
    :meth:`merge_text_labels` adds a cleaned ``Content`` column built from them.

    Attributes:
        dataframe: the pandas DataFrame being prepared.
    """

    # Non-text metadata removed by eliminate_labels(). 'LabelNum' is kept on
    # purpose: the evaluation cells read it back from `dataframe`, and the
    # clustering model only ever sees the 'Content' column.
    METADATA_COLUMNS = ('Id', 'Score', 'ViewCount')

    # Pre-compiled cleaning patterns (raw strings, so no invalid escapes).
    _TAG_RE = re.compile(r'<[^>]*>')                     # HTML tags
    _TO_SPACE_RE = re.compile(r'[?!.,()\\/]')            # word-separating punctuation
    _TO_DELETE_RE = re.compile(r"""[|'"#:&;%“”-]""")     # punctuation removed outright
    _DIGITS_RE = re.compile(r'[0-9]+')

    # NLTK resources are loaded once on first use and shared by all instances,
    # instead of being re-read for every row.
    _stop_words = None
    _lemmatizer = None

    def __init__(self, dataset):
        """Wrap ``dataset`` (a pandas DataFrame) for preprocessing."""
        self.dataframe = dataset

    def eliminate_labels(self):
        """Drop the metadata columns listed in ``METADATA_COLUMNS``.

        ``DataFrame.drop`` returns a new frame, so the result is assigned back
        to ``self.dataframe``. Columns that are not present are ignored.
        """
        self.dataframe = self.dataframe.drop(
            columns=list(self.METADATA_COLUMNS), errors='ignore'
        )

    @classmethod
    def _nltk_resources(cls):
        """Return the cached ``(stop word set, lemmatizer)`` pair."""
        if cls._stop_words is None:
            # A frozenset makes the per-token membership test O(1).
            cls._stop_words = frozenset(stopwords.words('english'))
        if cls._lemmatizer is None:
            cls._lemmatizer = WordNetLemmatizer()
        return cls._stop_words, cls._lemmatizer

    def clean_text_and_tokenize(self, text):
        """Normalise one document and return it as a space-joined string.

        Steps: lowercase, strip HTML tags, remove punctuation and digits,
        tokenize, drop English stop words, lemmatize.

        Args:
            text: the raw document (str).

        Returns:
            str: the lemmatized tokens joined by single spaces (despite the
            method name, a string and not a token list is returned).
        """
        # Tags become a space rather than nothing, so words that are only
        # separated by markup (e.g. "</p><p>") are not fused together.
        text = self._TAG_RE.sub(' ', text.lower())
        text = self._TO_SPACE_RE.sub(' ', text)
        text = self._TO_DELETE_RE.sub('', text)
        text = self._DIGITS_RE.sub('', text)

        stop_words, lemmatizer = self._nltk_resources()
        tokens = (
            word for word in nltk.word_tokenize(text) if word not in stop_words
        )
        return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

    def merge_text_labels(self):
        """Create the ``Content`` column from the cleaned ``Title`` and ``Body``.

        Missing titles/bodies are treated as empty text, and the two parts are
        joined with a space so the last title word and the first body word
        stay separate tokens.
        """
        title = self.dataframe['Title'].fillna('')
        body = self.dataframe['Body'].fillna('')
        merged = (title + ' ' + body).str.strip()
        self.dataframe['Content'] = merged.apply(self.clean_text_and_tokenize)

In [None]:
# Fetch the three raw splits, then build the cleaned training corpus.
print("Loading data...")
X_train, X_valid, X_test = load_and_describe_raw_data()

dataset_train = ProcessData(dataset=X_train)
dataset_train.eliminate_labels()
# Adds the cleaned 'Content' column (Title + Body) used by every model below.
dataset_train.merge_text_labels()


In [None]:
# Model class that has basic functions defined
class BirchModel:
    """Thin wrapper bundling TF-IDF feature extraction with a BIRCH clusterer.

    Attributes:
        data: the text documents passed at construction.
        model: the ``Birch`` estimator created by the latest :meth:`cluster`
            call, or ``None`` before the first call.
        vectorizer: the ``TfidfVectorizer`` fitted by the latest :meth:`tfidf`
            call, or ``None`` before the first call. It is kept so that other
            documents can be projected into the same feature space with
            ``vectorizer.transform(...)``.
    """

    def __init__(self, data):
        self.data = data
        self.model = None
        self.vectorizer = None

    def tfidf(self, min_df=0.025, stop_words='english', max_features=60000):
        """Fit a TF-IDF vectorizer on ``self.data`` and return the matrix.

        Args:
            min_df: minimum document frequency, forwarded to TfidfVectorizer.
            stop_words: stop word setting, forwarded to TfidfVectorizer.
            max_features: vocabulary size cap, forwarded to TfidfVectorizer.

        Returns:
            The document-term matrix produced by ``fit_transform``.
        """
        self.vectorizer = TfidfVectorizer(
            min_df=min_df, stop_words=stop_words, max_features=max_features
        )
        return self.vectorizer.fit_transform(self.data)

    def cluster(self, n_clusters=2, threshold=0.1, branching_factor=25):
        """Create (but do not fit) a ``Birch`` estimator and return it.

        The estimator is also stored on ``self.model``. The defaults reproduce
        the previously hard-coded configuration.
        """
        self.model = Birch(
            n_clusters=n_clusters,
            threshold=threshold,
            branching_factor=branching_factor,
        )
        return self.model

In [None]:
# Wrap the cleaned training corpus in the clustering helper.
B = BirchModel(data=dataset_train.dataframe['Content'])

In [None]:
# Unfitted BIRCH estimator with the wrapper's default settings
B_mod = B.cluster()
# TF-IDF document-term matrix of the training corpus
X = B.tfidf()

In [None]:
# Reduce components
# `X` is overwritten with the dense PCA projection at the end of this cell, so
# the sparse TF-IDF matrix is kept under its own name. On a re-run `X` no
# longer has .toarray(); the cell then reuses the saved matrix instead of failing.
if hasattr(X, 'toarray'):
    X_tfidf = X

# random_state pins the randomized SVD solver that PCA may select for large
# inputs, so the projection is the same on every run.
pca = PCA(n_components=4, random_state=42)
X_pca = pca.fit_transform(X_tfidf.toarray())
X = X_pca

In [None]:
# Fit BIRCH on the reduced features, then record each document's cluster id.
labels = B_mod.fit(X).predict(X)
dataset_train.dataframe['y'] = labels

In [None]:
# Silhouette coefficient of the clustering on the training features
print("Score:", silhouette_score(X, labels), sep="\n")

In [None]:
# Visualise clusters: first two columns of the reduced features, coloured by cluster id
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=labels)
plt.show()

# NOTE(review): cluster ids are arbitrary integers, and accuracy_score compares
# them to 'LabelNum' verbatim, i.e. it assumes cluster k corresponds to class k.
# Confirm the mapping before interpreting this number.
accuracy = accuracy_score(dataset_train.dataframe['LabelNum'], labels)
print(accuracy)

In [None]:
# Held-out data: run it through the same preprocessing as the training set.
# NOTE(review): although named "test", this is built from X_valid; X_test is
# not used in the visible cells. Confirm this is intended.
dataset_test = ProcessData(dataset=X_valid)
dataset_test.eliminate_labels()
dataset_test.merge_text_labels()

In [None]:
# Evaluate the TF-IDF -> PCA -> BIRCH pipeline on the held-out documents.
#
# Held-out text has to be projected into the feature space the model was fitted
# in. The vectorizer is therefore fitted on the *training* corpus (same settings
# as BirchModel.tfidf, which yields the same vocabulary) and only applied to the
# held-out text. Likewise the already-fitted `pca` is applied with transform()
# instead of being re-fitted on the held-out data.
tfidf_vectorizer = TfidfVectorizer(min_df=0.025, stop_words='english', max_features=60000)
tfidf_vectorizer.fit(dataset_train.dataframe['Content'])
test_data = tfidf_vectorizer.transform(dataset_test.dataframe['Content'])
test_pca = pca.transform(test_data.toarray())
Y = test_pca

labels_test = B_mod.predict(Y)
# Store the held-out predictions (not the training `labels`) next to the data.
dataset_test.dataframe['y'] = labels_test

accuracy = accuracy_score(dataset_test.dataframe['LabelNum'], labels_test)
print(accuracy)
score = f1_score(dataset_test.dataframe['LabelNum'], labels_test, average="macro")
print(score)

In [None]:
# Word2vec feature

# Tokenize every cleaned training document.
sentences_tokenized = [
    simple_preprocess(sentence)
    for sentence in dataset_train.dataframe['Content'].values
]

# NOTE(review): with workers > 1 gensim training is not reproducible run to run;
# set workers=1 and a fixed seed if identical embeddings are required.
w2v_model = Word2Vec(sentences_tokenized, workers=4, window=10)

# Document embedding = mean of the vectors of its in-vocabulary words.
# Words the model did not keep in its vocabulary are skipped explicitly
# (no blanket try/except), and a document without any known word maps to the
# zero vector instead of going through a 0/0 division.
word_vectors = w2v_model.wv
sent_vectors = []
for sentence in sentences_tokenized:
    known = [word_vectors[word] for word in sentence if word in word_vectors]
    if known:
        sent_vectors.append(np.mean(known, axis=0, dtype=np.float64))
    else:
        # Size is taken from the model rather than hard-coded to 100.
        sent_vectors.append(np.zeros(word_vectors.vector_size))
sent_vectors = np.array(sent_vectors)

# Fresh BIRCH estimator, fitted on the Word2Vec document embeddings.
B_mod = B.cluster()
B_mod.fit(sent_vectors)
labels = B_mod.predict(sent_vectors)

dataset_train.dataframe['y'] = labels

In [None]:
# Silhouette coefficient of the clustering on the sentence vectors
w2v_silhouette = silhouette_score(sent_vectors, labels)
print(w2v_silhouette)

In [None]:
# First two embedding dimensions, coloured by cluster id
fig, ax = plt.subplots()
ax.scatter(sent_vectors[:, 0], sent_vectors[:, 1], c=labels, cmap='rainbow')
plt.show()

In [None]:
# Hypertuning: grid search over BIRCH settings on the feature matrix `X`,
# scored with the silhouette coefficient.

branching_factor = [25, 50, 100, 150]
n_clusters = [2, 5, 7]
threshold = [0.1, 0.2, 0.5]
model_results = {
    'branching_factor': [],
    'n_clusters': [],
    'threshold': [],
    'score': [],
}

for b in branching_factor:
    for cluster in n_clusters:
        for t in threshold:
            # Loop-local names only: reusing B_mod / labels / score here would
            # silently replace the fitted model and metrics other cells rely on.
            candidate = Birch(n_clusters=cluster, threshold=t, branching_factor=b)
            try:
                candidate_labels = candidate.fit(X).predict(X)
                candidate_score = silhouette_score(X, candidate_labels)
            except Exception as err:
                # Best-effort search: report the failing combination and move
                # on, instead of swallowing the error without a trace.
                print(f"skipped branching_factor={b}, n_clusters={cluster}, threshold={t}: {err}")
                continue
            model_results['branching_factor'].append(b)
            model_results['n_clusters'].append(cluster)
            model_results['threshold'].append(t)
            model_results['score'].append(candidate_score)
            print(candidate_score)

# Make sure the output directory exists so the results of the search are not
# lost to a missing-folder error at the very end.
results_dir = Path('./results')
results_dir.mkdir(parents=True, exist_ok=True)
pd.DataFrame(model_results).to_csv(results_dir / 'birch_tuning_tfidf.csv', index=False)

In [None]:
# Test: embed the held-out documents with the Word2Vec model trained on the
# training corpus and assign them to the fitted clusters.

# Each document must be split into word tokens first. Iterating over the raw
# string would look up single characters in the Word2Vec vocabulary.
sentences_tokenized = [
    simple_preprocess(sentence)
    for sentence in dataset_test.dataframe['Content'].values
]

# Document embedding = mean of the vectors of its in-vocabulary words; a
# document without any known word maps to the zero vector.
sent_vectors = []
for sent in sentences_tokenized:
    known = [w2v_model.wv[word] for word in sent if word in w2v_model.wv]
    if known:
        sent_vectors.append(np.mean(known, axis=0, dtype=np.float64))
    else:
        # Size is taken from the model rather than hard-coded to 100.
        sent_vectors.append(np.zeros(w2v_model.wv.vector_size))
sent_vectors = np.array(sent_vectors)

# B_mod is expected to be the Birch model fitted on the Word2Vec training vectors.
labels_test = B_mod.predict(sent_vectors)
dataset_test.dataframe['y'] = labels_test

In [None]:
# First two embedding dimensions of the held-out documents, coloured by predicted cluster
fig, ax = plt.subplots()
ax.scatter(sent_vectors[:, 0], sent_vectors[:, 1], c=labels_test, cmap='rainbow')
plt.show()

In [None]:
# Plotting parameters
# NOTE(review): both modules are already imported in the first cell; presumably
# repeated so this cell can be run on its own from the saved CSV. Confirm
# before removing.
import matplotlib.pyplot as plt
import pandas as pd

# Grid-search results written by the tuning cell
df = pd.read_csv('./results/birch_tuning_tfidf.csv')

# Silhouette score against branching factor, one point per tested combination
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['branching_factor'], df['score'])
ax.set_xlabel('Number of branches')
ax.set_ylabel('Score')
ax.grid(True)
plt.show()