Importing all the required packages

In [76]:
from gensim.models import Word2Vec, FastText
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
import gensim.downloader as api
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score
import seaborn as sns
import sys
import nltk
import re
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt

warnings.filterwarnings(action = 'ignore')

In [32]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Text Preprocessing

Using sklearn to get "20 News group" Dataset

In [33]:
categories = ['alt.atheism','rec.sport.baseball','talk.politics.mideast','comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',categories=categories, remove=('headers', 'footers', 'quotes'))

Creating functions to do preprocessing of the data

In [34]:
def preprocessing(text,stem=False, stop=False, sent=False):

    # Remove punctuations
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in exclude)


    tokens = word_tokenize(text)

    if stop:
        stop = stopwords.words('english')
        tokens =[word for word in tokens if word not in stop]
        tokens = [word.lower() for word in tokens]

    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]

    if sent:
        tokens = ' '.join(tokens)

    return tokens

In [35]:
def clean_news(articles):

    clean = []

    for article in articles:
        clean.append(preprocessing(article,stop=True,sent=False, stem=False))

    return clean

Here we are downloading the pre trained models

# Word Embedding Techniques

In [36]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


**Word2Vec:**

Word2Vec can make strong estimates about a word’s meaning based on their occurrences in the text. These estimates yield word associations with other words in the corpus.

In [7]:
wv = api.load('word2vec-google-news-300')



**FastText:**

FastText is a word embedding technique that provides embedding to the character n-grams. It is the extension of the word2vec model.

In [8]:
ft = api.load('fasttext-wiki-news-subwords-300')



**Comparison Word2Vec and FastText**

Word2Vec works on the word level, while fastText works on the character n-grams.

Word2Vec cannot provide embeddings for out-of-vocabulary words, while fastText can provide embeddings for OOV words.

FastText can provide better embeddings for morphologically rich languages compared to word2vec.

In terms of semantics, Word2Vec performs better.

Using pre-trained models, converting the text to vectors. Here, we are using Word2Vec and FastText

In [37]:
def sent_vec(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in wv:
            ctr += 1
            wv_res += wv[w]
    wv_res = wv_res/ctr
    return wv_res

In [38]:
def sent_vec_ft(sent):
    vector_size = ft.vector_size
    ft_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in ft:
            ctr += 1
            ft_res += ft[w]
    ft_res = ft_res/ctr
    return ft_res

This function is to calculate AUC ROC for each class

In [39]:
def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):

    #creating a set of all the unique classes using the actual class list
    unique_class = set(actual_class)
    roc_auc_dict = {}
    for per_class in unique_class:

        #creating a list of all the classes except the current class
        other_class = [x for x in unique_class if x != per_class]

        #marking the current class as 1 and all other classes as 0
        new_actual_class = [0 if x in other_class else 1 for x in actual_class]
        new_pred_class = [0 if x in other_class else 1 for x in pred_class]

        #using the sklearn metrics method to calculate the roc_auc_score
        roc_auc = roc_auc_score(new_actual_class, new_pred_class, average = average)
        roc_auc_dict[per_class] = roc_auc

    return roc_auc_dict

Converting the data from sklearn to Pandas

In [40]:
train_df = pd.DataFrame(data=newsgroups_train.data, columns = ['News'])
test_df = pd.DataFrame(data=newsgroups_test.data, columns = ['News'])

Calling preprocessing functions on the data, to tokenize the text

In [41]:
train_df['tokens'] = clean_news(train_df.News)
test_df['tokens'] = clean_news(test_df.News)

Calling the function to vectorize the tokens, here we are using Word2Vec

In [42]:
train_df['vec'] = train_df['tokens'].apply(sent_vec)
test_df['vec'] = test_df['tokens'].apply(sent_vec)

Creating a target variable from the dataset

In [43]:
train_df['target'] = newsgroups_train.target
test_df['target'] = newsgroups_test.target

y_train = train_df['target'].to_list()
y_test = test_df['target'].to_list()

Count of each category

In [96]:
train_df['target'].value_counts()

2    597
3    593
1    584
4    564
0    480
Name: target, dtype: int64

In [97]:
test_df['target'].value_counts()

2    397
3    394
1    389
4    376
0    319
Name: target, dtype: int64

Converting the vectors from the pandas dataframe to list to pass

In [44]:
X_train = train_df['vec'].to_list()
X_test = test_df['vec'].to_list()

# Text Classification

**Support Vector Machine**

SVM (Support vector machine) is an efficient classification method when the feature vector is high dimensional. Using SVM as classifier

In [86]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)

Testing the classifier with test data

In [87]:
predicted = svm_model_linear.predict(X_test)
accuracy = svm_model_linear.score(X_test, y_test)
print("Support Vector Machine Accuracy with Word2Vec:", accuracy)

Support Vector Machine Accuracy with Word2Vec: 0.8117333333333333


Metrics report for the classifier using Word2Vec

In [88]:
print(metrics.classification_report(y_test, predicted, digits=3))

              precision    recall  f1-score   support

           0      0.694     0.718     0.706       319
           1      0.878     0.866     0.872       389
           2      0.811     0.899     0.853       397
           3      0.812     0.789     0.801       394
           4      0.852     0.766     0.807       376

    accuracy                          0.812      1875
   macro avg      0.809     0.808     0.808      1875
weighted avg      0.813     0.812     0.812      1875



ROC for SVM using Word2Vec

In [89]:
roc_auc_dict = roc_auc_score_multiclass(y_test, predicted)
roc_auc_dict

{0: 0.8264791564255265,
 1: 0.9173476872402924,
 2: 0.9215436818084211,
 3: 0.8703621506938994,
 4: 0.8663009382141286}

Calling the function to vectorize the tokens, here we are using FastText

In [49]:
train_df['vec_ft'] = train_df['tokens'].apply(sent_vec_ft)
test_df['vec_ft'] = test_df['tokens'].apply(sent_vec_ft)

Converting the vectors from the pandas dataframe to list to pass

In [50]:
X_train_ft = train_df['vec_ft'].to_list()
X_test_ft = test_df['vec_ft'].to_list()

Using Support Vector Machine as classifier, to compare the results

In [90]:
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train_ft, y_train)

Testing the classifier with test data

In [91]:
predicted_ft = svm_model_linear.predict(X_test_ft)
accuracy_ft = svm_model_linear.score(X_test_ft, y_test)
print("Support Vector Machine Accuracy with FastText:", accuracy_ft)

Support Vector Machine Accuracy with FastText: 0.7669333333333334


Metrics report for the classifier using FastText

In [92]:
print(metrics.classification_report(y_test, predicted_ft, digits=3))

              precision    recall  f1-score   support

           0      0.602     0.677     0.637       319
           1      0.820     0.830     0.825       389
           2      0.781     0.829     0.804       397
           3      0.756     0.754     0.755       394
           4      0.886     0.726     0.798       376

    accuracy                          0.767      1875
   macro avg      0.769     0.763     0.764      1875
weighted avg      0.774     0.767     0.769      1875



ROC for SVM using FastText

In [93]:
roc_auc_dict = roc_auc_score_multiclass(y_test, predicted_ft)
roc_auc_dict

{0: 0.7926068369180681,
 1: 0.8912774585073368,
 2: 0.8832345432421101,
 3: 0.8444930198761299,
 4: 0.8513574652605282}

**Advantages of SVM:**
*   SVM works relatively well when there is a clear margin of separation
between classes.
*   It is more effective in high dimensional spaces.
*   It is relatively memory efficient.

**Disadvantages of SVM:**
*   SVM does not perform very well when the data set has more noise i.e. target classes are overlapping.
*   In cases where the number of features for each data point exceeds the number of training data samples, the SVM will underperform.
*   SVM algorithm is not suitable for large data sets.

**Summary based on Metrics**

Based on the accuracy of SVM on both Word2Vec and FastText, Word2Vec is performing better with the 81% with ROC above 0.82 for all classes. While FastText gave accuracy 76% with ROC above 0.79

# Text Clustering

**K-Means**

K-means is a centroid-based clustering algorithm, where we calculate the distance between each data point and a centroid to assign it to a cluster.

As we already know number of classes in the data. We are fixing the number of clusters. Generally, the number of cluster will be determined based on the Elbow method.

Creating a variable to store the actual number of categories in the data

In [55]:
true_train_k = np.unique(train_df['target']).shape[0]
true_test_k = np.unique(test_df['target']).shape[0]

Using KMeans to cluster the data, by passing the vectors created using Word2Vec

In [104]:
km = KMeans(n_clusters = true_train_k, random_state = 0, n_init='auto')
km.fit(X_train)

In [105]:
pred = km.predict(X_test)

Metrics for the clusters created

In [106]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(train_df['target'], km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(train_df['target'], km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(train_df['target'], km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(train_df['target'], km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X_train, km.labels_, metric='euclidean'))

Homogeneity: 0.368
Completeness: 0.430
V-measure: 0.397
Adjusted Rand-Index: 0.389
Silhouette Coefficient: 0.042


Using KMeans to cluster the data, by passing the vectors created using FastText

In [107]:
km = KMeans(n_clusters = true_train_k, random_state = 0, n_init='auto')
km.fit(X_train_ft)

In [108]:
pred_ft = km.predict(X_test_ft)

Metrics for the clusters created

In [109]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(train_df['target'], km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(train_df['target'], km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(train_df['target'], km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(train_df['target'], km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X_train_ft, km.labels_, metric='euclidean'))

Homogeneity: 0.030
Completeness: 0.057
V-measure: 0.039
Adjusted Rand-Index: 0.027
Silhouette Coefficient: 0.169


**Advantages of KMeans:**


*   It is very easy to implement.
*   Tighter clusters are formed with K-means.
*   If we have large number of variables then, K-means would be faster than Hierarchical clustering.

**Disadvantages of KMeans:**



*   It is a bit difficult to predict the number of clusters i.e. the value of k.
*   It is not good in doing clustering job if the clusters have a complicated geometric shape.
*   Order of data will have strong impact on the final output.





**Summary based on Metrics**

Silhouette score returns the average silhouette coefficient applied on all the samples. The Silhouette Coefficient is calculated by using the mean of the distance of the intra-cluster and nearest cluster for all the samples.

The Silhouette score is comparitively better for the model with FastText (0.169).

V-Measure is a measure of the goodness of our clustering algorithm we can consider the harmonic average between homogeneity and completeness and obtain the V-measure.

Based on the V-Measure (0.397), the model with Word2Vec is better.

# Model Improvement

The dataset which we used in this assignment is the newsgroup, which is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. The possible improvements are

*  Using Lemmatization instead of Stemming technique to bring words to its proper base forms.
*  Expanding the abbrevations could add more value to the data.
*  Tuning the hyperparameters of the algorithms to give better results.