<a href="https://colab.research.google.com/github/RJ-Stony/A-Complete-Guide-to-TM/blob/main/05)Dimensionality_Reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dimension Reduction using PCA

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four of the twenty newsgroup topics
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Fetch the train split, then the test split, with identical options.
# Headers, footers and quoted replies are removed so that each message is
# classified purely by its content rather than by hints in the metadata.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split,
                       remove=('headers', 'footers', 'quotes'),
                       categories=categories)
    for split in ('train', 'test')
)

In [5]:
# Download the NLTK stopwords corpus; it is read by stopwords.words("english")
# in the next cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
# NOTE(review): cachedStopWords is not referenced in the visible cells (the
# tokenizer filters with english_stops instead) - confirm whether a later
# cell needs it before removing.
cachedStopWords = stopwords.words("english")

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

# Documents and labels of the train split
X_train, y_train = newsgroups_train.data, newsgroups_train.target

# Documents and labels of the test split
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# A token is a run of three or more word characters / apostrophes.
# The pattern is a raw string so that "\w" reaches the regex engine untouched
# instead of being parsed as an (invalid) Python string escape.
RegTok = RegexpTokenizer(r"[\w']{3,}")
english_stops = set(stopwords.words('english'))     # set -> fast membership tests
_porter_stemmer = PorterStemmer()                   # built once, reused for every token

def tokenizer(text):
  """Turn a raw document into a list of stemmed, stopword-free tokens.

  The text is lower-cased, split with ``RegTok``, filtered against
  ``english_stops`` and each remaining token is reduced to its Porter stem.

  Args:
    text: the document as a single string.

  Returns:
    A list of stemmed tokens, in document order.
  """
  tokens = RegTok.tokenize(text.lower())
  # No explicit length check: the {3,} quantifier in RegTok already
  # guarantees that every token has at least three characters.
  return [_porter_stemmer.stem(token) for token in tokens if token not in english_stops]

# The custom tokenizer replaces the vectorizer's built-in regex tokenisation;
# token_pattern=None states that explicitly and avoids scikit-learn's
# "token_pattern will not be used since tokenizer is not None" UserWarning.
tfidf = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
X_train_tfidf = tfidf.fit_transform(X_train)      # Fit on the train set and transform it
X_test_tfidf = tfidf.transform(X_test)            # Transform the test set with the fitted vectorizer

In [8]:
from sklearn.linear_model import LogisticRegression
# Baseline: default logistic regression trained on the full TF-IDF features
LR_clf = LogisticRegression().fit(X_train_tfidf, y_train)

# Report the classifier's score on the train and test splits
print(
    '# Train set score: {:.3f}'.format(LR_clf.score(X_train_tfidf, y_train)),
    '# Test set score: {:.3f}'.format(LR_clf.score(X_test_tfidf, y_test)),
    sep='\n',
)

# Train set score: 0.962
# Test set score: 0.761


In [9]:
from sklearn.decomposition import PCA

# Project the TF-IDF features onto 2000 principal components.
# The sparse matrices are densified with .toarray() before being handed to PCA;
# the projection is fitted on the train matrix and then applied to the test matrix.
pca = PCA(random_state=7, n_components=2000)
X_train_pca, X_test_pca = (
    pca.fit_transform(X_train_tfidf.toarray()),
    pca.transform(X_test_tfidf.toarray()),
)

# Compare dimensionality before/after and show how much variance is retained
print('Original tfidf matrix shape:', X_train_tfidf.shape)
print('PCA Converted matrix shape:', X_train_pca.shape)
print("Sum of explained variance ratio: {:.3f}".format(pca.explained_variance_ratio_.sum()))

Original tfidf matrix shape: (2034, 20085)
PCA Converted matrix shape: (2034, 2000)
Sum of explained variance ratio: 1.000


In [10]:
# Re-train the same classifier object, this time on the PCA-projected features
LR_clf.fit(X_train_pca, y_train)
print(
    '# Train set score: {:.3f}'.format(LR_clf.score(X_train_pca, y_train)),
    '# Test set score: {:.3f}'.format(LR_clf.score(X_test_pca, y_test)),
    sep='\n',
)

# Train set score: 0.962
# Test set score: 0.761


In [12]:
import numpy as np

# L1-penalised logistic regression on the TF-IDF features
lasso_clf = LogisticRegression(C=1, solver='liblinear', penalty='l1')
lasso_clf.fit(X_train_tfidf, y_train)

print(
    '# Train set score: {:.3f}'.format(lasso_clf.score(X_train_tfidf, y_train)),
    '# Test set score: {:.3f}'.format(lasso_clf.score(X_test_tfidf, y_test)),
    sep='\n',
)

# Count the coefficients that are non-zero after fitting.
# NOTE(review): this counts entries of the whole coef_ array; if coef_ holds one
# row per class, a term used by several classes is counted once per class - confirm.
print(
    "# Used features count: {}".format(np.count_nonzero(lasso_clf.coef_)),
    "out of",
    X_train_tfidf.shape[1],
)

# Train set score: 0.790
# Test set score: 0.718
# Used features count: 321 out of 20085


In [13]:
# 321 components: the same number as the non-zero lasso coefficients reported above
pca = PCA(random_state=7, n_components=321)

# Fit the projection on the densified train matrix, then apply it to the test matrix
X_train_pca, X_test_pca = (
    pca.fit_transform(X_train_tfidf.toarray()),
    pca.transform(X_test_tfidf.toarray()),
)
print('PCA Converted X shape:', X_train_pca.shape)
print("Sum of explained variance ratio: {:.3f}".format(pca.explained_variance_ratio_.sum()))

# Re-train the shared classifier on the reduced features and report its scores
LR_clf.fit(X_train_pca, y_train)
print(
    '# Train set score: {:.3f}'.format(LR_clf.score(X_train_pca, y_train)),
    '# Test set score: {:.3f}'.format(LR_clf.score(X_test_pca, y_test)),
    sep='\n',
)

PCA Converted X shape: (2034, 321)
Sum of explained variance ratio: 0.437
# Train set score: 0.875
# Test set score: 0.751


In [14]:
# Reduce further, down to 100 principal components
pca = PCA(random_state=7, n_components=100)

# Fit the projection on the densified train matrix, then apply it to the test matrix
X_train_pca, X_test_pca = (
    pca.fit_transform(X_train_tfidf.toarray()),
    pca.transform(X_test_tfidf.toarray()),
)
print('PCA Converted X shape:', X_train_pca.shape)
print("Sum of explained variance ratio: {:.3f}".format(pca.explained_variance_ratio_.sum()))

# Re-train the shared classifier on the reduced features and report its scores
LR_clf.fit(X_train_pca, y_train)
print(
    '# Train set score: {:.3f}'.format(LR_clf.score(X_train_pca, y_train)),
    '# Test set score: {:.3f}'.format(LR_clf.score(X_test_pca, y_test)),
    sep='\n',
)

PCA Converted X shape: (2034, 100)
Sum of explained variance ratio: 0.211
# Train set score: 0.807
# Test set score: 0.738


## Dimension Reduction and Semantics using LSA