<a href="https://colab.research.google.com/github/Sehajbirsingh/ML-2/blob/main/Task_1ML2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.preprocessing import MinMaxScaler

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
X_train = newsgroups_train.data
y_train = newsgroups_train.target
X_test = newsgroups_test.data
y_test = newsgroups_test.target

In [6]:
def train_and_evaluate(classifier, X_train, X_test, y_train, y_test):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report

In [7]:
feature_extractors = {
    'CountVectorizer': CountVectorizer(),
    'TF-IDF': TfidfVectorizer(),
    'Word2Vec': None,  # Will be handled separately
    'Doc2Vec': None   # Will be handled separately
}

algorithms = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(max_iter=10000),
    'LinearSVC': LinearSVC(max_iter=10000),
    'DecisionTreeClassifier': DecisionTreeClassifier()
}

In [8]:
# 4. Perform benchmark analysis
results = {}
for feature_name, extractor in feature_extractors.items():
    for algo_name, algorithm in algorithms.items():
        if feature_name in ['Word2Vec', 'Doc2Vec']:
            # Handle Word2Vec and Doc2Vec separately
            if feature_name == 'Word2Vec':
                # Train Word2Vec model
                sentences = [nltk.word_tokenize(doc) for doc in X_train]
                model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)
                # Get document vectors by averaging word vectors
                X_train_vec = [np.mean([model.wv[word] for word in nltk.word_tokenize(doc) if word in model.wv] or [np.zeros(100)], axis=0) for doc in X_train]
                X_test_vec = [np.mean([model.wv[word] for word in nltk.word_tokenize(doc) if word in model.wv] or [np.zeros(100)], axis=0) for doc in X_test]

                # Scale feature vectors to non-negative values using MinMaxScaler for MultinomialNB compatibility
                if algo_name == 'MultinomialNB':
                    scaler = MinMaxScaler()
                    X_train_vec = scaler.fit_transform(X_train_vec)
                    X_test_vec = scaler.transform(X_test_vec)

            else:  # Doc2Vec
                # Train Doc2Vec model
                documents = [TaggedDocument(nltk.word_tokenize(doc), [i]) for i, doc in enumerate(X_train)]
                model = Doc2Vec(documents, vector_size=100, window=5, min_count=5, workers=4)
                # Get document vectors
                X_train_vec = [model.infer_vector(nltk.word_tokenize(doc)) for doc in X_train]
                X_test_vec = [model.infer_vector(nltk.word_tokenize(doc)) for doc in X_test]

                # Scale feature vectors to non-negative values using MinMaxScaler for MultinomialNB compatibility
                if algo_name == 'MultinomialNB':
                    scaler = MinMaxScaler()
                    X_train_vec = scaler.fit_transform(X_train_vec)
                    X_test_vec = scaler.transform(X_test_vec)

            accuracy, report = train_and_evaluate(algorithm, X_train_vec, X_test_vec, y_train, y_test)
        else:
            # Use CountVectorizer or TF-IDF
            X_train_vec = extractor.fit_transform(X_train)
            X_test_vec = extractor.transform(X_test)
            accuracy, report = train_and_evaluate(algorithm, X_train_vec, X_test_vec, y_train, y_test)

        results[(feature_name, algo_name)] = {
            'accuracy': accuracy,
            'report': report
        }

# Print the results
for (feature_name, algo_name), result in results.items():
    print(f"Feature Extractor: {feature_name}, Algorithm: {algo_name}")
    print(f"Accuracy: {result['accuracy']}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Feature Extractor: CountVectorizer, Algorithm: MultinomialNB
Accuracy: 0.5431492299522039
Feature Extractor: CountVectorizer, Algorithm: LogisticRegression
Accuracy: 0.596388741370154
Feature Extractor: CountVectorizer, Algorithm: LinearSVC
Accuracy: 0.5720924057355284
Feature Extractor: CountVectorizer, Algorithm: DecisionTreeClassifier
Accuracy: 0.4212692511949018
Feature Extractor: TF-IDF, Algorithm: MultinomialNB
Accuracy: 0.6062134891131173
Feature Extractor: TF-IDF, Algorithm: LogisticRegression
Accuracy: 0.6744556558682953
Feature Extractor: TF-IDF, Algorithm: LinearSVC
Accuracy: 0.6919808815719597
Feature Extractor: TF-IDF, Algorithm: DecisionTreeClassifier
Accuracy: 0.4038767923526288
Feature Extractor: Word2Vec, Algorithm: MultinomialNB
Accuracy: 0.19928305894848647
Feature Extractor: Word2Vec, Algorithm: LogisticRegression
Accuracy: 0.34877854487519916
Feature Extractor: Word2Vec, Algorithm: LinearSVC
Accuracy: 0.34891131173659057
Feature Extractor: Word2Vec, Algorithm: Deci