In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input mount,
# then echo them one per line in walk order.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/20-newsgroups/misc.forsale.txt
/kaggle/input/20-newsgroups/rec.autos.txt
/kaggle/input/20-newsgroups/list.csv
/kaggle/input/20-newsgroups/comp.os.ms-windows.misc.txt
/kaggle/input/20-newsgroups/sci.electronics.txt
/kaggle/input/20-newsgroups/comp.sys.mac.hardware.txt
/kaggle/input/20-newsgroups/talk.politics.mideast.txt
/kaggle/input/20-newsgroups/talk.politics.guns.txt
/kaggle/input/20-newsgroups/talk.religion.misc.txt
/kaggle/input/20-newsgroups/comp.graphics.txt
/kaggle/input/20-newsgroups/soc.religion.christian.txt
/kaggle/input/20-newsgroups/rec.sport.hockey.txt
/kaggle/input/20-newsgroups/rec.sport.baseball.txt
/kaggle/input/20-newsgroups/comp.windows.x.txt
/kaggle/input/20-newsgroups/comp.sys.ibm.pc.hardware.txt
/kaggle/input/20-newsgroups/rec.motorcycles.txt
/kaggle/input/20-newsgroups/sci.med.txt
/kaggle/input/20-newsgroups/sci.space.txt
/kaggle/input/20-newsgroups/alt.atheism.txt
/kaggle/input/20-newsgroups/sci.crypt.txt
/kaggle/input/20-newsgroups/talk.politics

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the train and test splits with identical settings; headers, footers and
# quoted replies are requested to be removed from every post.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, remove=('headers', 'footers', 'quotes'))
    for split_name in ('train', 'test')
)

# Documents are the features; the `target` arrays hold the category labels
# (later used as indices into `target_names`).
X_train = newsgroups_train.data
y_train = newsgroups_train.target
X_test = newsgroups_test.data
y_test = newsgroups_test.target

# Summarise what was loaded.
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))
print("Categories:", newsgroups_train.target_names)

Training set size: 11314
Test set size: 7532
Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [4]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK resources used by the preprocessing step below
# (tokenizer models first, then the stopword lists).
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Stopword sets are loaded from the NLTK corpus once and then reused; rebuilding
# the set inside preprocess_text for every document is loop-invariant work.
_STOPWORD_CACHE = {}


def _get_stop_words(language='english'):
    """Return the stopword set for ``language``, loading it on first use only."""
    cached = _STOPWORD_CACHE.get(language)
    if cached is None:
        cached = _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return cached


# Define preprocessing function
def preprocess_text(text):
    """Normalise a raw document into a space-joined string of content tokens.

    Steps, in order: lower-case the text, delete every character that is not
    an ASCII letter or whitespace, tokenize with ``word_tokenize``, drop
    English stopwords, and join the remaining tokens with single spaces.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned document as a single string (empty if nothing survives).
    """
    # Lowercase
    text = text.lower()
    # Non-letters are deleted rather than replaced by a space, so punctuation
    # inside a word fuses its parts (e.g. "years-old" becomes "yearsold").
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords using the cached set
    stop_words = _get_stop_words()
    # Join back into a string
    return " ".join(word for word in tokens if word not in stop_words)

# Run the same cleaning routine over every document of both splits.
X_train_clean = list(map(preprocess_text, X_train))
X_test_clean = list(map(preprocess_text, X_test))

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# TF-IDF over unigrams and bigrams, limited to at most 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# The vectorizer is fitted on the training split only ...
X_train_tfidf = vectorizer.fit_transform(X_train_clean)

# ... and the already-fitted vectorizer is then applied to the test split.
X_test_tfidf = vectorizer.transform(X_test_clean)

# Report the dimensions of both feature matrices.
for shape_label, feature_matrix in (
    ("Training TF-IDF shape:", X_train_tfidf),
    ("Test TF-IDF shape:", X_test_tfidf),
):
    print(shape_label, feature_matrix.shape)

Training TF-IDF shape: (11314, 5000)
Test TF-IDF shape: (7532, 5000)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a logistic-regression classifier on the TF-IDF features
# (fit() returns the estimator itself, so construction and training chain).
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Label every test document.
y_pred = clf.predict(X_test_tfidf)

# Overall accuracy followed by the per-category breakdown.
tfidf_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", tfidf_accuracy)
tfidf_report = classification_report(
    y_test,
    y_pred,
    target_names=newsgroups_train.target_names,
)
print("\nClassification Report:\n", tfidf_report)

Accuracy: 0.6334306956983536

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.43      0.43      0.43       319
           comp.graphics       0.53      0.62      0.57       389
 comp.os.ms-windows.misc       0.64      0.60      0.62       394
comp.sys.ibm.pc.hardware       0.63      0.59      0.61       392
   comp.sys.mac.hardware       0.68      0.62      0.65       385
          comp.windows.x       0.73      0.64      0.68       395
            misc.forsale       0.72      0.74      0.73       390
               rec.autos       0.67      0.61      0.64       396
         rec.motorcycles       0.44      0.74      0.55       398
      rec.sport.baseball       0.77      0.76      0.76       397
        rec.sport.hockey       0.87      0.82      0.84       399
               sci.crypt       0.82      0.63      0.71       396
         sci.electronics       0.50      0.54      0.52       393
                 sci.

In [7]:
# Example: push one unseen headline through clean -> TF-IDF -> classifier.
new_doc = ["Ford Mustang 1969 breaks the record"]
new_doc_clean = list(map(preprocess_text, new_doc))
new_doc_tfidf = vectorizer.transform(new_doc_clean)
predicted_category = clf.predict(new_doc_tfidf)

# One document in, one label out: translate that label into its newsgroup name.
predicted_label = predicted_category[0]
print("Predicted Category:", newsgroups_train.target_names[predicted_label])

Predicted Category: rec.autos


In [8]:
from nltk.tokenize import word_tokenize

# Split each cleaned training document into its list of tokens.
tokenized_train = list(map(word_tokenize, X_train_clean))

In [9]:
from gensim.models import Word2Vec

# Fit Word2Vec on the tokenized training documents:
# 100-dimensional vectors, context window of 5, words seen fewer than
# 2 times are ignored, 4 worker threads.
w2v_model = Word2Vec(
    sentences=tokenized_train,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# Optionally persist the trained model next to the notebook.
w2v_model.save("word2vec.model")

In [10]:
# Quick sanity check on the learned vectors: vocabulary size, then one sample vector.
trained_vectors = w2v_model.wv
print("Vocabulary size:", len(trained_vectors))
print("Vector for 'space':", trained_vectors['space'])

Vocabulary size: 37766
Vector for 'space': [-0.3371702   0.42458296  0.6542881   0.81963974  0.35731444 -0.11564608
 -0.8256642   1.6427228  -0.43374905  0.21915483 -0.37770844 -0.490688
 -0.89908856  0.44284     0.62875783 -0.05158376  0.68129224  0.3222319
  0.26204705 -1.8111628   0.63123554 -0.03237718 -0.06418905  0.30362436
  0.6107642   0.40036654 -0.7165361  -0.232868   -0.25195482  0.39870146
  0.0665701  -0.90802896 -1.3272429   0.5052152  -0.33399606 -0.08364765
 -0.4275814  -0.4172086  -1.8488071  -1.2803917  -0.28706822  0.4349651
  0.03069674  0.46281824  0.8970272   0.15681548 -0.9525128  -0.8646256
 -0.80863875 -0.09719582  0.3833773  -0.33230615 -1.3431838  -0.8684359
 -1.0022254  -0.78351927  1.685162    0.5043659  -1.0168439   0.21898514
 -0.99101126  0.6558249  -0.8908334   0.08653314 -1.9812384   1.0648988
 -0.46835923  0.61785996 -0.9427535   0.7977871  -1.8552308   0.8674219
  0.2899699  -0.81363463 -0.32532746 -0.4354655   0.16800524 -0.01639647
 -1.1689758   0.

In [11]:
import numpy as np

def get_document_embedding(text, model):
    """Embed a document as the mean of its in-vocabulary word vectors.

    Args:
        text: Document string; it is lower-cased and tokenized with
            ``word_tokenize`` before lookup.
        model: Either an object that exposes its word vectors through a
            ``.wv`` attribute (as the ``Word2Vec`` model trained in this
            notebook does) or a keyed-vector object that is indexed directly.
            In both cases it must provide ``vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all tokens found in the vocabulary, or all zeros when
        no token is found.
    """
    # Accept both model flavours so this helper keeps working after
    # `w2v_model` is rebound to the pre-trained keyed vectors later on.
    vectors = getattr(model, 'wv', model)
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Keep only tokens that have a vector; unknown words are skipped silently
    embeddings = [vectors[word] for word in tokens if word in vectors]
    if not embeddings:
        # Nothing recognisable: fall back to a zero vector of the right size
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

# Embed every cleaned document of each split and stack the vectors into one
# matrix per split (training first, then test).
X_train_w2v, X_test_w2v = (
    np.array([get_document_embedding(doc, w2v_model) for doc in split_docs])
    for split_docs in (X_train_clean, X_test_clean)
)

# Report the dimensions of both embedding matrices.
print("Training Word2Vec shape:", X_train_w2v.shape)
print("Test Word2Vec shape:", X_test_w2v.shape)

Training Word2Vec shape: (11314, 100)
Test Word2Vec shape: (7532, 100)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a logistic-regression classifier on the averaged Word2Vec embeddings
# (fit() returns the estimator itself).
clf_w2v = LogisticRegression(max_iter=1000).fit(X_train_w2v, y_train)

# Label every test document.
y_pred_w2v = clf_w2v.predict(X_test_w2v)

# Overall accuracy followed by the per-category breakdown.
w2v_accuracy = accuracy_score(y_test, y_pred_w2v)
print("Accuracy:", w2v_accuracy)
w2v_report = classification_report(
    y_test,
    y_pred_w2v,
    target_names=newsgroups_train.target_names,
)
print("\nClassification Report:\n", w2v_report)

Accuracy: 0.40639936271906535

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.24      0.26      0.25       319
           comp.graphics       0.35      0.36      0.35       389
 comp.os.ms-windows.misc       0.48      0.41      0.44       394
comp.sys.ibm.pc.hardware       0.38      0.39      0.39       392
   comp.sys.mac.hardware       0.34      0.21      0.26       385
          comp.windows.x       0.53      0.57      0.55       395
            misc.forsale       0.54      0.62      0.58       390
               rec.autos       0.33      0.31      0.32       396
         rec.motorcycles       0.34      0.37      0.36       398
      rec.sport.baseball       0.43      0.42      0.43       397
        rec.sport.hockey       0.63      0.59      0.61       399
               sci.crypt       0.61      0.53      0.57       396
         sci.electronics       0.30      0.26      0.28       393
                 sci

In [13]:
# Example: push one unseen sentence through clean -> mean embedding -> classifier.
new_doc = ["Foxtale moisturizer hydrates even dry skin"]
new_doc_clean = list(map(preprocess_text, new_doc))
new_doc_vectors = [get_document_embedding(doc, w2v_model) for doc in new_doc_clean]
new_doc_w2v = np.array(new_doc_vectors)
predicted_category = clf_w2v.predict(new_doc_w2v)

# One document in, one label out: translate that label into its newsgroup name.
predicted_label = predicted_category[0]
print("Predicted Category:", newsgroups_train.target_names[predicted_label])

Predicted Category: sci.med


In [14]:
from gensim.models import KeyedVectors

# Binary word2vec-format file with the pre-trained GoogleNews vectors.
word2vec_path = r"/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin"

# NOTE: this rebinds `w2v_model`. From here on it holds the loaded keyed
# vectors, not the Word2Vec model trained earlier in the notebook.
w2v_model = KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=True,
)

# Smoke test: look up a single word.
space_vector = w2v_model['space']
print("Vector for 'space':", space_vector)

Vector for 'space': [-0.11914062  0.078125    0.05566406 -0.06542969 -0.11523438 -0.19726562
  0.07373047 -0.03112793  0.03930664 -0.13769531 -0.04833984 -0.06542969
 -0.12304688 -0.14941406 -0.04467773 -0.06494141  0.06542969  0.11621094
  0.15820312  0.09179688 -0.01409912 -0.14941406 -0.12255859 -0.01843262
 -0.1015625   0.29101562 -0.05297852 -0.05981445  0.0168457  -0.25195312
 -0.26171875 -0.01965332  0.03662109 -0.0007782  -0.06640625 -0.34960938
  0.03735352  0.02502441  0.00668335 -0.07177734 -0.10546875 -0.10351562
  0.19140625  0.4296875  -0.07080078 -0.05615234 -0.09082031  0.203125
  0.06787109 -0.14355469  0.08935547 -0.09619141 -0.19726562 -0.16992188
 -0.18164062 -0.15332031  0.13769531 -0.08447266  0.24121094 -0.02124023
  0.08398438 -0.13574219 -0.05834961 -0.28710938 -0.23144531  0.03759766
 -0.01611328 -0.078125   -0.11132812  0.18359375  0.03015137 -0.03466797
  0.20019531 -0.09814453 -0.328125   -0.12792969  0.00531006  0.09570312
 -0.26171875  0.1640625   0.10449

In [15]:
import numpy as np
from nltk.tokenize import word_tokenize

def get_document_embedding(text, model):
    """Embed a document as the mean of its in-vocabulary word vectors.

    This redefinition replaces the earlier helper of the same name, so it
    accepts both kinds of model used in this notebook.

    Args:
        text: Document string; it is lower-cased and tokenized with
            ``word_tokenize`` before lookup.
        model: Either a keyed-vector object that is indexed directly (as the
            pre-trained vectors loaded in this notebook are) or an object that
            exposes its vectors through a ``.wv`` attribute. In both cases it
            must provide ``vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all tokens found in the vocabulary, or all zeros when
        no token is found.
    """
    # Use `.wv` when present, otherwise treat the object itself as the lookup,
    # so re-running earlier cells after this definition does not break.
    vectors = getattr(model, 'wv', model)
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Keep only tokens that have a vector; unknown words are skipped silently
    embeddings = [vectors[word] for word in tokens if word in vectors]
    if not embeddings:
        # Nothing recognisable: fall back to a zero vector of the right size
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

# Re-embed both splits with whatever `w2v_model` currently refers to and
# stack the vectors into one matrix per split (training first, then test).
X_train_w2v, X_test_w2v = (
    np.array([get_document_embedding(doc, w2v_model) for doc in split_docs])
    for split_docs in (X_train_clean, X_test_clean)
)

# Report the dimensions of both embedding matrices.
print("Training Word2Vec shape:", X_train_w2v.shape)
print("Test Word2Vec shape:", X_test_w2v.shape)

Training Word2Vec shape: (11314, 300)
Test Word2Vec shape: (7532, 300)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Refit the logistic-regression classifier on the current Word2Vec feature
# matrices (fit() returns the estimator itself).
clf_w2v = LogisticRegression(max_iter=1000).fit(X_train_w2v, y_train)

# Label every test document.
y_pred_w2v = clf_w2v.predict(X_test_w2v)

# Overall accuracy followed by the per-category breakdown.
w2v_accuracy = accuracy_score(y_test, y_pred_w2v)
print("Accuracy:", w2v_accuracy)
w2v_report = classification_report(
    y_test,
    y_pred_w2v,
    target_names=newsgroups_train.target_names,
)
print("\nClassification Report:\n", w2v_report)

Accuracy: 0.6103292618162507

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.40      0.39      0.39       319
           comp.graphics       0.56      0.58      0.57       389
 comp.os.ms-windows.misc       0.53      0.49      0.51       394
comp.sys.ibm.pc.hardware       0.51      0.49      0.50       392
   comp.sys.mac.hardware       0.53      0.46      0.49       385
          comp.windows.x       0.63      0.64      0.63       395
            misc.forsale       0.71      0.70      0.71       390
               rec.autos       0.46      0.69      0.55       396
         rec.motorcycles       0.70      0.64      0.67       398
      rec.sport.baseball       0.79      0.75      0.77       397
        rec.sport.hockey       0.80      0.83      0.82       399
               sci.crypt       0.66      0.63      0.65       396
         sci.electronics       0.55      0.50      0.52       393
                 sci.

In [17]:
# Example: push one unseen sentence through clean -> mean embedding -> classifier.
new_doc = ["This is a discussion about space exploration."]
new_doc_clean = list(map(preprocess_text, new_doc))
new_doc_vectors = [get_document_embedding(doc, w2v_model) for doc in new_doc_clean]
new_doc_w2v = np.array(new_doc_vectors)
predicted_category = clf_w2v.predict(new_doc_w2v)

# One document in, one label out: translate that label into its newsgroup name.
predicted_label = predicted_category[0]
print("Predicted Category:", newsgroups_train.target_names[predicted_label])

Predicted Category: sci.space


In [18]:
!pip install transformers datasets torch



In [19]:
from transformers import BertTokenizer, BertModel
import torch

# A single checkpoint name keeps the tokenizer and the encoder matched.
bert_checkpoint = 'bert-base-uncased'

# Pre-trained tokenizer for that checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

# Pre-trained BertModel for the same checkpoint (no classification head).
bert_model = BertModel.from_pretrained(bert_checkpoint)

In [20]:
def tokenize_and_encode(texts, max_length=128):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    The tokenizer is asked to pad the batch, truncate to ``max_length`` and
    return PyTorch tensors.

    Args:
        texts: The documents to encode, passed to the tokenizer unchanged.
        max_length: Length limit handed to the tokenizer (default 128).

    Returns:
        Whatever the tokenizer call returns for these settings.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'pt',  # PyTorch tensors
    }
    return tokenizer(texts, **tokenizer_options)

# Encode the cleaned training split first, then the cleaned test split.
train_encodings, test_encodings = map(
    tokenize_and_encode,
    (X_train_clean, X_test_clean),
)

In [22]:
from tqdm import tqdm  # Import tqdm for progress bars

def get_bert_embeddings_in_batches(encodings, model, batch_size=32):
    """Run ``model`` over ``encodings`` in batches and collect first-token embeddings.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors whose first dimension indexes the documents.
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``last_hidden_state``. It is moved to the GPU when
            one is available, otherwise to the CPU.
        batch_size: Number of documents per forward pass.

    Returns:
        A NumPy array with one row per document, holding the hidden state at
        sequence position 0 (the [CLS] position for BERT inputs).

    Raises:
        ValueError: From ``np.concatenate`` if ``encodings`` holds no documents.
    """
    # Move the model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']

    # Ceiling division so a final partial batch is counted by the progress bar
    num_batches = (len(input_ids) + batch_size - 1) // batch_size

    all_embeddings = []

    # Feature extraction must not depend on dropout: force inference mode for
    # the duration of the loop, then put the model back the way it was.
    was_training = model.training
    model.eval()
    try:
        for start in tqdm(range(0, len(input_ids), batch_size), total=num_batches, desc="Processing batches"):
            stop = start + batch_size
            batch_input_ids = input_ids[start:stop].to(device)
            batch_attention_mask = attention_mask[start:stop].to(device)

            # No gradients are needed for embedding extraction
            with torch.no_grad():
                outputs = model(batch_input_ids, attention_mask=batch_attention_mask)

            # Keep only the first token of each sequence, back on the CPU as NumPy
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(cls_embeddings)
    finally:
        model.train(was_training)

    # Concatenate all batches into a single array
    return np.concatenate(all_embeddings, axis=0)

# Embed the training split first, then the test split, 32 documents per batch.
X_train_bert, X_test_bert = (
    get_bert_embeddings_in_batches(split_encodings, bert_model, batch_size=32)
    for split_encodings in (train_encodings, test_encodings)
)

# Report the dimensions of both embedding matrices.
print("Training BERT shape:", X_train_bert.shape)
print("Test BERT shape:", X_test_bert.shape)

Processing batches:  46%|████▌     | 163/354 [19:38<23:00,  7.23s/it] 


KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a logistic-regression classifier on the BERT embeddings
# (fit() returns the estimator itself).
clf_bert = LogisticRegression(max_iter=1000).fit(X_train_bert, y_train)

# Label every test document.
y_pred_bert = clf_bert.predict(X_test_bert)

# Overall accuracy followed by the per-category breakdown.
bert_accuracy = accuracy_score(y_test, y_pred_bert)
print("Accuracy:", bert_accuracy)
bert_report = classification_report(
    y_test,
    y_pred_bert,
    target_names=newsgroups_train.target_names,
)
print("\nClassification Report:\n", bert_report)

In [None]:
# Example: Classify a new document with the BERT pipeline
new_doc = ["This is a discussion about space exploration."]
new_doc_clean = [preprocess_text(doc) for doc in new_doc]  # Preprocess the new document
new_doc_encodings = tokenize_and_encode(new_doc_clean)    # Tokenize the new document
# The embedding helper defined in this notebook is get_bert_embeddings_in_batches;
# the previously used name get_bert_embeddings is never defined and raised NameError.
new_doc_bert = get_bert_embeddings_in_batches(new_doc_encodings, bert_model)  # Convert to BERT embedding
predicted_category = clf_bert.predict(new_doc_bert)  # Predict category

print("Predicted Category:", newsgroups_train.target_names[predicted_category[0]])