<a href="https://colab.research.google.com/github/PALBIBEK/Bengali.AI-Handwritten-Grapheme-Classification/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import re
import string
import torch

# Download the NLTK tokenizer data. Recent NLTK releases load the `punkt_tab`
# resource inside word_tokenize while older ones load `punkt`, so fetch both;
# nltk.download() is a no-op for resources that are already up to date.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Fetch the 20 Newsgroups dataset (train + test portions together; the
# notebook makes its own split further down)
newsgroups_data = fetch_20newsgroups(subset='all')
documents = newsgroups_data.data
labels = newsgroups_data.target

# Function to preprocess text data
def preprocess(text):
    """Normalise one raw document into a space-separated string of kept words.

    Steps, in order: delete every run of digits, delete ASCII punctuation,
    lowercase, then drop scikit-learn's English stop words.
    """
    without_digits = re.sub(r'\d+', '', text)
    without_punct = without_digits.translate(str.maketrans('', '', string.punctuation))
    candidates = without_punct.lower().split()
    kept = [token for token in candidates if token not in ENGLISH_STOP_WORDS]
    return ' '.join(kept)

# Preprocess all documents
processed_documents = [preprocess(doc) for doc in documents]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(processed_documents, labels, test_size=0.2, random_state=42)

# Tokenize the training documents
tokenized_docs = [word_tokenize(doc) for doc in X_train]

# Preview only the first 20 tokens of the first five documents; printing the
# full documents floods the notebook output.
print([tokens[:20] for tokens in tokenized_docs[:5]])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using device: cpu
[['mahantgvcom', 'patrick', 'l', 'mahan', 'subject', 'just', 'newsgroup', 'dead', 'organization', 'internet', 'lines', 'nntppostinghost', 'enterpoopmitedu', 'xpertexpolcsmitedu', 'rlmhelensurfctycom', 'ive', 'gotten', 'posts', 'group', 'couple', 'days', 'recently', 'added', 'feed', 'list', 'just', 'group', 'near', 'death', 'seen', 'mailing', 'list', 'im', 'getting', 'right', 'traffic', 'patrick', 'l', 'mahan', 'tgv', 'window', 'washer', 'mahantgvcom', 'waking', 'person', 'unnecessarily', 'considered', 'lazarus', 'long', 'capital', 'crime', 'offense', 'notebooks', 'lazarus', 'long', 'patrick', 'l', 'mahan', 'tgv', 'window', 'washer', 'mahantgvcom', 'waking', 'person', 'unnecessarily', 'considered', 'lazarus', 'long', 'capital', 'crime', 'offense', 'notebooks', 'lazarus', 'long'], ['maxqueernetorg', 'max', 'j', 'rochlin', 'subject', 'speeding', 'ticket', 'chp', 'organization', 'queernet', 'lines', 'interesting', 'id', 'fight', 'ticket', 'theres', 'chance', 'cop', 'wont'

In [None]:
# Train the Word2Vec model on the tokenized documents
# Passing `sentences` to the constructor builds the vocabulary and trains in
# one step. No `seed` is given and workers=4, so the learned vectors are not
# expected to be identical between runs.
w2v_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=2, workers=4)

# Save the trained Word2Vec model
# Written to the current working directory.
w2v_model.save("word2vec.model")

# Load the Word2Vec model (optional, can be used to load a pre-trained model)
# This rebinds `w2v_model` to the copy that was just written above.
w2v_model = Word2Vec.load("word2vec.model")

# Function to get the average word vector for a document
def get_document_vector(doc, model):
    """Return the mean Word2Vec vector of the in-vocabulary words of `doc`.

    Args:
        doc: Preprocessed document text; it is tokenized with `word_tokenize`.
        model: Trained gensim `Word2Vec` model exposing `.wv` and `.vector_size`.

    Returns:
        A 1-D array of length `model.vector_size`. When no token of `doc` is in
        the vocabulary a float32 zero vector is returned, so every row has the
        same dtype as the averaged vectors (gensim stores float32) and the rows
        stack into a single homogeneous matrix.
    """
    keyed_vectors = model.wv
    known_vectors = [keyed_vectors[token] for token in word_tokenize(doc) if token in keyed_vectors]
    if not known_vectors:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0)

# Represent each document in the training and testing sets by averaging word vectors.
# Collect the rows into ONE ndarray first: torch.tensor() on a Python list of
# ndarrays takes a very slow element-wise path and emits a UserWarning.
X_train_vecs = np.asarray([get_document_vector(doc, w2v_model) for doc in X_train], dtype=np.float32)
X_test_vecs = np.asarray([get_document_vector(doc, w2v_model) for doc in X_test], dtype=np.float32)

# Convert document vectors and labels to torch tensors and move them to the device.
# torch.as_tensor() also accepts an existing tensor, so re-running this cell
# does not trigger the "copy construct from a tensor" warning for the labels.
X_train_vecs = torch.from_numpy(X_train_vecs).to(device)
X_test_vecs = torch.from_numpy(X_test_vecs).to(device)
y_train = torch.as_tensor(y_train).to(device)
y_test = torch.as_tensor(y_test).to(device)

# Train a logistic regression classifier on the document vectors.
# scikit-learn runs on the CPU only, so hand it plain NumPy arrays.
clf = LogisticRegression(max_iter=1000, solver='liblinear')
clf.fit(X_train_vecs.cpu().numpy(), y_train.cpu().numpy())

# Make predictions on the testing set
y_pred = clf.predict(X_test_vecs.cpu().numpy())

# Evaluate the classifier's performance
y_true = y_test.cpu().numpy()
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_true, y_pred, target_names=newsgroups_data.target_names))

  X_train_vecs = torch.tensor(X_train_vecs).to(device)


Accuracy: 0.6594
                          precision    recall  f1-score   support

             alt.atheism       0.54      0.56      0.55       151
           comp.graphics       0.54      0.57      0.55       202
 comp.os.ms-windows.misc       0.65      0.65      0.65       195
comp.sys.ibm.pc.hardware       0.48      0.52      0.50       183
   comp.sys.mac.hardware       0.59      0.39      0.47       205
          comp.windows.x       0.73      0.73      0.73       215
            misc.forsale       0.63      0.69      0.66       193
               rec.autos       0.63      0.64      0.64       196
         rec.motorcycles       0.58      0.78      0.67       168
      rec.sport.baseball       0.73      0.65      0.69       211
        rec.sport.hockey       0.72      0.81      0.76       198
               sci.crypt       0.86      0.88      0.87       201
         sci.electronics       0.61      0.48      0.54       202
                 sci.med       0.70      0.76      0.73   

In [None]:
# Sanity check: (number of training documents, embedding dimension)
X_train_vecs.shape

torch.Size([15076, 100])

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import re
import string
import torch
import torch.nn as nn
from tqdm import tqdm

# Download the NLTK tokenizer data. Recent NLTK releases load `punkt_tab` in
# word_tokenize, older ones load `punkt`; fetching both keeps the cell working
# on either. Already-present resources are skipped by nltk.download().
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Fetch the 20 Newsgroups dataset (both the train and test portions)
newsgroups_data = fetch_20newsgroups(subset='all')
documents = newsgroups_data.data
labels = newsgroups_data.target

# Function to preprocess text data
def preprocess(text):
    """Strip digits and punctuation, lowercase, and remove English stop words.

    Returns the surviving words joined by single spaces.
    """
    cleaned = re.sub(r'\d+', '', text).translate(str.maketrans('', '', string.punctuation))
    words = cleaned.lower().split()
    return ' '.join(filter(lambda word: word not in ENGLISH_STOP_WORDS, words))

# Preprocess all documents with tqdm progress bar
processed_documents = [preprocess(doc) for doc in tqdm(documents, desc="Preprocessing documents")]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(processed_documents, labels, test_size=0.2, random_state=42)

# Tokenize the training documents with tqdm progress bar
tokenized_docs = [word_tokenize(doc) for doc in tqdm(X_train, desc="Tokenizing documents")]

# Train the Word2Vec model on the tokenized training documents.
# Supplying `sentences` to the constructor already builds the vocabulary and
# runs all `epochs` passes. A further w2v_model.train(...) call would train a
# second time with the learning-rate schedule restarted, so it is not made.
w2v_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=2, workers=4, epochs=10)

# Save the trained Word2Vec model
w2v_model.save("word2vec.model")

# Load the Word2Vec model (optional, can be used to load a pre-trained model)
w2v_model = Word2Vec.load("word2vec.model")

# Define the RNN model
class RNNDocumentEmbedder(nn.Module):
    """Single-layer LSTM that maps a sequence of word vectors to one document vector.

    The module is used purely as a feature extractor: `forward` runs without
    gradient tracking and returns a NumPy array, so its weights are not
    trainable through this interface.
    """

    def __init__(self, input_size, hidden_size):
        super(RNNDocumentEmbedder, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.hidden_size = hidden_size

    def forward(self, word_vecs):
        """Encode one document.

        Args:
            word_vecs: Sequence of equal-length word vectors, shape (seq_len, input_size).

        Returns:
            float32 NumPy array of shape (1, hidden_size): the LSTM's final hidden state.
        """
        # Build one contiguous array first; torch.tensor() on a list of
        # ndarrays is extremely slow and emits a UserWarning.
        sequence = torch.tensor(np.asarray(word_vecs, dtype=np.float32))
        # Follow the module's own device rather than a notebook global.
        module_device = next(self.parameters()).device
        batch = sequence.unsqueeze(0).to(module_device)  # add the batch dimension
        with torch.no_grad():  # inference only: skip building an autograd graph
            _, (hn, _) = self.rnn(batch)
        return hn.squeeze(0).cpu().numpy()

# Initialize the RNN embedder.
# No optimizer is ever attached to `embedder` in this notebook, so its LSTM
# keeps its random initial weights. Seed the initialisation so that the
# document features (and the downstream accuracy) are reproducible.
torch.manual_seed(42)
embedder = RNNDocumentEmbedder(input_size=w2v_model.vector_size, hidden_size=100).to(device)

# Function to get the document vector using the RNN
def get_document_vector_rnn(doc, model, embedder):
    """Embed `doc` by running its in-vocabulary word vectors through `embedder`.

    Args:
        doc: Preprocessed document text; tokenized with `word_tokenize`.
        model: Trained gensim `Word2Vec` model (its `.wv` is queried).
        embedder: `RNNDocumentEmbedder` instance.

    Returns:
        Array of shape (1, embedder.hidden_size). A document with no known
        word yields float32 zeros of that same shape, matching what the
        embedder returns, so results for a corpus stack into one regular array.
    """
    keyed_vectors = model.wv
    word_vecs = [keyed_vectors[word] for word in word_tokenize(doc) if word in keyed_vectors]
    if word_vecs:
        return embedder(word_vecs)
    return np.zeros((1, embedder.hidden_size), dtype=np.float32)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using device: cpu


Preprocessing documents: 100%|██████████| 18846/18846 [00:09<00:00, 1960.85it/s]
Tokenizing documents: 100%|██████████| 15076/15076 [00:11<00:00, 1260.44it/s]


In [None]:
# Represent each document in the training and testing sets by processing through the RNN
def _embed_corpus(docs, desc):
    """Run every document in `docs` through the RNN embedder and stack the results."""
    vectors = []
    for doc in tqdm(docs, desc=desc):
        vectors.append(get_document_vector_rnn(doc, w2v_model, embedder))
    return np.array(vectors)


X_train_vecs = _embed_corpus(X_train, "Vectorizing train documents")
X_test_vecs = _embed_corpus(X_test, "Vectorizing test documents")


Vectorizing train documents: 100%|██████████| 15076/15076 [01:41<00:00, 148.30it/s]
Vectorizing test documents: 100%|██████████| 3770/3770 [00:25<00:00, 149.01it/s]


In [None]:
# Check the shapes of the document vectors
print(f"Shape of X_train_vecs: {X_train_vecs.shape}")
print(f"Shape of X_test_vecs: {X_test_vecs.shape}")

# Flatten (n_docs, 1, hidden) to (n_docs, hidden). reshape() leaves an already
# 2-D array unchanged, so this cell can be re-run; squeeze(1) would raise on
# the second run because axis 1 no longer has length 1.
X_train_vecs = X_train_vecs.reshape(len(X_train_vecs), -1)
X_test_vecs = X_test_vecs.reshape(len(X_test_vecs), -1)

# Train a logistic regression classifier on the document vectors
clf = LogisticRegression(max_iter=1000, solver='liblinear')
clf.fit(X_train_vecs, y_train)

# Make predictions on the testing set
y_pred = clf.predict(X_test_vecs)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred, target_names=newsgroups_data.target_names))


Shape of X_train_vecs: (15076, 1, 100)
Shape of X_test_vecs: (3770, 1, 100)
Accuracy: 0.3045
                          precision    recall  f1-score   support

             alt.atheism       0.42      0.45      0.43       151
           comp.graphics       0.18      0.15      0.16       202
 comp.os.ms-windows.misc       0.20      0.10      0.13       195
comp.sys.ibm.pc.hardware       0.18      0.19      0.18       183
   comp.sys.mac.hardware       0.19      0.10      0.13       205
          comp.windows.x       0.23      0.21      0.22       215
            misc.forsale       0.28      0.35      0.31       193
               rec.autos       0.27      0.31      0.29       196
         rec.motorcycles       0.33      0.41      0.36       168
      rec.sport.baseball       0.31      0.32      0.31       211
        rec.sport.hockey       0.41      0.55      0.47       198
               sci.crypt       0.32      0.42      0.36       201
         sci.electronics       0.16      0.08   

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import re
import string
import torch
import torch.nn as nn
from tqdm import tqdm

# Download the NLTK tokenizer data: `punkt` for older NLTK releases and
# `punkt_tab`, which recent releases require for word_tokenize.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Fetch the 20 Newsgroups dataset (both the train and test portions)
newsgroups_data = fetch_20newsgroups(subset='all')
documents = newsgroups_data.data
labels = newsgroups_data.target

# Function to preprocess text data
def preprocess(text):
    """Clean one document: no digits, no punctuation, lowercase, no stop words."""
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalised = re.sub(r'\d+', '', text).translate(punctuation_table).lower()
    survivors = []
    for word in normalised.split():
        if word in ENGLISH_STOP_WORDS:
            continue
        survivors.append(word)
    return ' '.join(survivors)

# Preprocess all documents with tqdm progress bar
processed_documents = [preprocess(doc) for doc in tqdm(documents, desc="Preprocessing documents")]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(processed_documents, labels, test_size=0.2, random_state=42)

# Tokenize the training documents with tqdm progress bar
tokenized_docs = [word_tokenize(doc) for doc in tqdm(X_train, desc="Tokenizing documents")]

# Train the Word2Vec model on the tokenized training documents.
# The constructor trains for `epochs` passes when `sentences` is supplied, so
# the former extra .train(..., epochs=10) call (which doubled the training
# and restarted the learning-rate decay) has been dropped.
w2v_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=2, workers=4, epochs=10)

# Save the trained Word2Vec model
w2v_model.save("word2vec.model")

# Load the Word2Vec model (optional, can be used to load a pre-trained model)
w2v_model = Word2Vec.load("word2vec.model")

# Define the RNN model
class RNNDocumentEmbedder(nn.Module):
    """LSTM feature extractor: sequence of word vectors in, one document vector out.

    `forward` runs under `torch.no_grad()` and returns NumPy, so the weights
    cannot be trained through this interface.
    """

    def __init__(self, input_size, hidden_size):
        super(RNNDocumentEmbedder, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.hidden_size = hidden_size

    def forward(self, word_vecs):
        """Return the final LSTM hidden state for one document.

        Args:
            word_vecs: Sequence of word vectors, shape (seq_len, input_size).

        Returns:
            float32 NumPy array of shape (1, hidden_size).
        """
        # One ndarray -> one tensor; a list of ndarrays would hit torch's slow
        # conversion path and emit a UserWarning.
        sequence = torch.tensor(np.asarray(word_vecs, dtype=np.float32))
        # Use the device the LSTM weights live on instead of a notebook global.
        module_device = next(self.parameters()).device
        with torch.no_grad():  # no gradients are needed for feature extraction
            _, (hn, _) = self.rnn(sequence.unsqueeze(0).to(module_device))
        return hn.squeeze(0).cpu().numpy()

# Initialize the RNN embedder.
# The optimizer below is built from `model.parameters()` only, so this LSTM
# keeps its random initial weights; seed them so runs are comparable.
torch.manual_seed(42)
embedder = RNNDocumentEmbedder(input_size=w2v_model.vector_size, hidden_size=100).to(device)

# Function to get the document vector using the RNN
def get_document_vector_rnn(doc, model, embedder):
    """Embed `doc` with `embedder`, using only words known to the Word2Vec `model`.

    Returns:
        Array of shape (1, embedder.hidden_size). When no token is in the
        vocabulary, float32 zeros of the same shape are returned so that the
        DataLoader's default collation can stack every sample of a batch.
    """
    keyed_vectors = model.wv
    word_vecs = [keyed_vectors[word] for word in word_tokenize(doc) if word in keyed_vectors]
    if not word_vecs:
        return np.zeros((1, embedder.hidden_size), dtype=np.float32)
    return embedder(word_vecs)

# Define a custom dataset class
class TextDataset(torch.utils.data.Dataset):
    """Dataset of (document vector, label) pairs produced by the RNN embedder.

    Args:
        documents: Indexable collection of preprocessed document strings.
        labels: Indexable collection of integer class ids, aligned with `documents`.
        model: Trained gensim `Word2Vec` model.
        embedder: `RNNDocumentEmbedder` used to turn a document into a vector.

    Document vectors are memoised per index: tokenizing and embedding is by
    far the slowest part of fetching an item, and `get_document_vector_rnn`
    returns a detached NumPy array, so no gradient reaches the embedder through
    this dataset. The cache assumes the embedder's weights stay fixed.
    """

    def __init__(self, documents, labels, model, embedder):
        self.documents = documents
        self.labels = labels
        self.model = model
        self.embedder = embedder
        self._vector_cache = {}  # idx -> float32 tensor, filled lazily

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, idx):
        doc_vector = self._vector_cache.get(idx)
        if doc_vector is None:
            raw_vector = get_document_vector_rnn(self.documents[idx], self.model, self.embedder)
            doc_vector = torch.tensor(raw_vector, dtype=torch.float32)
            self._vector_cache[idx] = doc_vector
        return doc_vector, torch.tensor(self.labels[idx], dtype=torch.long)

# Initialize datasets
# Both splits share the same Word2Vec model and the same embedder instance.
train_dataset = TextDataset(X_train, y_train, w2v_model, embedder)
test_dataset = TextDataset(X_test, y_test, w2v_model, embedder)

# Create DataLoaders
# Only the training loader shuffles; the test loader keeps dataset order.
# No collate_fn is given, so the default collation is used.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the model, loss function, and optimizer
class RNNClassifier(nn.Module):
    """Single-layer LSTM followed by a linear layer that produces class logits."""

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, word_vecs):
        """Map a (batch, seq_len, input_size) tensor to (batch, num_classes) logits."""
        _, (final_hidden, _) = self.rnn(word_vecs)
        # final_hidden is (1, batch, hidden): take the only layer's state.
        return self.fc(final_hidden[-1])

# Initialize the model, loss function, and optimizer
num_classes = len(newsgroups_data.target_names)
model = RNNClassifier(input_size=w2v_model.vector_size, hidden_size=100, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    # The batch labels are named `targets` so the loop does not overwrite the
    # global `labels` array that holds the whole dataset's labels.
    for inputs, targets in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# Evaluation
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    # `targets` (not `labels`) keeps the global dataset `labels` array intact.
    for inputs, targets in tqdm(test_loader, desc="Evaluating"):
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)  # index of the largest logit per row
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(targets.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(all_labels, all_preds, target_names=newsgroups_data.target_names))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cpu


Preprocessing documents: 100%|██████████| 18846/18846 [00:02<00:00, 8613.51it/s]
Tokenizing documents: 100%|██████████| 15076/15076 [00:11<00:00, 1365.91it/s]
  word_vecs = torch.tensor(word_vecs).float().unsqueeze(0).to(device)  # Add batch dimension and move to device
Training Epoch 1/10: 100%|██████████| 472/472 [01:33<00:00,  5.07it/s]


Epoch [1/10], Loss: 2.7790


Training Epoch 2/10: 100%|██████████| 472/472 [01:30<00:00,  5.19it/s]


Epoch [2/10], Loss: 2.4865


Training Epoch 3/10: 100%|██████████| 472/472 [01:40<00:00,  4.72it/s]


Epoch [3/10], Loss: 2.4002


Training Epoch 4/10: 100%|██████████| 472/472 [01:33<00:00,  5.05it/s]


Epoch [4/10], Loss: 2.3473


Training Epoch 5/10: 100%|██████████| 472/472 [01:31<00:00,  5.15it/s]


Epoch [5/10], Loss: 2.3092


Training Epoch 6/10: 100%|██████████| 472/472 [01:30<00:00,  5.19it/s]


Epoch [6/10], Loss: 2.2731


Training Epoch 7/10: 100%|██████████| 472/472 [01:31<00:00,  5.16it/s]


Epoch [7/10], Loss: 2.2405


Training Epoch 8/10: 100%|██████████| 472/472 [01:31<00:00,  5.18it/s]


Epoch [8/10], Loss: 2.2114


Training Epoch 9/10: 100%|██████████| 472/472 [01:30<00:00,  5.20it/s]


Epoch [9/10], Loss: 2.1817


Training Epoch 10/10: 100%|██████████| 472/472 [01:30<00:00,  5.20it/s]


Epoch [10/10], Loss: 2.1528


Evaluating: 100%|██████████| 118/118 [00:24<00:00,  4.75it/s]

Accuracy: 0.3427
                          precision    recall  f1-score   support

             alt.atheism       0.45      0.46      0.45       151
           comp.graphics       0.19      0.16      0.17       202
 comp.os.ms-windows.misc       0.17      0.11      0.13       195
comp.sys.ibm.pc.hardware       0.16      0.13      0.14       183
   comp.sys.mac.hardware       0.22      0.12      0.16       205
          comp.windows.x       0.19      0.27      0.22       215
            misc.forsale       0.34      0.34      0.34       193
               rec.autos       0.31      0.39      0.34       196
         rec.motorcycles       0.36      0.45      0.40       168
      rec.sport.baseball       0.40      0.31      0.35       211
        rec.sport.hockey       0.44      0.58      0.50       198
               sci.crypt       0.38      0.46      0.42       201
         sci.electronics       0.18      0.13      0.16       202
                 sci.med       0.28      0.30      0.29   




In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.datasets import fetch_20newsgroups
import nltk
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


# Load the dataset with fetch_20newsgroups' default arguments
data = fetch_20newsgroups()
X = data.data
y = data.target


def preprocess(text):
    """Replace digit runs with a space, drop punctuation, lowercase, remove stop words."""
    spaced = re.sub(r'\d+', ' ', text)
    stripped = spaced.translate(str.maketrans('', '', string.punctuation))
    words = stripped.lower().split()
    return ' '.join(word for word in words if word not in ENGLISH_STOP_WORDS)


preprocess_texts = list(map(preprocess, X))
preprocess_texts[:5]

['lerxstwamumdedu wheres thing subject car nntppostinghost rac wamumdedu organization university maryland college park lines wondering enlighten car saw day door sports car looked late s early s called bricklin doors really small addition bumper separate rest body know tellme model engine specs years production car history info funky looking car email thanks il brought neighborhood lerxst',
 'guykuocarsonuwashingtonedu guy kuo subject si clock poll final summary final si clock reports keywords siaccelerationclockupgrade articleid shelley qvfo innc s organization university washington lines nntppostinghost carsonuwashingtonedu fair number brave souls upgraded si clock oscillator shared experiences poll send brief message detailing experiences procedure speed attained cpu rated speed add cards adapters heat sinks hour usage day floppy disk functionality m floppies especially requested summarizing days add network knowledge base clock upgrade havent answered poll thanks guy kuo guykuouwas

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.datasets import fetch_20newsgroups
import nltk
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Fetch the dataset
data = fetch_20newsgroups()
X, y = data.data, data.target


def preprocess(text):
    """Return `text` cleaned of digits, punctuation, case and English stop words."""
    punctuation = frozenset(string.punctuation)
    # Digit runs become a space so the neighbouring words stay separate.
    without_digits = re.sub(r'\d+', ' ', text)
    without_punct = ''.join(ch for ch in without_digits if ch not in punctuation)
    kept = []
    for word in without_punct.lower().split():
        if word not in ENGLISH_STOP_WORDS:
            kept.append(word)
    return ' '.join(kept)


# Apply preprocessing to all texts
preprocessed_texts = [preprocess(raw) for raw in X]
print(preprocessed_texts[:5])




In [None]:
import string
import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

data = fetch_20newsgroups()
X = data.data
y = data.target


def process(text):
    """Clean one document: digits -> space, punctuation removed, lowercased, stop words dropped."""
    delete_punctuation = dict.fromkeys(map(ord, string.punctuation))
    normalised = re.sub(r'\d+', ' ', text).translate(delete_punctuation).lower()
    return ' '.join(filter(lambda word: word not in ENGLISH_STOP_WORDS, normalised.split()))


process_text = [process(raw) for raw in X]
process_text[:5]

['lerxstwamumdedu wheres thing subject car nntppostinghost rac wamumdedu organization university maryland college park lines wondering enlighten car saw day door sports car looked late s early s called bricklin doors really small addition bumper separate rest body know tellme model engine specs years production car history info funky looking car email thanks il brought neighborhood lerxst',
 'guykuocarsonuwashingtonedu guy kuo subject si clock poll final summary final si clock reports keywords siaccelerationclockupgrade articleid shelley qvfo innc s organization university washington lines nntppostinghost carsonuwashingtonedu fair number brave souls upgraded si clock oscillator shared experiences poll send brief message detailing experiences procedure speed attained cpu rated speed add cards adapters heat sinks hour usage day floppy disk functionality m floppies especially requested summarizing days add network knowledge base clock upgrade havent answered poll thanks guy kuo guykuouwas

In [None]:
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# Hold out 20% of the documents. The original passed `random_state=True`;
# bool is an int subclass, so that acted as the integer seed 1. The seed is
# now written explicitly so the intent is clear and the value passes strict
# parameter validation.
X_train, X_test, Y_train, Y_test = train_test_split(process_text, y, test_size=0.2, random_state=1)

# One token list per training document (the input format Word2Vec expects)
all_sentence_all_tokens = [word_tokenize(text) for text in X_train]


In [None]:
# Build the model without a corpus so that the constructor does not train,
# then build the vocabulary and train exactly once for 10 epochs. Passing the
# corpus to the constructor AND calling train() trained the model twice, with
# the learning-rate schedule restarted for the second run.
word2vec = Word2Vec(vector_size=100)
word2vec.build_vocab(all_sentence_all_tokens)
word2vec.train(all_sentence_all_tokens, total_examples=word2vec.corpus_count, epochs=10)



(12056576, 13735890)

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import re
import string
import torch
import torch.nn as nn
from tqdm import tqdm

# Download the NLTK tokenizer data: `punkt` for older NLTK releases and
# `punkt_tab`, which recent releases require for word_tokenize.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Fetch the 20 Newsgroups dataset (both the train and test portions)
newsgroups_data = fetch_20newsgroups(subset='all')
documents = newsgroups_data.data
labels = newsgroups_data.target

# Function to preprocess text data
def preprocess(text):
    """Return `text` with digits, punctuation and English stop words removed, lowercased."""
    digits_removed = re.sub(r'\d+', '', text)
    punctuation_removed = digits_removed.translate(str.maketrans('', '', string.punctuation))
    lowered_words = punctuation_removed.lower().split()
    return ' '.join(word for word in lowered_words if word not in ENGLISH_STOP_WORDS)

# Preprocess all documents with tqdm progress bar
processed_documents = [preprocess(doc) for doc in tqdm(documents, desc="Preprocessing documents")]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(processed_documents, labels, test_size=0.2, random_state=42)

# Tokenize the training documents with tqdm progress bar
tokenized_docs = [word_tokenize(doc) for doc in tqdm(X_train, desc="Tokenizing documents")]

# Train the Word2Vec model on the tokenized training documents.
# With `sentences` supplied, the constructor itself trains for `epochs`
# passes; calling .train() again would repeat the training with a restarted
# learning-rate schedule, so that call has been removed.
w2v_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=2, workers=4, epochs=10)

# Save the trained Word2Vec model
w2v_model.save("word2vec.model")

# Load the Word2Vec model (optional, can be used to load a pre-trained model)
w2v_model = Word2Vec.load("word2vec.model")

# Define the improved combined model
class ImprovedRNNClassifier(nn.Module):
    """Bidirectional LSTM encoder with a two-layer classification head.

    The final hidden states of the two directions are concatenated, passed
    through Linear -> ReLU -> Dropout, and projected to `num_classes` logits.
    """

    def __init__(self, embedding_size, hidden_size, num_classes, dropout_prob=0.5):
        super().__init__()
        self.embedder = nn.LSTM(embedding_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, word_vecs):
        """Return class logits of shape (batch, num_classes) for a batch of sequences."""
        _, (final_hidden, _) = self.embedder(word_vecs)
        # One layer, two directions: final_hidden holds exactly two states.
        forward_state, backward_state = final_hidden
        features = torch.cat((forward_state, backward_state), dim=1)
        hidden = self.dropout(self.relu(self.fc1(features)))
        return self.fc2(hidden)

# Define a custom dataset class
class TextDataset(torch.utils.data.Dataset):
    """Dataset yielding (word-vector sequence, label) pairs.

    Args:
        documents: Indexable collection of preprocessed document strings.
        labels: Indexable collection of integer class ids, aligned with `documents`.
        model: Trained gensim `Word2Vec` model (`.wv` and `.vector_size` are used).

    Each item is a float32 tensor of shape (n_known_words, vector_size) and a
    scalar int64 label tensor. Sequence length differs between documents, so
    batching needs a collate function that pads or packs the sequences.
    """

    def __init__(self, documents, labels, model):
        self.documents = documents
        self.labels = labels
        self.model = model

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, idx):
        keyed_vectors = self.model.wv
        tokens = word_tokenize(self.documents[idx])
        vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if vectors:
            # Stack into one ndarray first: torch.tensor() on a list of
            # ndarrays is extremely slow and emits a UserWarning.
            matrix = np.asarray(vectors, dtype=np.float32)
        else:
            # No known word: a single all-zero step keeps the sequence non-empty.
            matrix = np.zeros((1, self.model.vector_size), dtype=np.float32)
        return torch.from_numpy(matrix), torch.tensor(self.labels[idx], dtype=torch.long)

# Initialize datasets
train_dataset = TextDataset(X_train, y_train, w2v_model)
test_dataset = TextDataset(X_test, y_test, w2v_model)


def pack_collate(batch):
    """Collate variable-length (sequence, label) samples into one batch.

    The default collate function stacks samples and fails with
    "stack expects each tensor to be equal size" because every document has a
    different number of word vectors. Packing keeps the true lengths, so the
    LSTM's final hidden states are not affected by padding.

    Args:
        batch: List of (sequence tensor of shape (len_i, dim), scalar label tensor).

    Returns:
        (PackedSequence, label tensor of shape (batch,)). `nn.LSTM` accepts the
        PackedSequence directly and it supports `.to(device)`.
    """
    sequences, targets = zip(*batch)
    # enforce_sorted=False: samples need not be ordered by length, and the
    # LSTM returns its final hidden states in the original batch order.
    packed = nn.utils.rnn.pack_sequence(list(sequences), enforce_sorted=False)
    return packed, torch.stack(targets)


# Create DataLoaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pack_collate)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=pack_collate)

# Initialize the improved model, loss function, and optimizer
num_classes = len(newsgroups_data.target_names)
improved_model = ImprovedRNNClassifier(embedding_size=w2v_model.vector_size, hidden_size=100, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(improved_model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    improved_model.train()
    running_loss = 0.0
    # Batch labels are called `targets` so the global `labels` array (the
    # labels of the whole dataset) is not overwritten by the loop.
    for inputs, targets in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = improved_model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# Evaluation
improved_model.eval()  # disables dropout for inference
all_preds = []
all_labels = []
with torch.no_grad():
    # `targets` (not `labels`) keeps the global dataset `labels` array intact.
    for inputs, targets in tqdm(test_loader, desc="Evaluating"):
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = improved_model(inputs)
        _, preds = torch.max(outputs, 1)  # index of the largest logit per row
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(targets.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(all_labels, all_preds, target_names=newsgroups_data.target_names))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using device: cpu


Preprocessing documents: 100%|██████████| 18846/18846 [00:10<00:00, 1835.38it/s]
Tokenizing documents: 100%|██████████| 15076/15076 [00:18<00:00, 834.96it/s]
  word_vecs = torch.tensor(word_vecs, dtype=torch.float32)
Training Epoch 1/10:   0%|          | 0/472 [00:00<?, ?it/s]


RuntimeError: stack expects each tensor to be equal size, but got [314, 100] at entry 0 and [45, 100] at entry 1

1
