In [28]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import torch
import torchvision
import torch.nn as nn
from nltk.corpus import stopwords
from torch.utils.data import DataLoader, TensorDataset
import re
from collections import Counter

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shatansh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv("./NLP3/train.csv")
print(len(df['text']))
print(df.head())

2000
   index                                               text  label
0      0  Zimbabwe annual inflation drops to 209 percent...      0
1      1  More than 1,000 dead in Haiti (09/23/04)-- A m...      0
2      2  Seven die in Japan  #39;suicide pact #39; Japa...      0
3      3  Security No. 1 for Afghan head President Hamid...      0
4      4  Palestinians say they hope Bush accepts dealin...      0


In [29]:
def preprocessText(text):
    text = text.lower()
    text = text.replace('\\', ' ')
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stopWordsList = set(stopwords.words('english'))
    newList = []
    for token in tokens:
        if token not in stopWordsList:
            newList.append(token)
    return newList

In [30]:
df['prep'] = df['text'].apply(preprocessText)
print(df['prep'].head())

0    [zimbabwe, annual, inflation, drops, percent, ...
1    [dead, haiti, mass, grave, haiti, holds, bodie...
2    [seven, die, japan, suicide, pact, japanese, p...
3    [security, afghan, head, president, hamid, kar...
4    [palestinians, say, hope, bush, accepts, deali...
Name: prep, dtype: object


In [31]:
# Preparing the vocabulary
vocabOfAllWords = []

for llist in df['prep']:
    for word in llist:
        vocabOfAllWords.append(word)

freqOfAllWords = Counter(vocabOfAllWords)

totalWords = len(vocabOfAllWords)
minFreq = 2
maxFreq = 0.95 * totalWords

vocab = set()
for word, freq in freqOfAllWords.items():
    if freq<=maxFreq and freq>=minFreq:
        vocab.add(word)

print(len(vocabOfAllWords)) # Total number of words
print(len(freqOfAllWords)) # Total number of unique words
print(len(vocab)) # Total number of words after filtration

50387
11856
6009


In [36]:
trainData, validationData = train_test_split(df, test_size=0.1, random_state=42)
print(len(trainData))
print(len(validationData))

w2vModel = Word2Vec(sentences=trainData['prep'], vector_size=300, window=5, min_count=1, workers=4)

1800
200


In [None]:
# Function to generate feature vector for a given list of words
def getFeatures(wordsList, model, numFeatures):
    featureVec = np.zeros((num_features,), dtype="float32")
    nWords = 0
    
    for word in wordsList:
        if word in model.wv:
            nWords += 1
            featureVec = np.add(featureVec, model.wv[word])
            
    if nWords > 0:
        featureVec = np.divide(featureVec, nWords)
        
    return featureVec

## Training the model using Neural Networks

In [None]:
class NeuralNetwork(nn.Module):
    def __init__(self, inputDim, outputDim):
        super(NeuralNetwork, self).__init__()
        self.layerOne = nn.Linear(inputDim, 128)
        self.relu = nn.ReLU()
        self.layerTwo = nn.Linear(128, 64)
        self.layerThree = nn.Linear(64, outputDim)
        
    def forward(self, x):
        out = self.layerOne(x)
        out = self.relu(out)
        out = self.layerTwo(out)
        out = self.relu(out)
        out = self.layerThree(out)
        return out

In [33]:
def train_w2v_model(train_X, train_y, val_X, val_y, num_epochs=10):
    model = W2VClassifier(input_dim=300, output_dim=4)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for i in range(len(train_X)):
            optimizer.zero_grad()
            output = model(torch.tensor(train_X[i]).unsqueeze(0))
            loss = criterion(output, torch.tensor([train_y[i]]))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for i in range(len(val_X)):
                output = model(torch.tensor(val_X[i]).unsqueeze(0))
                loss = criterion(output, torch.tensor([val_y[i]]))
                val_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_X)}, Val Loss: {val_loss/len(val_X)}')

    return model

NameError: name 'model' is not defined

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs):
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
        
        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pt')

test_data = pd.read_csv('./NLP3/test.csv')
test_data['text'] = test_data['text'].apply(preprocess_text)
X_test = np.array([generate_feature_vectors(text, word2vec_model, 100) for text in test_data['text']])
y_test = label_encoder.transform(test_data['label'])

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

model = SimpleNN(input_dim=100, output_dim=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50)

best_model = SimpleNN(input_dim=100, output_dim=4)
best_model.load_state_dict(torch.load('best_model.pt'))
best_model.eval()

In [None]:
def evaluate_model(model, test_loader):
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
    
    return all_labels, all_preds

y_test_true, y_test_pred = evaluate_model(best_model, test_loader)

test_accuracy = accuracy_score(y_test_true, y_test_pred)
test_macro_f1 = f1_score(y_test_true, y_test_pred, average='macro')

print("Test Accuracy:", test_accuracy)
print("Test Macro-F1 Score:", test_macro_f1)

def plot_confusion_matrix(y_true, y_pred, labels):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

def plot_classification_report(y_true, y_pred, target_names):
    report = classification_report(y_true, y_pred, target_names=target_names, output_dict=True, zero_division=1)
    plt.figure(figsize=(8, 6))
    sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True, cmap='Blues')
    plt.title('Classification Report')
    plt.show()

target_names = ['World', 'Sports', 'Business', 'Science/Technology']

plot_confusion_matrix(y_test_true, y_test_pred, labels=[0, 1, 2, 3])

plot_classification_report(y_test_true, y_test_pred, target_names)

In [39]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import confusion_matrix, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Load the CSV files
train_data = pd.read_csv('NLP3/train.csv')
test_data = pd.read_csv('NLP3/test.csv')

# Data Preprocessing
def preprocess_data(train_data, test_data):
    # Convert to lowercase
    train_data['text'] = train_data['text'].lower()
    test_data['text'] = test_data['text'].str.lower()

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    train_data['text'] = train_data['text'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    test_data['text'] = test_data['text'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))


    # Create vocabulary
    vocab = train_data['text'].str.split().explode().unique()
    min_df = 5  # Minimum document frequency
    max_df = 0.9  # Maximum document frequency
    vocab = vocab[(vocab.str.len() > 1) & (train_data['text'].str.contains(vocab, case=False).sum() >= min_df) & (train_data['text'].str.contains(vocab, case=False).mean() <= max_df)]
    
    return train_data, test_data, vocab

# Word2Vec Model
def get_word2vec_embeddings(train_data, test_data, vocab):
    # Train Word2Vec model
    w2v_model = Word2Vec(sentences=train_data['text'].str.split(), vector_size=300, window=5, min_count=1, workers=4)
    
    # Get word embeddings
    train_embeddings = [w2v_model.wv[word] for word in train_data['text'].str.split().explode() if word in w2v_model.wv]
    test_embeddings = [w2v_model.wv[word] for word in test_data['text'].str.split().explode() if word in w2v_model.wv]
    
    # Compute average embeddings for each sentence
    train_X = [np.mean(emb, axis=0) for emb in train_embeddings]
    test_X = [np.mean(emb, axis=0) for emb in test_embeddings]
    
    return train_X, test_X

# NN Layers
class W2VClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(W2VClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Training and Evaluation
def train_w2v_model(train_X, train_y, val_X, val_y, num_epochs=10):
    model = W2VClassifier(input_dim=300, output_dim=4)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for i in range(len(train_X)):
            optimizer.zero_grad()
            output = model(torch.tensor(train_X[i]).unsqueeze(0))
            loss = criterion(output, torch.tensor([train_y[i]]))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for i in range(len(val_X)):
                output = model(torch.tensor(val_X[i]).unsqueeze(0))
                loss = criterion(output, torch.tensor([val_y[i]]))
                val_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_X)}, Val Loss: {val_loss/len(val_X)}')

    return model

# Task 1A
# Preprocess the data
train_data = pd.read_csv('NLP3/train.csv')
test_data = pd.read_csv('NLP3/test.csv')
train_data, test_data, vocab = preprocess_data(train_data, test_data)

# Get Word2Vec embeddings
train_X, test_X = get_word2vec_embeddings(train_data, test_data, vocab)

# Split the training data into train and validation sets
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(train_X, train_data['label'], test_size=0.1, random_state=42)

# Train the W2V model
model = train_w2v_model(train_X, train_data['label'], val_X, val_y, num_epochs=10)

# Evaluate the model on the test set
test_preds = [model(torch.tensor(x).unsqueeze(0)).argmax().item() for x in test_X]
test_acc = (test_data['label'] == test_preds).mean()
test_f1 = f1_score(test_data['label'], test_preds, average='macro')

print(f'Test Accuracy: {test_acc:.4f}')
print(f'Test Macro F1-Score: {test_f1:.4f}')

# Save the predictions
test_data['preds'] = test_preds
test_data[['text', 'label', 'preds']].to_csv('w2v_test.csv', index=False)


AttributeError: 'numpy.ndarray' object has no attribute 'str'