### Load and preprocess the dataset

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt


# Read the raw CSV and replace every missing cell with an empty string so the
# string handling below never receives NaN.
data_path = '4.csv'
new_df = pd.read_csv(data_path).fillna('')


def _clean_description(raw):
    """Replace non-word characters with spaces, then lower-case and trim."""
    spaced = re.sub(r'\W', ' ', str(raw))
    return spaced.lower().strip()


# Normalised text column used by the exploration and modelling cells.
new_df['text'] = new_df['short_description'].apply(_clean_description)

# Integer class ids; pd.factorize numbers the categories in order of first
# appearance in the file.
label_codes = pd.factorize(new_df['category'])[0]
new_df['label'] = label_codes

# Stratified 60/20/20 partition: 20% is held out for testing, then a quarter
# of the remaining 80% becomes the validation set.
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    new_df, test_size=0.2, stratify=new_df['category'], random_state=42,
)
train_df, valid_df = train_test_split(
    train_df, test_size=0.25, stratify=train_df['category'], random_state=42,
)

for caption, frame in (
    ("Training set size:", train_df),
    ("Validation set size:", valid_df),
    ("Test set size:", test_df),
):
    print(caption, len(frame))

In [None]:
# Preview the first rows of the loaded frame (shown as the notebook cell output).
new_df.head()

Perform an exploration of the data

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Perform an analysis of the most common terms for each category
def get_common_terms(text_series, top_n=10):
    """Return the ``top_n`` most frequent whitespace-separated tokens.

    Args:
        text_series: Iterable of strings (e.g. a pandas Series of texts).
        top_n: Maximum number of (term, count) pairs to return.

    Returns:
        A list of ``(term, count)`` tuples ordered from most to least frequent.
    """
    # Join everything into one string first so a single split() tokenises
    # the whole corpus.
    tokens = ' '.join(text_series).split()
    return Counter(tokens).most_common(top_n)


# One horizontal bar chart of the most frequent tokens for every category.
categories = new_df['category'].unique()
for category in categories:
    common_terms = get_common_terms(new_df[new_df['category'] == category]['text'])
    if not common_terms:
        # A category whose texts are all empty yields no terms; zip(*[]) would
        # then fail to unpack into (terms, counts), so skip the plot instead.
        print(f'No terms found for category: {category}')
        continue
    terms, counts = zip(*common_terms)
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(counts), y=list(terms))
    plt.title(f'Most Common Terms in Category: {category}')
    plt.xlabel('Count')
    plt.ylabel('Terms')
    plt.show()

# Perform an analysis of the sentence length for each category
# (length = number of whitespace-separated tokens in the cleaned text).
new_df['text_length'] = new_df['text'].apply(lambda x: len(x.split()))
length_stats = new_df.groupby('category')['text_length'].describe()
print(length_stats)

# Check the blank values.
# The frame was loaded with fillna(''), so isnull() alone can only report
# zeros here; empty strings are counted as well so that originally missing
# cells remain visible.
missing_values = new_df.isnull().sum()
print("Missing values in each column:\n", missing_values)
blank_values = (new_df == '').sum()
print("Empty-string values in each column:\n", blank_values)

# Check the outliers: one box of text lengths per category.
plt.figure(figsize=(12, 6))
sns.boxplot(x='category', y='text_length', data=new_df)
plt.xlabel('Category')
plt.ylabel('Text Length')
plt.title('Boxplot of Text Length by Category')
plt.show()


### Task1.3 Perform an exploration of the data.

#### Perform an analysis of the most common terms for each category. 

**Result**:
- **TRAVEL**: "the" (over 8000 times), "to", "of", "a", "and", "in", "you", "s", "is", "for".
- **WEIRD NEWS**: "the" (over 800 times), "a", "to", "s", "it", "of", "in", "and", "is", "you".

We can see that "the" is the most common term in the two categories. The most frequent terms are mainly common English stop words, which contribute little to the classification model and should be removed during pre-processing.

#### The length of sentences in each category

**Result**:
- **TRAVEL**: The average sentence length is about 27.36 words, the standard deviation is 13.76, and the distribution of sentence lengths is relatively uniform.
- **WEIRD NEWS**: The average sentence length is about 8.60 words, the standard deviation is 7.36, and the sentences are short and concentrated.

Sentences in the TRAVEL category were widely distributed, with a maximum length of 166 words, while sentences in the WEIRD NEWS category were relatively short, with a maximum length of 50 words. It is observed that the difference in sentence length might have an effect on the classification task.

The null check reports no missing values in any column. Note that missing entries were already replaced with empty strings when the data was loaded, so this check can only return zero. No additional filling is needed and the data can be used directly for subsequent analysis.

The box plot shows the distribution of sentence length. The sentence length of the TRAVEL category is widely distributed, and there are some sentences with longer length in the TRAVEL category, while the sentence length of the WEIRD NEWS category is mainly concentrated in 0-50 words, which might affect classification.


### Task 2. Data Preparation & Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 60/20/20 split into training, development and test sets:
# 20% is held out for testing, then a quarter of the remaining 80% becomes
# the development (validation) set.
train_df, test_df = train_test_split(
    new_df, test_size=0.2, stratify=new_df['category'], random_state=42,
)
train_df, valid_df = train_test_split(
    train_df, test_size=0.25, stratify=train_df['category'], random_state=42,
)

# Persist each split so that later cells can reload it from disk.
for split_frame, csv_name in (
    (train_df, 'train.csv'),
    (valid_df, 'valid.csv'),
    (test_df, 'test.csv'),
):
    split_frame.to_csv(csv_name, index=False)


The data set is split into training, development and test sets in a ratio of 60:20:20. Since we are dealing with an imbalanced dataset where the ratio of TRAVEL to WEIRD NEWS is 3:1, the 'stratify' parameter is used to ensure that each class is proportionally represented in the training set, development set, and test set, which keeps the class distribution consistent across the splits.

 Load train.csv valid.csv files. Apply preprocessing steps.

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Reload the persisted training and validation splits (training file first).
train_df, valid_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'valid.csv')
)

# Define text preprocessing functions
def preprocess_text(text):
    """Normalise one document before TF-IDF vectorisation.

    The text is lower-cased, words of one or two characters are dropped,
    remaining punctuation is removed, and runs of whitespace are collapsed to
    a single space. Leading and trailing whitespace is stripped so that a
    removed first or last word does not leave a stray space behind.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Drop very short words (1-2 characters) before touching punctuation.
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Apply the preprocess function to the text column
train_df['text'] = train_df['text'].apply(preprocess_text)
valid_df['text'] = valid_df['text'].apply(preprocess_text)

# Handle NaN values.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# leaves the frame unchanged under copy-on-write.
train_df['text'] = train_df['text'].fillna('')
valid_df['text'] = valid_df['text'].fillna('')

# Use TF-IDF for text representation.
# The vocabulary is fitted on the training split only and then reused for the
# validation split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['text'])
X_valid = vectorizer.transform(valid_df['text'])

y_train = train_df['label']
y_valid = valid_df['label']


print("Training set text representation shape:", X_train.shape)
print("Validation set text representation shape:", X_valid.shape)


The text is converted to lower case, and words of one or two characters, punctuation and extra spaces are removed, as these were noticed while examining the data. The text is then represented as feature vectors using TF-IDF, with a maximum of 5000 features. Finally, the TF-IDF representation of the training set and validation set is obtained. The training set has dimensions (4800, 5000), while the validation set has dimensions (1600, 5000).

In [None]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Reload the previously saved training and validation splits (training file
# first).
train_df, valid_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'valid.csv')
)

# Define text preprocessing functions
def preprocess_text(text):
    """Normalise one document before TF-IDF vectorisation.

    The text is lower-cased, words of one or two characters are dropped,
    remaining punctuation is removed, and runs of whitespace are collapsed to
    a single space. Leading and trailing whitespace is stripped so that a
    removed first or last word does not leave a stray space behind.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Drop very short words (1-2 characters) before touching punctuation.
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Apply the function
train_df['text'] = train_df['text'].apply(preprocess_text)
valid_df['text'] = valid_df['text'].apply(preprocess_text)

# Handle NaN.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# leaves the frame unchanged under copy-on-write.
train_df['text'] = train_df['text'].fillna('')
valid_df['text'] = valid_df['text'].fillna('')

# Use TF-IDF for text representation (vocabulary fitted on training data only)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['text'])
X_valid = vectorizer.transform(valid_df['text'])

y_train = train_df['label']
y_valid = valid_df['label']

# Class names in label-id order, read from the data itself. The ids come from
# pd.factorize (order of first appearance in the raw file), so a hard-coded
# name list is only correct if it happens to match that order.
label_lookup = train_df.drop_duplicates('label').sort_values('label')
label_ids = label_lookup['label'].tolist()
label_names = label_lookup['category'].astype(str).tolist()

# Train a random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Save the random forest model
joblib.dump(rf_model, 'rf_model.pkl')

# Training naive Bayes models
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Save the naive Bayes model
joblib.dump(nb_model, 'nb_model.pkl')

# Load the model (round-trips through disk so the saved artefacts are the
# ones being evaluated)
rf_model_loaded = joblib.load('rf_model.pkl')
nb_model_loaded = joblib.load('nb_model.pkl')

# Make predictions on validation sets
rf_valid_preds = rf_model_loaded.predict(X_valid)
nb_valid_preds = nb_model_loaded.predict(X_valid)


rf_valid_accuracy = accuracy_score(y_valid, rf_valid_preds)
nb_valid_accuracy = accuracy_score(y_valid, nb_valid_preds)

# Passing labels= keeps the report aligned with label_names even when a class
# is absent from the predictions; zero_division=0 reports 0 for a class that
# is never predicted instead of emitting a warning.
print("Random Forest - Valid Accuracy:", rf_valid_accuracy)
print(classification_report(y_valid, rf_valid_preds, labels=label_ids,
                            target_names=label_names, zero_division=0))

print("Naive Bayes - Valid Accuracy:", nb_valid_accuracy)
print(classification_report(y_valid, nb_valid_preds, labels=label_ids,
                            target_names=label_names, zero_division=0))


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from tqdm import tqdm

# Define the data set class
class TextDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label."""

    def __init__(self, texts, labels):
        # Both containers are kept as given and indexed in parallel.
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.texts[idx]
        target = self.labels[idx]
        return sample, target

# Convert data to tensors: the sparse TF-IDF matrices are densified to
# float32, and the label columns become int64 class ids.
X_train_tensor, X_valid_tensor = (
    torch.tensor(matrix.toarray(), dtype=torch.float32)
    for matrix in (X_train, X_valid)
)
y_train_tensor, y_valid_tensor = (
    torch.tensor(target.values, dtype=torch.long)
    for target in (y_train, y_valid)
)

train_dataset = TextDataset(X_train_tensor, y_train_tensor)
valid_dataset = TextDataset(X_valid_tensor, y_valid_tensor)

# Load the data in mini-batches of 64; only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=64)

# CNN model
class SimpleCNNTextClassificationModel(nn.Module):
    """1-D convolutional classifier over a flat feature vector.

    The input vector is treated as a single-channel sequence: one
    convolution (50 filters, kernel 3, padding 1) keeps the length, max
    pooling halves it, and two linear layers map the flattened result to
    class logits.

    Args:
        input_dim: Number of features per sample.
        num_classes: Number of output logits. Defaults to 3, the value that
            was previously hard-coded.
    """

    def __init__(self, input_dim, num_classes=3):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=50, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)
        # Pooling with kernel 2 floors the length, hence input_dim // 2.
        self.fc1 = nn.Linear(50 * (input_dim // 2), 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, num_classes)`` logits."""
        x = x.unsqueeze(1)  # add the channel axis Conv1d expects
        x = self.pool(torch.relu(self.conv1(x)))
        # flatten() also copes with an empty batch, where view(..., -1) cannot
        # infer the size.
        x = x.flatten(start_dim=1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# Size the network from the number of TF-IDF features actually produced.
input_dim = X_train.shape[1]
model = SimpleCNNTextClassificationModel(input_dim)

# Loss functions and optimizers
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
# Per-epoch history, plotted at the end of this cell.
train_losses = []
valid_losses = []
train_accuracies = []
valid_accuracies = []

for epoch in range(num_epochs):
    # ---- training pass: weights are updated after every batch ----
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # loss.item() is scaled by the batch size so that dividing by the
        # dataset size below weights every sample equally.
        running_loss += loss.item() * inputs.size(0)
        # The predicted class is the index of the largest logit.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = correct / total
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_accuracy)
    
    # ---- validation pass: eval mode, no gradient tracking, no updates ----
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(valid_loader, desc=f"Validating Epoch {epoch+1}/{num_epochs}"):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    epoch_loss = running_loss / len(valid_loader.dataset)
    epoch_accuracy = correct / total
    valid_losses.append(epoch_loss)
    valid_accuracies.append(epoch_accuracy)
    
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Train Accuracy: {train_accuracies[-1]:.4f}, Valid Loss: {valid_losses[-1]:.4f}, Valid Accuracy: {valid_accuracies[-1]:.4f}")

# Save the model (the weights after the final epoch)
torch.save(model.state_dict(), 'simple_cnn_text_classification_model.pth')


# Learning curves: loss on the left, accuracy on the right.
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(range(1, num_epochs+1), train_losses, label='Train Loss')
plt.plot(range(1, num_epochs+1), valid_losses, label='Valid Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(1, num_epochs+1), train_accuracies, label='Train Accuracy')
plt.plot(range(1, num_epochs+1), valid_accuracies, label='Valid Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

plt.show()



As training progresses, the model learns and improves: accuracy on both the training and validation sets increases. At epoch 10, the training set loss is 0.2622 and the accuracy is 0.8900; the validation set loss is 0.3164 and the accuracy is 0.8562.
The overall trend is upward, and the final validation accuracy is close to the training accuracy. This shows that the model performs well on both the training set and the validation set.

 ### Task3: Evaluate model performance on training and validation sets

In [None]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Load the model weights saved after training
model.load_state_dict(torch.load('simple_cnn_text_classification_model.pth'))

# Evaluate model performance on validation sets
model.eval()
correct = 0
total = 0
valid_preds = []
with torch.no_grad():
    for inputs, labels in valid_loader:
        outputs = model(inputs)
        # The predicted class is the index of the largest logit.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        valid_preds.extend(predicted.cpu().numpy())
valid_accuracy = correct / total
print(f"Validation Accuracy: {valid_accuracy:.4f}")


y_valid_pred = np.array(valid_preds)

# Class names in label-id order, read from the data itself. The ids come from
# pd.factorize (order of first appearance in the raw file), so a hard-coded
# name list is only correct if it happens to match that order.
label_lookup = train_df.drop_duplicates('label').sort_values('label')
label_ids = label_lookup['label'].tolist()
label_names = label_lookup['category'].astype(str).tolist()

# labels= keeps the rows aligned with label_names even if a class is missing
# from the predictions; zero_division=0 reports 0 for such a class instead of
# emitting a warning.
print(classification_report(y_valid_tensor.numpy(), y_valid_pred, labels=label_ids,
                            target_names=label_names, zero_division=0))


In [None]:
# Pair every validation text with its true and predicted class id.
true_ids = y_valid_tensor.numpy()
error_analysis_df = pd.DataFrame({
    'text': valid_df['text'].values,
    'true_label': true_ids,
    'predicted_label': y_valid_pred,
})

# Keep only the rows where the prediction disagrees with the true label.
misclassified = error_analysis_df['true_label'].ne(error_analysis_df['predicted_label'])
error_analysis_df = error_analysis_df.loc[misclassified]

# Show the first ten misclassified samples.
print(error_analysis_df.head(10))

Many TRAVEL texts are misclassified as WEIRD NEWS, probably because the two categories share similarities in some features.

In [None]:
import joblib
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import torch.nn as nn
import torch.optim as optim


# Class names in label-id order, read from the data itself. The ids come from
# pd.factorize (order of first appearance in the raw file), so a hard-coded
# name list is only correct if it happens to match that order.
label_lookup = train_df.drop_duplicates('label').sort_values('label')
label_ids = label_lookup['label'].tolist()
target_names = label_lookup['category'].astype(str).tolist()

# Random forest model optimization: Increase the number of trees
optimized_rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
optimized_rf_model.fit(X_train, y_train)
optimized_rf_valid_preds = optimized_rf_model.predict(X_valid)
optimized_rf_valid_accuracy = accuracy_score(y_valid, optimized_rf_valid_preds)
print("Optimized Random Forest - Valid Accuracy:", optimized_rf_valid_accuracy)
# labels= keeps the report rows aligned with target_names even when a class
# is absent from the predictions.
print(classification_report(y_valid, optimized_rf_valid_preds, labels=label_ids,
                            target_names=target_names, zero_division=0))

# Save it
joblib.dump(optimized_rf_model, 'optimized_rf_model.pkl')

# Naive Bayesian model optimization: using smoothing parameters
optimized_nb_model = MultinomialNB(alpha=0.5)
optimized_nb_model.fit(X_train, y_train)
optimized_nb_valid_preds = optimized_nb_model.predict(X_valid)
optimized_nb_valid_accuracy = accuracy_score(y_valid, optimized_nb_valid_preds)
print("Optimized Naive Bayes - Valid Accuracy:", optimized_nb_valid_accuracy)
print(classification_report(y_valid, optimized_nb_valid_preds, labels=label_ids,
                            target_names=target_names, zero_division=0))

# Save the optimized naive Bayes model
joblib.dump(optimized_nb_model, 'optimized_nb_model.pkl')

# Deep learning model optimization: a different model structure.
class SimpleCNNModel(nn.Module):
    """Three-layer fully connected classifier with dropout.

    Despite its name, the network contains only ``nn.Linear`` layers:
    ``input_dim -> 256 -> 128 -> 3``, with ReLU and dropout (p=0.5) after each
    hidden layer. The output is raw class logits.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(128, 3)

    def forward(self, x):
        """Return the three class logits for each row of ``x``."""
        hidden = self.dropout1(torch.relu(self.fc1(x)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Build the new network with its own loss and optimizer; input_dim,
# train_loader, valid_loader and tqdm are taken from the earlier cells.
optimized_model = SimpleCNNModel(input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(optimized_model.parameters(), lr=0.001)

# Retrain the deep learning model
num_epochs = 10
# Per-epoch history, plotted at the end of this cell.
train_losses = []
valid_losses = []
train_accuracies = []
valid_accuracies = []

for epoch in range(num_epochs):
    # ---- training pass: train mode, weights updated after every batch ----
    optimized_model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()
        outputs = optimized_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # loss.item() is scaled by the batch size so that dividing by the
        # dataset size below weights every sample equally.
        running_loss += loss.item() * inputs.size(0)
        # The predicted class is the index of the largest logit.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = correct / total
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_accuracy)
    
    # ---- validation pass: eval mode, no gradient tracking, no updates ----
    optimized_model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(valid_loader, desc=f"Validating Epoch {epoch+1}/{num_epochs}"):
            outputs = optimized_model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    epoch_loss = running_loss / len(valid_loader.dataset)
    epoch_accuracy = correct / total
    valid_losses.append(epoch_loss)
    valid_accuracies.append(epoch_accuracy)
    
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Train Accuracy: {train_accuracies[-1]:.4f}, Valid Loss: {valid_losses[-1]:.4f}, Valid Accuracy: {valid_accuracies[-1]:.4f}")

# Save it (the weights after the final epoch)
torch.save(optimized_model.state_dict(), 'optimized_cnn_model.pth')

# Visual training process: loss on the left, accuracy on the right.
plt.figure(figsize=(14, 5))

plt.subplot(1, 2, 1)
plt.plot(range(1, num_epochs+1), train_losses, label='Train Loss')
plt.plot(range(1, num_epochs+1), valid_losses, label='Valid Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(1, num_epochs+1), train_accuracies, label='Train Accuracy')
plt.plot(range(1, num_epochs+1), valid_accuracies, label='Valid Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

plt.show()


#### Optimized Random Forest:
**Valid Accuracy: 0.865625**
By increasing the number of trees, the performance of the random forest model on the validation set is improved. The TRAVEL category had the highest accuracy, and the WEIRD NEWS category also improved.
#### Optimized Naive Bayes:
**Valid Accuracy: 0.78375**
By adjusting the smoothing parameter alpha, the performance of the model is improved, and the accuracy on the validation set is improved. Accuracy was higher in the TRAVEL category, but the WEIRD NEWS category still needed improvement.
#### Optimized CNN Model:
The accuracy and loss curves during training show how the model performs on the training set and the validation set. By simplifying the model structure (fully connected layers with dropout, trained for the same 10 epochs), the performance of the model on the validation set is further improved. The accuracy on the validation set is 0.8606, indicating that the model performance has improved.

In [None]:
from sklearn.model_selection import cross_val_score

# Stack the training and validation features and labels into one dense
# dataset for cross-validation.
X_combined = np.concatenate([X_train.toarray(), X_valid.toarray()], axis=0)
y_combined = np.concatenate([y_train, y_valid])

# Both classical models are scored with the same 5-fold accuracy settings.
cv_options = {'cv': 5, 'scoring': 'accuracy'}

# Random forest: reload the tuned estimator and cross-validate it.
optimized_rf_model = joblib.load('optimized_rf_model.pkl')
rf_cv_scores = cross_val_score(optimized_rf_model, X_combined, y_combined, **cv_options)
print("Random Forest Cross-Validation Accuracy:", rf_cv_scores)
print("Mean CV Accuracy:", rf_cv_scores.mean())

# Naive Bayes: same procedure.
optimized_nb_model = joblib.load('optimized_nb_model.pkl')
nb_cv_scores = cross_val_score(optimized_nb_model, X_combined, y_combined, **cv_options)
print("Naive Bayes Cross-Validation Accuracy:", nb_cv_scores)
print("Mean CV Accuracy:", nb_cv_scores.mean())

class SimpleCNNModel(nn.Module):
    """Three-layer fully connected classifier with dropout.

    Despite its name, the network contains only ``nn.Linear`` layers:
    ``input_dim -> 256 -> 128 -> 3``, with ReLU and dropout (p=0.5) after each
    hidden layer. The output is raw class logits.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(128, 3)

    def forward(self, x):
        """Return the three class logits for each row of ``x``."""
        hidden = self.dropout1(torch.relu(self.fc1(x)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Load the optimized CNN model
input_dim = X_train.shape[1]
optimized_model = SimpleCNNModel(input_dim)
optimized_model.load_state_dict(torch.load('optimized_cnn_model.pth'))

# Use stratified folds for cross-validation.
# cross_val_score(cv=5) stratifies the folds for classifiers, so the network
# must be scored on stratified folds too for the comparison to be fair. Plain
# unshuffled KFold over train-then-validation rows does not preserve the
# class ratios in each fold.
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5)
cnn_cv_scores = []
criterion = nn.CrossEntropyLoss()

for train_index, val_index in kf.split(X_combined, y_combined):
    X_train_kf, X_val_kf = X_combined[train_index], X_combined[val_index]
    y_train_kf, y_val_kf = y_combined[train_index], y_combined[val_index]

    train_dataset_kf = TensorDataset(torch.tensor(X_train_kf, dtype=torch.float32), torch.tensor(y_train_kf, dtype=torch.long))
    val_dataset_kf = TensorDataset(torch.tensor(X_val_kf, dtype=torch.float32), torch.tensor(y_val_kf, dtype=torch.long))

    train_loader_kf = DataLoader(train_dataset_kf, batch_size=64, shuffle=True)
    val_loader_kf = DataLoader(val_dataset_kf, batch_size=64, shuffle=False)

    # A fresh network per fold. Fold-specific names are used so the notebook's
    # global `model` and `optimizer` from earlier cells are not overwritten.
    fold_model = SimpleCNNModel(input_dim)
    fold_optimizer = optim.Adam(fold_model.parameters(), lr=0.001)

    # Each fold is trained for 5 epochs.
    for epoch in range(5):
        fold_model.train()
        for inputs, labels in train_loader_kf:
            fold_optimizer.zero_grad()
            outputs = fold_model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            fold_optimizer.step()

    # Accuracy on the held-out fold.
    fold_model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader_kf:
            outputs = fold_model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    cnn_cv_scores.append(accuracy)

print("CNN Cross-Validation Accuracy:", cnn_cv_scores)
print("Mean CV Accuracy:", np.mean(cnn_cv_scores))



Based on the cross-validation results, the CNN model is the best-performing model, followed by the random forest model. The CNN model achieves the highest average accuracy, which indicates that it is the most suitable model for this text classification task.

In [None]:
# Load test set
test_df = pd.read_csv('test.csv')

# Preprocess the test set text and vectorise it with the vocabulary fitted on
# the training data.
test_df['text'] = test_df['text'].apply(preprocess_text)
X_test = vectorizer.transform(test_df['text'])
y_test = test_df['label']

# Load the optimized CNN model
optimized_model = SimpleCNNModel(input_dim)
optimized_model.load_state_dict(torch.load('optimized_cnn_model.pth'))

# Build the label tensor from the underlying NumPy array, as the other cells
# do. Handing the pandas Series itself to torch.tensor depends on the Series
# having a default integer index.
test_dataset = TensorDataset(
    torch.tensor(X_test.toarray(), dtype=torch.float32),
    torch.tensor(y_test.values, dtype=torch.long),
)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Evaluate the optimized CNN model on the test set. A single pass collects
# both the accuracy counters and the per-sample predictions.
optimized_model.eval()
correct = 0
total = 0
y_test_pred = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = optimized_model(inputs)
        # The predicted class is the index of the largest logit.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        y_test_pred.extend(predicted.numpy())

test_accuracy = correct / total
print("Optimized CNN - Test Accuracy:", test_accuracy)

# labels= pins one report row per entry of target_names even if a class is
# missing from the test predictions; zero_division=0 reports 0 for such a
# class instead of emitting a warning.
print(classification_report(y_test, y_test_pred, labels=list(range(len(target_names))),
                            target_names=target_names, zero_division=0))



The accuracy of the optimized CNN model on the test set is 0.835, which is very close to the average cross-validation accuracy of 0.8573. This shows that the model generalizes well to unseen data.



Retrain the model and evaluate the test set using the combined training and validation sets

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import numpy as np

# Redefine the CNN model
class SimpleCNNModel(nn.Module):
    """Three-layer fully connected classifier with dropout.

    Despite its name, the network contains only ``nn.Linear`` layers:
    ``input_dim -> 256 -> 128 -> 3``, with ReLU and dropout (p=0.5) after each
    hidden layer. The dropout layers are kept here so the retrained network
    has the same architecture as the other ``SimpleCNNModel`` definitions in
    this notebook. The output is raw class logits.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(128, 3)

    def forward(self, x):
        """Return the three class logits for each row of ``x``."""
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

# Load the training set and the validation set
train_df = pd.read_csv('train.csv')
valid_df = pd.read_csv('valid.csv')
test_df = pd.read_csv('test.csv')

# Combine them
combined_df = pd.concat([train_df, valid_df])
# Apply the same cleaning used when the model was selected. preprocess_text
# also maps NaN cells (empty texts read back from CSV) to '', so no separate
# chained fillna(inplace=True) is needed.
combined_df['text'] = combined_df['text'].apply(preprocess_text)

# Class names in label-id order, read from the data itself. The ids come from
# pd.factorize (order of first appearance in the raw file), so a hard-coded
# name list is only correct if it happens to match that order.
label_lookup = combined_df.drop_duplicates('label').sort_values('label')
label_ids = label_lookup['label'].tolist()
label_names = label_lookup['category'].astype(str).tolist()

# Use TF-IDF for text representation (vocabulary refitted on train + valid)
vectorizer = TfidfVectorizer(max_features=5000)
X_combined = vectorizer.fit_transform(combined_df['text'])
y_combined = combined_df['label']

# Convert to tensor
X_combined_tensor = torch.tensor(X_combined.toarray(), dtype=torch.float32)
y_combined_tensor = torch.tensor(y_combined.values, dtype=torch.long)

# Data loader
combined_dataset = TensorDataset(X_combined_tensor, y_combined_tensor)
combined_loader = DataLoader(combined_dataset, batch_size=64, shuffle=True)

# Define the model, loss function, and optimizer
input_dim = X_combined.shape[1]
retrained_model = SimpleCNNModel(input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(retrained_model.parameters(), lr=0.001)

# Training model
num_epochs = 10
for epoch in range(num_epochs):
    retrained_model.train()
    for inputs, labels in tqdm(combined_loader, desc=f'Training Epoch {epoch+1}/{num_epochs}'):
        optimizer.zero_grad()
        outputs = retrained_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluate the retrained CNN model on the test set.
# The test text gets the same cleaning as the training text before it is
# vectorised with the refitted vocabulary.
test_df['text'] = test_df['text'].apply(preprocess_text)
X_test = vectorizer.transform(test_df['text'])
y_test = test_df['label']

X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

retrained_model.eval()
correct = 0
total = 0
y_test_pred = []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = retrained_model(inputs)
        # The predicted class is the index of the largest logit.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        y_test_pred.extend(predicted.numpy())

test_accuracy_retrained = correct / total
print("Retrained CNN - Test Accuracy:", test_accuracy_retrained)

# labels= keeps the rows aligned with label_names even if a class is missing
# from the predictions; zero_division=0 reports 0 for such a class instead of
# emitting a warning.
print(classification_report(y_test, y_test_pred, labels=label_ids,
                            target_names=label_names, zero_division=0))

torch.save(retrained_model.state_dict(), 'retrained_cnn_model.pth')



**The retrained CNN model performed well with an overall accuracy of 0.84 on the test set.**
For **TRAVEL**, the model performed well, with precision, recall and F1 scores of 0.89.
For **WEIRD NEWS**, both the precision and the recall of the model are 0.68.
For **OTHER**, due to the small sample size (only 4), the model did not successfully predict this category, resulting in precision, recall, and F1 scores of 0.00.