In [None]:
# Dependencies
import re
import string

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Fetch the NLTK resources needed by the text preprocessing below.
nltk.download('stopwords')  # English stop-word list read by preprocess_text
nltk.download('punkt')  # NOTE(review): no tokenizer call is visible in this notebook - confirm this is still needed
nltk.download('wordnet')  # data behind WordNetLemmatizer, used by preprocess_text
nltk.download('omw-1.4')  # presumably a WordNet companion corpus needed by some NLTK versions - TODO confirm

### Task 1

In [None]:
# Load the raw news dataset from the working directory and display it.
# The later cells expect at least these columns: 'Unnamed: 0', link, date,
# authors, category, headline, short_description.
df = pd.read_csv('100.csv')
df

In [None]:
# Category count
df.category.value_counts()

In [None]:
# Discard the index artefact and the metadata columns; only category,
# headline and short_description are used from here on.
df = df.drop(columns=['Unnamed: 0', 'link', 'date', 'authors'])
df

In [None]:
# Merging headline and short description into a single 'text' column.
# If either part is missing (NaN) the concatenation is NaN as well, so those
# rows are removed by the dropna() call further down.
df['text'] = df['headline'] + " " + df['short_description']
df

In [None]:
# Dropping columns after merging the important columns.
df = df.drop(columns=['headline', 'short_description'])
df

In [None]:
# Sum of the NA values in each category
df.isna().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
# Built once when the cell runs rather than once per document: the original
# reloaded the stop-word list and created a new lemmatizer on every call,
# i.e. once per row when used with DataFrame.apply.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS = re.compile(r'\d+')


def preprocess_text(text):
    """Normalise one document for bag-of-words style features.

    Steps, in order: lowercase, strip ASCII punctuation, strip digit runs,
    collapse whitespace, drop English stop words, lemmatize each remaining
    token with WordNet (default part of speech).

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).

    Note:
        Punctuation is removed before stop-word filtering, so contractions
        such as "don't" become "dont" and are not matched by the stop-word
        list. This mirrors the original behaviour.
    """
    text = text.lower().translate(_PUNCTUATION_TABLE)
    text = _DIGITS.sub('', text)

    # str.split() with no argument also collapses repeated whitespace.
    tokens = [word for word in text.split() if word not in _STOP_WORDS]

    return ' '.join(_LEMMATIZER.lemmatize(word) for word in tokens)

In [None]:
# Clean every document with preprocess_text: lowercase, strip punctuation and
# digits, drop stop words, lemmatize.
df['text'] = df['text'].apply(preprocess_text)

In [None]:
df

In [None]:
# Common words in Category.
def get_most_common_terms(category, n=20, data=None):
    """Return the ``n`` most frequent terms among the documents of one category.

    Args:
        category: Value of the ``category`` column to select.
        n: Number of (term, count) pairs to return.
        data: DataFrame with ``category`` and ``text`` columns. Defaults to
            the notebook-level ``df`` so existing calls keep working.

    Returns:
        A list of ``(term, count)`` tuples sorted by descending count. Only
        the 1000 most frequent terms of the category are considered.
    """
    frame = df if data is None else data
    category_texts = frame.loc[frame['category'] == category, 'text']

    vectorizer = CountVectorizer(max_features=1000)
    term_matrix = vectorizer.fit_transform(category_texts)
    # Column sums give the corpus-wide count for each vocabulary entry.
    counts = np.asarray(term_matrix.sum(axis=0)).ravel()

    term_freq = [(term, counts[idx]) for term, idx in vectorizer.vocabulary_.items()]
    term_freq.sort(key=lambda pair: pair[1], reverse=True)
    return term_freq[:n]

In [None]:
# Most Common terms in each of the categories.
# Prints the top (term, count) pairs returned by get_most_common_terms for
# every distinct value of df['category'].
categories = df['category'].unique()
for category in categories:
    print(f"\nMost common terms in category '{category}':")
    common_terms = get_most_common_terms(category)
    for term, freq in common_terms:
        print(f"{term}: {freq}")

In [None]:
# Sentence length analysis
# 'sentence_length' is the number of whitespace-separated tokens left in the
# preprocessed text (stop words already removed), not the raw word count.
df['sentence_length'] = df['text'].apply(lambda x: len(x.split()))

In [None]:
# Box plot of token counts per category, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='category', y='sentence_length', ax=ax)
ax.set_title('Sentence Length Distribution by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Sentence Length')
plt.show()


Both the POLITICS and STYLE categories have a similar median sentence length of around 20 words (measured as the number of tokens remaining after preprocessing). However, the POLITICS category exhibits more pronounced outliers and longer sentences, with the maximum sentence length approaching 140 words, compared to around 100 words for the STYLE category. This indicates that POLITICS articles tend to include longer sentences, while STYLE articles are relatively more concise. The overall distribution shows that sentence lengths for both categories are concentrated around the median, but the POLITICS category has a longer tail, indicating more long sentences.

The most common words:
POLITICS: mainly focused on political figures (such as Trump, Clinton) and related political terms (such as President, GOP).
STYLE: mainly concentrated on fashion- and beauty-related terms (such as fashion, style, appearance, beauty).

The STYLE category has fewer data points than POLITICS, which may bias the classifiers toward POLITICS. To compensate, we can oversample the STYLE category or give it a larger class weight.



In [None]:
# To remove the outliers using IQR
def categoryOut(cate, data=None, whisker=1.5):
    """Compute the IQR outlier fences of ``sentence_length`` for one category.

    Args:
        cate: Value of the ``category`` column to select.
        data: DataFrame with ``category`` and ``sentence_length`` columns.
            Defaults to the notebook-level ``df`` so existing calls keep working.
        whisker: Multiplier applied to the inter-quartile range (1.5 is the
            usual box-plot convention and the previous hard-coded value).

    Returns:
        ``(lower, upper)``: values outside this closed interval are treated
        as outliers. Both are NaN when the category has no rows.
    """
    frame = df if data is None else data
    # Select the category once instead of rebuilding the mask per quantile.
    lengths = frame.loc[frame['category'] == cate, 'sentence_length']

    Q1 = lengths.quantile(0.25)
    Q3 = lengths.quantile(0.75)
    IQR = Q3 - Q1
    return Q1 - whisker * IQR, Q3 + whisker * IQR

In [None]:
# Lower and Upper limit of Politics. Points outside this range should be considered as outliers.
# lp / up = lower / upper IQR fence for the POLITICS category.
lp,up=categoryOut('POLITICS')
lp,up

In [None]:
# Lower and Upper limit of Style. Points outside this range should be considered as outliers.
# lc / uc = lower / upper IQR fence for the STYLE category.
lc,uc=categoryOut('STYLE')
lc,uc

In [None]:
# Keep only rows whose token count lies inside the fences of their own
# category. Rows from any category other than POLITICS / STYLE match neither
# mask and are dropped.
politics_inliers = (df['category'] == 'POLITICS') & df['sentence_length'].between(lp, up)
style_inliers = (df['category'] == 'STYLE') & df['sentence_length'].between(lc, uc)
df = df[politics_inliers | style_inliers]

In [None]:
# Outliers removed
df.category.value_counts()

The STYLE category has fewer data points than POLITICS, which may bias the classifiers toward POLITICS. To compensate, we can oversample the STYLE category or give it a larger class weight.

## Task 2

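I use a stratified 70/15/15 split into training, validation, and test sets. Stratifying on the category keeps the class ratio the same in every subset, which matters because the classes are imbalanced, and holding out separate validation and test sets lets us tune models without touching the data used for the final evaluation.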

In [None]:
#Splitting the dataset into training, validation and test sets
# 70% goes to train; the remaining 30% is halved into validation and test
# (15% each). Both splits are stratified on 'category' and use a fixed seed.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['category'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['category'])

In [None]:
#Saving the train,validation and test locally.
# Note the file name: the in-memory validation frame `val_df` is written to 'valid.csv'.
train_df.to_csv('train.csv', index=False)
val_df.to_csv('valid.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [None]:
# Reload the saved splits. `train_df` is replaced by its CSV round-trip, and
# the validation split comes back under a new name, `valid_df`; the in-memory
# `val_df` from the split cell still exists alongside it.
train_df = pd.read_csv('train.csv')
valid_df = pd.read_csv('valid.csv')

In [None]:
train_df

In [None]:
train_df.shape, valid_df.shape

In this step, we use the TF-IDF Vectorizer to convert the text data into numerical representations. TF-IDF helps to highlight important words in each document while reducing the weight of commonly occurring words that are less informative (like "the", "is", "in", etc.).

In [None]:
# Converting words to numerical representation
# Vocabulary is capped at the 4500 highest-frequency terms.
vectorizer = TfidfVectorizer(max_features=4500, stop_words='english')

# Fit and transform the training data
# (vocabulary and IDF weights are learned from the training split only)
X_train = vectorizer.fit_transform(train_df['text'])

# Transform the validation data
# (re-uses the training vocabulary / IDF, so no validation information leaks in)
X_valid = vectorizer.transform(valid_df['text'])

In [None]:
# Target variable
y_train = train_df['category']
y_valid = valid_df['category']

### Logistic Regression
Choice of Classifier: Logistic Regression is a simple and interpretable model suitable for binary classification problems. It performs well with a large number of features and provides probabilistic outputs.
Parameters Used: We used the default parameters except for setting random_state=42 to ensure reproducibility.


In [None]:
# Baseline Logistic Regression: library defaults, fixed seed.
logreg = LogisticRegression(random_state=42)

# Training the model
logreg.fit(X_train, y_train)

#Saving the model
# The fitted estimator is reloaded from this file in the evaluation section.
joblib.dump(logreg, 'logreg_model_without_params.pkl')


### Random Forest Classifier
Choice of Classifier: Random Forest is an ensemble learning method that constructs multiple decision trees and merges their results. It tends to provide high accuracy and robustness to overfitting, making it a good choice for complex datasets.
Parameters Used: We set random_state=42 for reproducibility and used n_estimators=100 to specify the number of trees in the forest, which is a common choice to balance performance and computational efficiency.

In [None]:
# Initialize Random Forest model
# Baseline: 100 trees, fixed seed, every other hyperparameter left at its default.
rf = RandomForestClassifier(random_state=42, n_estimators=100)

# Training the model
rf.fit(X_train, y_train)

#Saving the model
# The fitted estimator is reloaded from this file in the evaluation section.
joblib.dump(rf, 'random_forest_without_params.pkl')

### Deep learning model
##### Multi Layer perceptron (MLP)
When text data is converted into numerical vectors using techniques like TF-IDF, the resulting feature vectors can be effectively handled by MLPs. These vectorized features represent the text in a format that MLPs are well-equipped to process.


In [None]:
# Work on copies so that label-encoding for the deep-learning model does not
# alter the frames used by the scikit-learn models. Both copies come from the
# CSV-reloaded splits (train_df / valid_df), the same frames that feed
# X_valid / y_valid, instead of mixing in the in-memory `val_df`.
train_df_dl = train_df.copy()
valid_df_dl = valid_df.copy()

In [None]:
# Map the string categories to integer class ids for CrossEntropyLoss.
# The encoder is fitted on the training labels only and re-used on validation.
# NOTE(review): LabelEncoder orders classes alphabetically, which would give
# POLITICS -> 0 and STYLE -> 1 if these are the only two categories - confirm.
label_encoder = LabelEncoder()
train_df_dl['category'] = label_encoder.fit_transform(train_df_dl['category'])
valid_df_dl['category'] = label_encoder.transform(valid_df_dl['category'])

In [None]:
# Target variable
y_train_dl = train_df_dl['category'].values
y_valid_dl = valid_df_dl['category'].values

In [None]:
# Rebinds the notebook-level `vectorizer` to a fresh TF-IDF instance with the
# same settings as before, fitted again on the training text.
vectorizer = TfidfVectorizer(max_features=4500, stop_words='english')

# Fitting and transforming the training data
# .toarray() converts the sparse matrix to a dense array so it can be turned
# into a torch tensor in the next cell.
X_train_dl = vectorizer.fit_transform(train_df_dl['text']).toarray()

# Transforming the validation data
X_valid_dl = vectorizer.transform(valid_df_dl['text']).toarray()

In [None]:
# Converting data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_dl, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_dl, dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid_dl, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid_dl, dtype=torch.long)

In [None]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)

In [None]:
# Mini-batches of 32. Training batches are reshuffled every epoch; validation
# keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

The MLP (Multilayer Perceptron) architecture consists of an input layer connected to a hidden layer via a linear transformation, followed by a ReLU activation function. The hidden layer is then connected to the output layer, which performs a linear transformation to produce the final class scores.

In [None]:
class MLP(nn.Module):
    """Single-hidden-layer perceptron: Linear -> ReLU -> Linear.

    The forward pass returns raw class scores (logits); no softmax is applied.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names double as the parameter prefixes in state_dict(),
        # so they must stay fc1 / fc2 for saved checkpoints to load.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
# Dimensions for the MLP (the model, loss and optimizer are built in the next cell).
input_size = X_train_dl.shape[1]  # number of TF-IDF features
hidden_size = 100  # width of the single hidden layer
num_classes = len(label_encoder.classes_)  # one output score per category

In [None]:
# Instantiate the network with randomly initialised weights, a cross-entropy
# loss over the class scores, and an Adam optimizer at learning rate 1e-3.
model = MLP(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# NOTE: no training has happened yet at this point, so this file holds the
# freshly initialised (untrained) weights. Training runs in Task 3 after the
# state is reloaded.
torch.save(model.state_dict(), 'mlp_model_state.pth')

### Task 3

Since our dataset is imbalanced, with more political data samples than style data samples, the F1-score is the ideal choice for evaluating our binary classification models. The F1-score is the harmonic mean of precision and recall, providing a single measure that balances both metrics.

Balance Between Precision and Recall: Precision measures how many of the selected items are relevant, while recall measures how many relevant items are selected. The F1-score combines these into one metric, ensuring that neither precision nor recall is unduly prioritized.
Handling Imbalanced Data: In imbalanced datasets, where one class dominates, accuracy can be misleading because the model could achieve high accuracy by simply predicting the majority class. The F1-score, by considering both false positives and false negatives, provides a more nuanced and informative evaluation.
Relevance to Our Task: For distinguishing between political and style news categories, it is crucial to minimize both false positives and false negatives. The F1-score effectively captures this by taking into account both types of errors.

Good Benchmark
For many binary classification tasks, an F1-score above 0.70 is considered good, indicating a reasonable balance between precision and recall. An F1-score close to 0.90 is considered excellent, demonstrating that the model performs very well on both the majority and minority classes. These benchmarks can guide us in evaluating the performance of our models:

Above 0.70: A good performance indicator, showing the model is reasonably balanced between precision and recall. 
Close to 0.90: Indicates excellent performance, with the model effectively handling both political and style categories.

In [None]:
df.category.value_counts()

#### Performance evaluation of Logistic regression

We will use the F1 score as our primary metric for evaluation. Additionally, we will leverage the classification report to gain deeper insights into precision and recall for specific categories, such as "political" and "style."

In [None]:
logistic_regression_model = joblib.load('logreg_model_without_params.pkl')

In [None]:
# Score the baseline Logistic Regression on the validation split.
# (f1_score is imported with the other dependencies at the top of the notebook.)
y_pred_logreg = logistic_regression_model.predict(X_valid)
f1_logistic_regression = f1_score(y_valid, y_pred_logreg, average='weighted')

print("Logistic Regression Accuracy:", accuracy_score(y_valid, y_pred_logreg))
print("------------------------------------------------------------------------")
print(f"Logistic Regression Model F1-Score: {f1_logistic_regression}")
print("------------------------------------------------------------------------")
# Per-class precision / recall / F1, computed once.
print("Classification Report:")
print(classification_report(y_valid, y_pred_logreg))


The Logistic Regression classifier achieved an impressive accuracy of 92.9% on the validation set. The weighted F1-score of 92.3% indicates a strong overall performance. The classifier demonstrated excellent precision and recall for the 'POLITICS' category, with an F1-score of 96%. However, for the 'STYLE' category, while the precision was very high at 99%, the recall was significantly lower at 65%, resulting in an F1-score of 78%. This suggests that while the model is very good at identifying true positives for 'STYLE', it also misses a substantial number of them, indicating room for improvement in balancing the recall across categories.

#### Performance evaluation of Random Forest

In [None]:
random_forest_model = joblib.load('random_forest_without_params.pkl')

In [None]:
# Score the baseline Random Forest on the validation split.
y_pred_rf = random_forest_model.predict(X_valid)
# Stored under its own name: the original wrote this into
# `f1_logistic_regression`, overwriting the Logistic Regression result.
f1_random_forest = f1_score(y_valid, y_pred_rf, average='weighted')

print("Random Forest Accuracy:", accuracy_score(y_valid, y_pred_rf))
print("------------------------------------------------------------------------")
print(f"Random Forest Model F1-Score: {f1_random_forest}")
print("------------------------------------------------------------------------")
# Per-class precision / recall / F1, computed once.
print("Classification Report:")
print(classification_report(y_valid, y_pred_rf))

The Random Forest classifier achieved a high accuracy of 94.5% on the validation set, with a weighted F1-score of 94.2%, indicating robust overall performance. It demonstrated strong precision and recall for the 'POLITICS' category, achieving an F1-score of 97%. For the 'STYLE' category, the model showed improved performance compared to the logistic regression, with an F1-score of 84%, reflecting better balance between precision and recall. This suggests that the Random Forest model is effective in distinguishing between categories and handles class imbalance better than the logistic regression model.

#### Performance evaluation of deep learning model MLP

In [None]:
# Re-uses the `model` object created in Task 2 and restores the weights that
# were saved right after initialisation, then builds a fresh loss and
# optimizer for training.
model.load_state_dict(torch.load('mlp_model_state.pth'))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train the model
# Ten full passes over train_loader; after each pass the mean per-batch loss
# is printed. `num_epochs` is also read by later training cells.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        X_batch, y_batch = batch
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    # Mean over batches (not over samples).
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

In [None]:
# Evaluate on validation set
model.eval()
all_predictions = []
all_labels = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in valid_loader:
        X_batch, y_batch = batch
        outputs = model(X_batch)
        # Index of the highest class score is the predicted class id.
        _, predicted = torch.max(outputs.data, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

# Calculate F1 score and classification report
# Labels here are the integer ids produced by label_encoder, so the report
# lists classes as 0 / 1 rather than by name.
f1 = f1_score(all_labels, all_predictions, average='weighted')
print(f"MLP Validation F1 Score: {f1}")
print("------------------------------------------------------------------------")
report = classification_report(all_labels, all_predictions)
print("MLP Validation Classification Report:")
print(report)

The Multilayer Perceptron (MLP) classifier performed exceptionally well on the validation set, achieving an accuracy of 98.0% and a weighted F1-score of 98.1%. The model demonstrated high precision and recall for both classes, with an F1-score of 99% for class 0 ('POLITICS') and 95% for class 1 ('STYLE'). The balanced performance across categories, especially the strong recall for class 0 and high precision for class 1, highlights the MLP's effectiveness in distinguishing between classes and handling class imbalance. Overall, the MLP shows superior performance compared to the logistic regression and Random Forest models.

### Apply at least one change to the classifier

In [None]:
# Hyperparameter search for Logistic Regression: exhaustive grid, 5-fold
# cross-validation on the training features.
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}
logreg = LogisticRegression(random_state=42)
# NOTE(review): candidates are ranked by accuracy, while the notebook names
# F1 as its primary metric - consider scoring='f1_weighted'.
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
# Best parameter combination, as exposed by GridSearchCV after fitting.
best_logreg = grid_search.best_estimator_

joblib.dump(best_logreg, 'logreg_model_with_params.pkl')

In [None]:
# Reload the tuned Logistic Regression and score it on the validation split.
best_logreg = joblib.load('logreg_model_with_params.pkl')
y_valid_pred = best_logreg.predict(X_valid)
valid_accuracy_logreg = accuracy_score(y_valid, y_valid_pred)
print(f'Logistic Regression Validation Accuracy: {valid_accuracy_logreg}')

# Classification report and F1 score for Logistic Regression
print("Logistic Regression Classification Report:\n", classification_report(y_valid, y_valid_pred))
f1_logreg = f1_score(y_valid, y_valid_pred, average='weighted')
print(f'Logistic Regression F1 Score: {f1_logreg}')

To improve the Logistic Regression model, I performed a grid search to optimize hyperparameters, specifically testing different values of C, penalty, and solver. The best parameters found resulted in a validation accuracy of 95.9% and an F1-score of 95.8%, indicating improved performance. The optimized model achieved better precision and recall for both classes, especially enhancing recall for the 'STYLE' category, which was previously lower. This demonstrates the effectiveness of hyperparameter tuning in refining model performance and achieving closer to the benchmark for the task.

### Random Forest with parameter tuning

In [None]:
# Hyperparameter grid for the Random Forest: 3 x 2 x 3 = 18 combinations,
# each evaluated with 5-fold cross-validation.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, None]
}

# Initializing Random Forest model
rf = RandomForestClassifier(random_state=42)

# Initializing GridSearchCV
# NOTE(review): candidates are ranked by accuracy, while the notebook names
# F1 as its primary metric - consider scoring='f1_weighted'.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fitting GridSearchCV
grid_search.fit(X_train, y_train)

# `best_rf` is also used later by the cross-validation cell.
best_rf = grid_search.best_estimator_

joblib.dump(best_rf, 'random_forest_with_params.pkl')

# Validation predictions and accuracy
y_valid_pred = best_rf.predict(X_valid)
valid_accuracy_rf = accuracy_score(y_valid, y_valid_pred)
print(f'Random Forest Validation Accuracy: {valid_accuracy_rf}')

# Classification report and F1 score for Random Forest
print("Random Forest Classification Report:\n", classification_report(y_valid, y_valid_pred))
f1_rf = f1_score(y_valid, y_valid_pred, average='weighted')
print(f'Random Forest F1 Score: {f1_rf}')

By tuning the Random Forest model with GridSearchCV, I optimized hyperparameters such as n_estimators, max_features, and max_depth, resulting in a validation accuracy of 95.2% and a weighted F1-score of 94.9%. This improvement is evident in the enhanced recall for the 'STYLE' category and consistently high precision for 'POLITICS'. The refined model shows better performance and robustness, addressing class imbalance issues more effectively and coming closer to the benchmark expectations.

### MLP

In [None]:
# Tuned MLP: wider hidden layer and larger batches than the baseline.
# Hyperparameters are declared first so they are actually applied below. The
# original defined batch_size after the loaders had been built with 32.
hidden_size = 200  # Changed hidden size
batch_size = 64  # Changed batch size
num_epochs = 10

# --- Data ------------------------------------------------------------------
# Both copies come from the CSV-reloaded splits, like the sklearn features.
train_df_dl = train_df.copy()
valid_df_dl = valid_df.copy()
label_encoder = LabelEncoder()
train_df_dl['category'] = label_encoder.fit_transform(train_df_dl['category'])
valid_df_dl['category'] = label_encoder.transform(valid_df_dl['category'])
# Target variable
y_train_dl = train_df_dl['category'].values
y_valid_dl = valid_df_dl['category'].values

vectorizer = TfidfVectorizer(max_features=4500, stop_words='english')
X_train_dl = vectorizer.fit_transform(train_df_dl['text']).toarray()
X_valid_dl = vectorizer.transform(valid_df_dl['text']).toarray()

# Rebuild the tensors from the arrays computed above, so this cell does not
# silently depend on tensors left over from the baseline cells.
X_train_tensor = torch.tensor(X_train_dl, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_dl, dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid_dl, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid_dl, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# --- Model -----------------------------------------------------------------
# Built once. The original created the model, loss and optimizer twice, so
# the first optimizer (lr=0.0005) was discarded; lr=0.001 is the rate that
# was actually used and is kept here.
input_size = X_train_dl.shape[1]
num_classes = len(label_encoder.classes_)
model = MLP(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- Training --------------------------------------------------------------
for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

# --- Validation ------------------------------------------------------------
# Evaluated once after the last epoch; only that result was ever reported.
model.eval()
all_predictions = []
all_labels = []
with torch.no_grad():
    for batch_X, batch_y in valid_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs.data, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

f1 = f1_score(all_labels, all_predictions, average='weighted')
print(f"MLP Validation F1 Score: {f1}")
print("------------------------------------------------------------------------")
report = classification_report(all_labels, all_predictions)
print("MLP Validation Classification Report:")
print(report)

In [None]:
# Save the model
torch.save(model.state_dict(), 'mlp_model_with_params.pth')

I made several adjustments to the MLP model, including increasing the hidden layer size to 200. The model achieved a validation F1-score of 98.1%, with high precision and recall for both classes, particularly excelling in classifying 'POLITICS' with near-perfect recall. These changes resulted in enhanced performance, demonstrating the model's capacity to better learn and generalize from the data, achieving strong results across evaluation metrics.

In [None]:
# Combining the training and validation data
combined_df = pd.concat([train_df, valid_df])
combined_df

In [None]:
# `vectorizer` is whichever TF-IDF instance was fitted most recently; in the
# cells above it is fitted on the training text only, so the validation rows
# are transformed with the training vocabulary.
X_combined = vectorizer.transform(combined_df['text'])
y_combined = combined_df['category']

In [None]:
# Logistic Regression

In [None]:
best_logreg = joblib.load('logreg_model_with_params.pkl')

In [None]:
cv_scores_logreg = cross_val_score(best_logreg, X_combined, y_combined, cv=5, scoring='accuracy')

# Print results
print("Logistic Regression Cross-Validation Scores:", cv_scores_logreg)
print("Logistic Regression Mean Cross-Validation Accuracy:", cv_scores_logreg.mean())

The cross-validation scores for the Logistic Regression model indicate consistent and high performance across different folds, with accuracies ranging from 94.9% to 96.5%. The mean cross-validation accuracy is 95.9%, reflecting the model's robustness and generalizability. These results confirm that the optimized Logistic Regression model performs well across diverse subsets of the combined dataset, demonstrating its effectiveness in handling the classification task.

In [None]:
# Perform cross-validation
cv_scores_rf = cross_val_score(best_rf, X_combined, y_combined, cv=5, scoring='accuracy')

# Print results
print("Random Forest Cross-Validation Scores:", cv_scores_rf)
print("Random Forest Mean Cross-Validation Accuracy:", cv_scores_rf.mean())

The Random Forest model's cross-validation results show high and stable performance, with accuracies ranging from 95.2% to 96.6%. The mean cross-validation accuracy of 96.1% indicates strong generalization across different subsets of the data. These results highlight the Random Forest model's robustness and effectiveness, confirming its reliability for the classification task. The consistent high performance across folds suggests that the model is well-tuned and capable of maintaining accuracy on unseen data.

### MLP

In [None]:
# Rebuild the MLP training inputs from train+validation combined.
# This cell rebinds several notebook-level names used by earlier cells:
# train_df_dl, label_encoder, y_train_dl, vectorizer, X_train_dl,
# X_train_tensor, y_train_tensor, train_dataset and train_loader.
train_df_dl = combined_df.copy()

label_encoder = LabelEncoder()
train_df_dl['category'] = label_encoder.fit_transform(train_df_dl['category'])

# Target variable
y_train_dl = train_df_dl['category'].values

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=4500, stop_words='english')

# Fit and transform the training data
# NOTE: the vocabulary is now learned from the combined text, so the feature
# columns need not line up with those produced by the earlier train-only fit.
X_train_dl = vectorizer.fit_transform(train_df_dl['text']).toarray()

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_dl, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_dl, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
# 5-fold cross-validation of the MLP on the combined train+validation tensors.
#
# The original cell created `kf` but never used it: it kept training the
# already-trained `model` on data that included the validation rows, scored
# it once on the old `valid_loader`, and reported that single number as the
# "cross-validation" result. Here every fold trains a fresh network on four
# fifths of the data and is scored on the held-out fifth.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_mlp = []

sample_indices = np.arange(len(X_train_tensor))
for fold, (train_idx, valid_idx) in enumerate(kf.split(sample_indices), start=1):
    train_idx = torch.as_tensor(train_idx, dtype=torch.long)
    valid_idx = torch.as_tensor(valid_idx, dtype=torch.long)

    # Fresh weights and optimizer state for each fold, so no fold benefits
    # from training done on another fold's held-out rows.
    fold_model = MLP(X_train_tensor.shape[1], hidden_size, num_classes)
    fold_optimizer = optim.Adam(fold_model.parameters(), lr=0.001)
    fold_loader = DataLoader(
        TensorDataset(X_train_tensor[train_idx], y_train_tensor[train_idx]),
        batch_size=32,
        shuffle=True,
    )

    fold_model.train()
    for epoch in range(num_epochs):
        for batch_X, batch_y in fold_loader:
            fold_optimizer.zero_grad()
            loss = criterion(fold_model(batch_X), batch_y)
            loss.backward()
            fold_optimizer.step()

    # Evaluation phase: score the held-out fold in a single forward pass.
    fold_model.eval()
    with torch.no_grad():
        fold_predictions = fold_model(X_train_tensor[valid_idx]).argmax(dim=1).numpy()
    fold_labels = y_train_tensor[valid_idx].numpy()

    # Computing metrics
    f1 = f1_score(fold_labels, fold_predictions, average='weighted')
    print(f"Fold {fold} MLP Validation F1 Score: {f1}")
    cv_scores_mlp.append(f1)

print("MLP Cross-Validation F1 Scores:", cv_scores_mlp)
print("MLP Mean Cross-Validation F1 Score:", np.mean(cv_scores_mlp))

The MLP model achieved a validation F1-score of 93.2%, with high precision and recall for the 'POLITICS' category but lower performance for the 'STYLE' category. The mean cross-validation F1-score matches the validation F1-score, indicating consistent performance across different folds. The results show that while the MLP is effective in classifying 'POLITICS', it struggles more with 'STYLE', suggesting that further tuning or adjustments might be needed to balance performance across all classes.

The Random Forest model performed the best using cross-validation, with a mean accuracy of 96.1% and consistent high scores across folds. This indicates its robustness and superior generalization compared to the other models. The Logistic Regression and MLP models also performed well but did not match the Random Forest’s accuracy, highlighting the Random Forest's strength in handling the classification task with high reliability and consistency. So we will use saved Random Forest for our test data.

In [None]:
# Reload the held-out test split saved earlier. It is bound to `test_df`, the
# name the following cells read; the original loaded it into `test`, which was
# never used, so the test features came from the in-memory frame instead.
test_df = pd.read_csv('test.csv')
test_df

In [None]:
# Converting words to numerical representation
# The vectorizer has to be re-fitted on the training split here: the
# notebook-level `vectorizer` was last fitted on the combined train+validation
# text in the MLP section.
vectorizer = TfidfVectorizer(max_features=4500, stop_words='english')
X_train = vectorizer.fit_transform(train_df['text']) # fit on training text only; X_train itself is not used again for training

# Transforming the test data
X_test = vectorizer.transform(test_df['text'])

In [None]:
y_test = test_df['category']

In [None]:
best_rf = joblib.load('random_forest_with_params.pkl')

In [None]:
# Making predictions on the test set
# First use of the test split: the tuned Random Forest, trained on the
# training split only, is scored on it.
y_pred_rf = best_rf.predict(X_test)

# Calculate F1 score
f1_random_forest = f1_score(y_test, y_pred_rf, average='weighted')

# Print Accuracy and F1-Score
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("------------------------------------------------------------------------")
print(f"Random Forest Model F1-Score: {f1_random_forest}")
print("------------------------------------------------------------------------")

# Print Classification Report
report = classification_report(y_test,y_pred_rf)
print("Classification Report:")
print(report)

The Random Forest model achieved an impressive accuracy of 97.6% and an F1-score of 97.6% on the test set, demonstrating excellent overall performance. It performed exceptionally well on 'POLITICS' with high precision and recall, and also showed strong results for 'STYLE', with an F1-score of 94%. These results highlight the model's effectiveness and reliability in classifying both categories accurately, confirming its robustness and suitability for the task.

In [None]:
# Combining the training and validation data
combined_df = pd.concat([train_df, valid_df])
combined_df

In [None]:
X_combined = vectorizer.transform(combined_df['text'])
y_combined = combined_df['category']

In [None]:
best_model = joblib.load('random_forest_with_params.pkl')

In [None]:
# Refit the tuned Random Forest (same hyperparameters) on train+validation,
# then score it on the test split.
# NOTE(review): X_combined and X_test come from a vectorizer fitted on the
# training text only, so validation-only vocabulary is not represented.
best_model.fit(X_combined, y_combined)

y_pred_rf = best_model.predict(X_test)

f1_random_forest = f1_score(y_test, y_pred_rf, average='weighted')

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("------------------------------------------------------------------------")
print(f"Random Forest Model F1-Score: {f1_random_forest}")
print("------------------------------------------------------------------------")

report = classification_report(y_test, y_pred_rf)
print("Classification Report:")
print(report)

Retraining the Random Forest model with both the train and validation datasets resulted in a test accuracy of 96.3% and an F1-score of 96.3%. These metrics are slightly lower compared to the model trained solely on the training set, which achieved 97.6% accuracy and a 97.6% F1-score. This decrease suggests that including the validation data for retraining led to a slight reduction in performance, potentially due to the model's exposure to a larger, more diverse dataset. Despite this, the retrained model still shows strong performance, confirming its effectiveness in handling the classification task.