# Data Preparation
This section contains the functions that I use to prepare the data.

In [None]:
# Importing necessary libraries for data manipulation, machine learning, and visualization
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import confusion_matrix



In [None]:
# Ensure all necessary NLTK resources are available before any text is processed.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')      # tokenizer data; word_tokenize is called in clean_text()
nltk.download('wordnet')    # WordNet data; WordNetLemmatizer is imported but not used in the visible cells
nltk.download('stopwords')  # stopword lists read through stopwords.words('english')

In [None]:
# Stopword set: NLTK's English list extended with extra hand-picked words.
# NOTE(review): clean_text() tests individual tokens against this set, so the
# two-word entry 'year old' looks like it can never match — confirm and drop it if so.
# NOTE(review): clean_text() also discards tokens of three characters or fewer, which
# appears to make the short entries here ('new', 'old', 'one', ...) redundant.
default_stopwords = set(stopwords.words('english')) | {
    'said', 'would', 'even', 'according', 'could', 'year', 'years', 'also', 'new', 'people', 
    'old', 'one', 'two', 'time', 'first', 'last', 'say', 'make', 'best', 'get', 'three', 
    'year old', 'told', 'made', 'like', 'take', 'many', 'set', 'number', 'month', 'week', 
    'well', 'back'
}

# Porter stemmer instance used by clean_text() to stem every kept token.
stemmer = PorterStemmer()


In [None]:
def clean_text(text):
    """Normalize one document into a space-separated string of stemmed tokens.

    Steps: lowercase the text, replace every character that is not a-z or
    whitespace with a space, tokenize, drop stopwords and tokens of three
    characters or fewer, then Porter-stem what is left.

    Args:
        text: Raw document text. Non-string values (for example None, or the
            float NaN pandas uses for missing cells) are treated as an empty
            document instead of raising.

    Returns:
        str: The cleaned tokens joined by single spaces; '' when nothing survives.
    """
    # Missing cells arrive as NaN/None, which have no .lower(); short-circuit so a
    # single bad row does not abort a whole DataFrame.apply() call.
    if not isinstance(text, str):
        return ''

    # Normalize: lowercase, then blank out digits, punctuation and any non a-z character.
    text = re.sub(r'[^a-z\s]', ' ', text.lower())

    # Stopwords are matched on the unstemmed token, so filtering happens before stemming.
    tokens = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if len(word) > 3 and word not in default_stopwords
    ]

    return ' '.join(tokens)

In [None]:
def load_and_preprocess_data(file_path):
    """Load the news CSV and build one cleaned text column per article.

    Args:
        file_path: Path (or any source accepted by pandas.read_csv) of a CSV with
            the columns 'title', 'content', 'category_level_1' and
            'category_level_2'.

    Returns:
        pandas.DataFrame: A new frame with the columns 'text' (title and content
        joined by a space, then passed through clean_text), 'category_level_1'
        and 'category_level_2'.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(file_path)

    # A missing title or content would turn the concatenation into NaN, which
    # clean_text cannot lowercase; use an empty string for the missing part so
    # the other half of the article is still kept.
    title = df['title'].fillna('').astype(str)
    content = df['content'].fillna('').astype(str)
    df['text'] = (title + ' ' + content).apply(clean_text)

    # Keep only the cleaned text as the feature, plus the two label columns.
    # .copy() returns an independent frame rather than a slice of the raw one.
    return df[['text', 'category_level_1', 'category_level_2']].copy()

In [None]:
def tfidf_vectorize(texts, max_features=30000, ngram_range=(1, 3), min_df=3,
                    max_df=0.7, return_vectorizer=False, **vectorizer_kwargs):
    """Fit a TF-IDF vectorizer on `texts` and return the document-term matrix.

    The defaults reproduce the configuration this notebook settled on. Other
    configurations can be tried without editing this function, e.g.
    ``tfidf_vectorize(texts, max_features=50000, ngram_range=(1, 2), min_df=1,
    max_df=1.0, strip_accents='unicode', analyzer='word', norm='l2')``.

    Args:
        texts: Iterable of cleaned document strings.
        max_features: Upper bound on vocabulary size (reduce if overfitting).
        ngram_range: (min_n, max_n) n-gram sizes; the default includes trigrams.
        min_df: Minimum document frequency for a term to be kept.
        max_df: Maximum proportion of documents that may contain a term.
        return_vectorizer: When True, also return the fitted vectorizer so unseen
            text can be transformed with the same vocabulary.
        **vectorizer_kwargs: Extra keyword arguments passed to TfidfVectorizer.

    Returns:
        The matrix produced by ``fit_transform``; or the tuple
        ``(matrix, vectorizer)`` when ``return_vectorizer`` is True.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        **vectorizer_kwargs,
    )
    matrix = vectorizer.fit_transform(texts)

    if return_vectorizer:
        return matrix, vectorizer
    return matrix

# Plot metrics Functions

In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score


def plot_confusion_matrix(y_true, y_pred, labels, title='Confusion Matrix'):
    """Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        labels: Label values passed to confusion_matrix to select and order the
            classes; also used as the tick labels on both axes.
        title: Figure title.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(title)
    plt.show()

# Optional debugging aid; not required by the rest of the notebook.
def print_qualitative_results(model, X_test, y_test, labels, num_examples=5):
    """Print a few predictions next to their ground truth.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test inputs, indexable by position.
        y_test: Encoded ground-truth labels, indexable by position.
        labels: Sequence mapping an encoded label to its display name.
        num_examples: Maximum number of examples to print; fewer are printed
            when the model returns fewer predictions.
    """
    y_pred = model.predict(X_test)

    # Never index past the available predictions: asking for more examples than
    # there are test rows previously raised IndexError part-way through the output.
    shown = min(num_examples, len(y_pred))

    for i in range(shown):
        print(f"Input Text: {X_test[i]}")
        print(f"Predicted: {labels[y_pred[i]]}")
        print(f"Ground Truth: {labels[y_test[i]]}\n")


In [None]:

def cross_validation_models(model, X, y):
    """Cross-validate `model`, plot its confusion matrix and report macro metrics.

    Out-of-fold predictions are produced with a shuffled 5-fold KFold
    (random_state=42); every metric is derived from the resulting confusion matrix.

    Args:
        model: Unfitted scikit-learn style estimator.
        X: Feature matrix.
        y: Encoded target labels.

    Returns:
        dict: Keys 'accuracy', 'precision', 'recall' and 'f1_score'. Precision,
        recall and F1 are unweighted means over classes. A class that is never
        predicted has an undefined precision and is left out of the precision
        mean (nanmean), as before. Per-class F1 is computed as
        2*TP / (2*TP + FP + FN), so a class with no correct prediction
        contributes 0 instead of being silently dropped from the average.
    """
    # Setting up cross-validation with KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Out-of-fold prediction for every sample
    y_pred = cross_val_predict(model, X, y, cv=kf)

    # Rows are true classes, columns are predicted classes
    conf_matrix = confusion_matrix(y, y_pred)

    # Plotting the confusion matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', cbar=False)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    true_positives = np.diag(conf_matrix).astype(float)
    predicted_per_class = np.sum(conf_matrix, axis=0)  # TP + FP
    actual_per_class = np.sum(conf_matrix, axis=1)     # TP + FN

    accuracy = np.trace(conf_matrix) / float(np.sum(conf_matrix))

    # Zero denominators are expected for classes that are never predicted; they
    # yield NaN on purpose, so silence the divide warnings instead of emitting
    # one per call.
    with np.errstate(divide='ignore', invalid='ignore'):
        precision = true_positives / predicted_per_class
        recall = true_positives / actual_per_class
        # Same value as 2PR/(P+R) whenever that is defined, but stays 0 rather
        # than NaN when a class has no true positives. The local name avoids
        # shadowing sklearn's f1_score.
        f1_per_class = 2 * true_positives / (predicted_per_class + actual_per_class)

    # Collapse the per-class values into mean scores that summarize performance
    metrics = {
        'accuracy': accuracy,
        'precision': np.nanmean(precision),  # nanmean skips classes with undefined precision
        'recall': np.nanmean(recall),
        'f1_score': np.nanmean(f1_per_class)
    }

    # Outputting the results for quick inspection
    print(f"Cross-Validated Accuracy: {metrics['accuracy']:.4f}")
    print(f"Cross-Validated Precision: {metrics['precision']:.4f}")
    print(f"Cross-Validated Recall: {metrics['recall']:.4f}")
    print(f"Cross-Validated F1-Score: {metrics['f1_score']:.4f}")

    return metrics

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer

import os

# Load and preprocess data.
# The CSV location can be overridden with the NEWS_CLASSIFICATION_CSV environment
# variable; the original absolute path remains the fallback so existing runs on
# the author's machine behave as before.
file_path = os.environ.get(
    'NEWS_CLASSIFICATION_CSV',
    '/Users/theodorapoultsidou/Desktop/machinelearning/news-classification.csv',
)
df = load_and_preprocess_data(file_path)

# Vectorize text data.
# NOTE(review): the vectorizer is fitted on the full data set before any split, so
# the hold-out rows influence the vocabulary and IDF weights — confirm this is acceptable.
tfidf_vectorized_data = tfidf_vectorize(df['text'])

# Encode labels for Level 1 and Level 2 categories
label_encoder_1 = LabelEncoder()
y_level_1 = label_encoder_1.fit_transform(df['category_level_1'])

label_encoder_2 = LabelEncoder()
y_level_2 = label_encoder_2.fit_transform(df['category_level_2'])

# Split features and both label sets in a single call so the level-1 and level-2
# splits are guaranteed to contain the same rows (same seed and test size as before).
(X_train, X_test,
 y_train_level_1, y_test_level_1,
 y_train_level_2, y_test_level_2) = train_test_split(
    tfidf_vectorized_data, y_level_1, y_level_2, test_size=0.2, random_state=42
)

# The level-2 feature split is the same row split as level 1; keep the original names.
X_train_level_2, X_test_level_2 = X_train, X_test
    


In [None]:
# Preview the first rows of the preprocessed frame (shown as the cell's last expression).
df.head()

In [None]:
# Display basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
print("Basic Information:")
df.info()

# Check for missing values (count of nulls per column)
print("\nMissing Values:")
print(df.isnull().sum())

In [None]:
# Class balance of both target columns: number of articles per category.
for level in (1, 2):
    print(f"\nDistribution of Category Level {level}:")
    print(df[f'category_level_{level}'].value_counts())


# Model Training and Evaluation 

#### test different alpha values for Naive Bayes

In [None]:
# Define the range of alpha values to test
alpha_values = np.logspace(-3, 1, 10)  # 10 values from 0.001 to 10

# Lists to store the results
train_accuracies = []
test_accuracies = []
cv_accuracies = []  # 5-fold cross-validated accuracy on the full data set

# Evaluate the model for each alpha value
for alpha in alpha_values:
    model = MultinomialNB(alpha=alpha)
    ovr_model = OneVsRestClassifier(model)

    # Cross-validated accuracy. It used to be stored in `test_accuracy` and was
    # overwritten by the hold-out score below before being used; it now has its
    # own list and its own curve.
    metrics_nb_l1 = cross_validation_models(ovr_model, tfidf_vectorized_data, y_level_1)
    cv_accuracies.append(metrics_nb_l1['accuracy'])

    # Fit the model on the training split and evaluate on the hold-out split
    ovr_model.fit(X_train, y_train_level_1)
    train_accuracy = ovr_model.score(X_train, y_train_level_1)
    test_accuracy = ovr_model.score(X_test, y_test_level_1)

    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

# Plot the accuracy curves
plt.figure(figsize=(5, 3))
plt.plot(alpha_values, train_accuracies, label='Train Accuracy', marker='o')
plt.plot(alpha_values, test_accuracies, label='Test Accuracy', marker='o')
plt.plot(alpha_values, cv_accuracies, label='CV Accuracy', marker='o')
plt.xscale('log')
plt.xlabel('Alpha')
plt.ylabel('Accuracy')
plt.title('Naive Bayes Accuracy vs. Alpha')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Level 1, Naive Bayes: 5-fold cross-validation on the full TF-IDF matrix.
model_nb_l1 = MultinomialNB(alpha=0.1)
metrics_nb_l1 = cross_validation_models(
    model=model_nb_l1,
    X=tfidf_vectorized_data,
    y=y_level_1,
)


#### Test different values of k for KNN

In [None]:
# Hold-out split for level 1 (same seed and test size as the earlier split).
X_train, X_test, y_train_level_1, y_test_level_1 = train_test_split(
    tfidf_vectorized_data, y_level_1, test_size=0.2, random_state=42
)

# Candidate neighbourhood sizes: 1, 5, 9, 13, 17 (step of 4).
k_values = range(1, 21, 4)

# Accuracy per k on the training and hold-out splits.
train_accuracies, test_accuracies = [], []

for k in k_values:
    # Fit on the training split, then score both splits.
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train_level_1)
    train_accuracy = model.score(X_train, y_train_level_1)
    test_accuracy = model.score(X_test, y_test_level_1)

    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

# Accuracy as a function of k.
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_values, train_accuracies, label='Train Accuracy', marker='o')
ax.plot(k_values, test_accuracies, label='Test Accuracy', marker='o')
ax.set_xlabel('Number of Neighbors (k)')
ax.set_ylabel('Accuracy')
ax.set_title('KNN Accuracy vs. Number of Neighbors (k)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Level 1, KNN (k=16): 5-fold cross-validation on the full TF-IDF matrix.
model_knn_l1 = KNeighborsClassifier(n_neighbors=16)
metrics_knn_l1 = cross_validation_models(
    model=model_knn_l1,
    X=tfidf_vectorized_data,
    y=y_level_1,
)

#### Try different values of C for logistic regression

In [None]:
# Define the range of C values to test
C_values = np.logspace(-3, 2, 10)  # 10 values from 0.001 to 100

# Lists to store the results
train_accuracies = []
test_accuracies = []
cv_accuracies = []  # 5-fold cross-validated accuracy on the full data set

# Evaluate the model for each C value
for C in C_values:
    model = LogisticRegression(C=C, solver='sag', max_iter=1000)
    ovr_model = OneVsRestClassifier(model)

    # Cross-validated accuracy. It used to be stored in `test_accuracy` and was
    # overwritten by the hold-out score below before being used; it now has its
    # own list and its own curve.
    metrics_lr_l1 = cross_validation_models(ovr_model, tfidf_vectorized_data, y_level_1)
    cv_accuracies.append(metrics_lr_l1['accuracy'])

    # Fit the model on the training split and evaluate on the hold-out split
    ovr_model.fit(X_train, y_train_level_1)
    train_accuracy = ovr_model.score(X_train, y_train_level_1)
    test_accuracy = ovr_model.score(X_test, y_test_level_1)

    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

# Plot the accuracy curves.
# In scikit-learn, C is the INVERSE of the regularization strength (larger C means
# weaker regularization), so the axis label and title say so explicitly.
plt.figure(figsize=(10, 6))
plt.plot(C_values, train_accuracies, label='Train Accuracy', marker='o')
plt.plot(C_values, test_accuracies, label='Test Accuracy', marker='o')
plt.plot(C_values, cv_accuracies, label='CV Accuracy', marker='o')
plt.xscale('log')
plt.xlabel('Inverse Regularization Strength (C)')
plt.ylabel('Accuracy')
plt.title('Logistic Regression Accuracy vs. Inverse Regularization Strength (C)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Level 1, one-vs-rest Logistic Regression (C=100): 5-fold cross-validation
# on the full TF-IDF matrix.
model_lr_l1 = OneVsRestClassifier(
    LogisticRegression(C=100, solver='sag', max_iter=1000)
)
metrics_lr_l1 = cross_validation_models(
    model=model_lr_l1,
    X=tfidf_vectorized_data,
    y=y_level_1,
)

#### Try different values of C for the SVC

In [None]:
# Hold-out split for level 1 (same seed and test size as the earlier splits).
X_train, X_test, y_train_level_1, y_test_level_1 = train_test_split(
    tfidf_vectorized_data, y_level_1, test_size=0.2, random_state=42
)

# 11 candidate C values, log-spaced in base 2 from 2**-5 to 2**15.
C_values = np.logspace(-5, 15, num=11, base=2)

# Accuracy per C on the training and hold-out splits.
train_accuracies, test_accuracies = [], []

for C in C_values:
    model = SVC(C=C, kernel='linear')

    # Fit the one-vs-rest wrapper on the training split, then score both splits.
    ovr_model = OneVsRestClassifier(model).fit(X_train, y_train_level_1)
    train_accuracy = ovr_model.score(X_train, y_train_level_1)
    test_accuracy = ovr_model.score(X_test, y_test_level_1)

    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

# Accuracy as a function of C (log-scaled x axis).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(C_values, train_accuracies, label='Train Accuracy', marker='o')
ax.plot(C_values, test_accuracies, label='Test Accuracy', marker='o')
ax.set_xscale('log')
ax.set_xlabel('Regularization Parameter (C)')
ax.set_ylabel('Accuracy')
ax.set_title('SVC Accuracy vs. Regularization Parameter (C)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Level 1, one-vs-rest linear SVC (C=10): 5-fold cross-validation on the full
# TF-IDF matrix.
model_svc_l1 = OneVsRestClassifier(SVC(C=10, kernel='linear'))
metrics_svc_l1 = cross_validation_models(
    model=model_svc_l1,
    X=tfidf_vectorized_data,
    y=y_level_1,
)

In [None]:
# Level 2, KNN (k=16): 5-fold cross-validation.
# Uses the full TF-IDF matrix and label vector, like every other model in this
# notebook; cross-validating on the 80% training split only made the KNN scores
# not comparable with the Naive Bayes, Logistic Regression and SVC results.
model_knn_l2 = KNeighborsClassifier(n_neighbors=16)
metrics_knn_l2 = cross_validation_models(model_knn_l2, tfidf_vectorized_data, y_level_2)

In [None]:
# Level 2, Naive Bayes (alpha=0.1): 5-fold cross-validation on the full TF-IDF matrix.
model_nb_l2 = MultinomialNB(alpha=0.1)
metrics_nb_l2 = cross_validation_models(
    model=model_nb_l2,
    X=tfidf_vectorized_data,
    y=y_level_2,
)


In [None]:
# Level 2, one-vs-rest Logistic Regression (C=100): 5-fold cross-validation
# on the full TF-IDF matrix.
model_lr_l2 = OneVsRestClassifier(
    LogisticRegression(C=100, solver='sag', max_iter=1000)
)
metrics_lr_l2 = cross_validation_models(
    model=model_lr_l2,
    X=tfidf_vectorized_data,
    y=y_level_2,
)

In [None]:
# Level 2, one-vs-rest linear SVC (C=10): 5-fold cross-validation on the full
# TF-IDF matrix.
model_svc_l2 = OneVsRestClassifier(SVC(C=10, kernel='linear'))
metrics_svc_l2 = cross_validation_models(
    model=model_svc_l2,
    X=tfidf_vectorized_data,
    y=y_level_2,
)


In [None]:
import time
# Per-category results, filled in below as {category: {model_name: {metric: value}}}.
hierarchical_results = {}

# One independent list per (model, metric) pair; each list receives one value per
# Level 1 category processed.
model_aggregate_metrics = {
    model_name: {
        metric_name: []
        for metric_name in (
            'Accuracy', 'Precision', 'Recall', 'F1-Score',
            'Training Time', 'Prediction Time',
        )
    }
    for model_name in ('Naive Bayes', 'Logistic Regression', 'SVC', 'KNN')
}

def measure_time(model, X_train, X_test, y_train, y_test):
    """Time one ``fit`` call and one ``predict`` call of `model`.

    NOTE: this notebook defines measure_time in more than one cell; the most
    recently executed definition is the one in effect.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``fit``.
        X_test: Features passed to ``predict``.
        y_train: Training labels passed to ``fit``.
        y_test: Not used by this function; accepted so the four arrays of a
            train/test split can be passed straight through.

    Returns:
        tuple[float, float]: ``(training_time, prediction_time)`` in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted mid-run.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_train

    start_predict = time.perf_counter()
    model.predict(X_test)  # predictions are discarded; only the duration is reported
    prediction_time = time.perf_counter() - start_predict

    return training_time, prediction_time

# Hierarchical evaluation: for every Level 1 category, train Level 2 classifiers
# on that category's articles only and evaluate them on a single 80/20 split.
for category in label_encoder_1.classes_:
    print(f"Processing category: {category}")
    df_category = df[df['category_level_1'] == category]

    if df_category.empty:
        print(f"No data available for category {category}. Skipping...")
        continue

    # NOTE(review): the vectorizer is fitted on the whole category before the
    # split, so hold-out documents influence the vocabulary/IDF — confirm intended.
    tfidf_vectorizer = TfidfVectorizer()
    X_category = tfidf_vectorizer.fit_transform(df_category['text'])
    # The global Level 2 encoder is reused so label ids match the flat experiments.
    y_category = label_encoder_2.transform(df_category['category_level_2'])

    X_train, X_test, y_train, y_test = train_test_split(X_category, y_category, test_size=0.2, random_state=42)

    # Fresh, unfitted estimators for every category
    for model_name, model in {
        'Naive Bayes': OneVsRestClassifier(MultinomialNB(alpha=0.1)),
        'Logistic Regression':  OneVsRestClassifier(LogisticRegression(C=100,solver='sag',max_iter=1000 )),
        'SVC':  OneVsRestClassifier(SVC(C=10, kernel='linear')),
        'KNN': KNeighborsClassifier(n_neighbors=6)
    }.items():
        # Time fit and predict inline so the timed predictions can be reused for
        # the metrics; previously every model predicted twice (once for timing,
        # once again for scoring).
        start = time.perf_counter()
        model.fit(X_train, y_train)
        train_time = time.perf_counter() - start

        start = time.perf_counter()
        predictions = model.predict(X_test)
        predict_time = time.perf_counter() - start

        # zero_division=0 scores a class with no predicted/true samples as 0, the
        # same value scikit-learn falls back to by default, without the warning.
        scores = {
            'Accuracy': accuracy_score(y_test, predictions),
            'Precision': precision_score(y_test, predictions, average='macro', zero_division=0),
            'Recall': recall_score(y_test, predictions, average='macro', zero_division=0),
            'F1-Score': f1_score(y_test, predictions, average='macro', zero_division=0),
            'Training Time': train_time,
            'Prediction Time': predict_time
        }

        # One value per category and metric, averaged per model further down
        for metric_name, value in scores.items():
            model_aggregate_metrics[model_name][metric_name].append(value)

        hierarchical_results.setdefault(category, {})[model_name] = scores
        print(f"Results for {category} - {model_name}: {hierarchical_results[category][model_name]}")

# Mean of every collected metric (timings included) per model, across categories.
for model_name, metrics in model_aggregate_metrics.items():
    print(f"\nAverage Metrics for {model_name}:")
    for metric_name, values in metrics.items():
        average_metric = np.mean(values)
        print(f"{metric_name}: {average_metric:.4f}")

# The two timing averages once more, in a separate per-model summary.
print("\nAverage Training and Prediction Times for Each Model:")
for model_name, metrics in model_aggregate_metrics.items():
    avg_training_time, avg_prediction_time = (
        np.mean(metrics[key]) for key in ('Training Time', 'Prediction Time')
    )
    print(f"{model_name}:")
    print(f"  Average Training Time: {avg_training_time:.4f} seconds")
    print(f"  Average Prediction Time: {avg_prediction_time:.4f} seconds")

In [None]:
# Per-model average of each collected metric across all Level 1 categories.
for model_name, metrics in model_aggregate_metrics.items():
    print(f"\nAverage Metrics for {model_name}:")
    averages = {metric_name: np.mean(values) for metric_name, values in metrics.items()}
    for metric_name, average_metric in averages.items():
        print(f"{metric_name}: {average_metric:.4f}")


#### Measure training and prediction time. This runs on a single train/test split rather than k-fold cross-validation, so that the timings can be compared with the hierarchical approach, which also uses a single split.

In [None]:
# Single 80/20 split on the Level 1 labels for the timing comparison.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectorized_data, y_level_1, test_size=0.2, random_state=42
)

# Flat classifiers to be timed.
nb = MultinomialNB(alpha=0.1)
lr = OneVsRestClassifier(LogisticRegression(C=100, solver='sag', max_iter=1000))
svc = OneVsRestClassifier(SVC(C=10, kernel='linear'))
knn = KNeighborsClassifier(n_neighbors=10)


def measure_time(model, X_train, X_test, y_train, y_test):
    """Time one ``fit`` call and one ``predict`` call of `model`.

    NOTE: this notebook defines measure_time in more than one cell; the most
    recently executed definition is the one in effect.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``fit``.
        X_test: Features passed to ``predict``.
        y_train: Training labels passed to ``fit``.
        y_test: Not used by this function; accepted so the four arrays of a
            train/test split can be passed straight through.

    Returns:
        tuple[float, float]: ``(training_time, prediction_time)`` in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted mid-run.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_train

    start_predict = time.perf_counter()
    model.predict(X_test)  # predictions are discarded; only the duration is reported
    prediction_time = time.perf_counter() - start_predict

    return training_time, prediction_time

# Time every model in turn; the dict literal evaluates its values in the order
# written, so the models are fitted in the order NB, LR, SVC, KNN.
timings = {
    "Naive Bayes": measure_time(nb, X_train, X_test, y_train, y_test),
    "Logistic Regression": measure_time(lr, X_train, X_test, y_train, y_test),
    "SVC": measure_time(svc, X_train, X_test, y_train, y_test),
    "KNN": measure_time(knn, X_train, X_test, y_train, y_test),
}

# Keep the individual timing variables available to later cells.
nb_train_time, nb_predict_time = timings["Naive Bayes"]
lr_train_time, lr_predict_time = timings["Logistic Regression"]
svc_train_time, svc_predict_time = timings["SVC"]
knn_train_time, knn_predict_time = timings["KNN"]

for display_name, (train_seconds, predict_seconds) in timings.items():
    print(f"{display_name} - Training Time: {train_seconds:.4f}s, Prediction Time: {predict_seconds:.4f}s")

In [None]:
# Single 80/20 split on the Level 2 labels for the timing comparison.
X_train_level_2, X_test_level_2, y_train, y_test = train_test_split(
    tfidf_vectorized_data, y_level_2, test_size=0.2, random_state=42
)

# Flat classifiers to be timed.
nb = MultinomialNB(alpha=0.1)
lr = OneVsRestClassifier(LogisticRegression(C=100, solver='sag', max_iter=1000))
svc = OneVsRestClassifier(SVC(C=10, kernel='linear'))
knn = KNeighborsClassifier(n_neighbors=10)


def measure_time(model, X_train, X_test, y_train, y_test):
    """Time one ``fit`` call and one ``predict`` call of `model`.

    NOTE: this notebook defines measure_time in more than one cell; the most
    recently executed definition is the one in effect.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``fit``.
        X_test: Features passed to ``predict``.
        y_train: Training labels passed to ``fit``.
        y_test: Not used by this function; accepted so the four arrays of a
            train/test split can be passed straight through.

    Returns:
        tuple[float, float]: ``(training_time, prediction_time)`` in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted mid-run.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_train

    start_predict = time.perf_counter()
    model.predict(X_test)  # predictions are discarded; only the duration is reported
    prediction_time = time.perf_counter() - start_predict

    return training_time, prediction_time

# Time each model on the Level 2 split created at the top of this cell.
# The feature matrices are passed under the names this cell defines
# (X_train_level_2 / X_test_level_2). The previous calls used X_train / X_test,
# which this cell never assigns, so they silently depended on whatever an
# earlier cell had left in those variables.
nb_train_time, nb_predict_time = measure_time(nb, X_train_level_2, X_test_level_2, y_train, y_test)
lr_train_time, lr_predict_time = measure_time(lr, X_train_level_2, X_test_level_2, y_train, y_test)
svc_train_time, svc_predict_time = measure_time(svc, X_train_level_2, X_test_level_2, y_train, y_test)
knn_train_time, knn_predict_time = measure_time(knn, X_train_level_2, X_test_level_2, y_train, y_test)

print(f"Naive Bayes - Training Time: {nb_train_time:.4f}s, Prediction Time: {nb_predict_time:.4f}s")
print(f"Logistic Regression - Training Time: {lr_train_time:.4f}s, Prediction Time: {lr_predict_time:.4f}s")
print(f"SVC - Training Time: {svc_train_time:.4f}s, Prediction Time: {svc_predict_time:.4f}s")
print(f"KNN - Training Time: {knn_train_time:.4f}s, Prediction Time: {knn_predict_time:.4f}s")
