In [None]:
import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports

In [None]:
!pip install datasketch

Collecting datasketch
  Downloading datasketch-1.6.5-py3-none-any.whl.metadata (5.8 kB)
Downloading datasketch-1.6.5-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasketch
Successfully installed datasketch-1.6.5


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spacy
from multiprocessing import Pool
import time

import pickle
import csv

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import binarize
from sklearn.model_selection import train_test_split

from scipy.spatial.distance import jaccard
from scipy.sparse import csr_matrix

from datasketch import MinHash, MinHashLSH

print("Imports have been run...")

Imports have been run...


## Preprocessing
### Read datasets

In [None]:
# Load the labelled training set and the unlabelled test set, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/bigdata2024classification/train.csv",
        "/kaggle/input/bigdata2024classification/test_without_labels.csv",
    )
)

print("Files loaded...")

Files loaded...


### Visualise label distribution in the dataset

In [None]:
# Number of training rows per label
label_counts = train_df['Label'].value_counts()

# Series.plot returns the Axes it drew on; title and axis labels are set on that object
ax = label_counts.plot(kind='bar')
ax.set_title('Label distribution in train set')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

### Dataset cleanup and pre-processing

## Bag of Words + Scaling

In [None]:
# Drop training rows with a missing Content or Label, then lower-case the text.
# Both steps are chained into a single expression that yields a fresh DataFrame,
# so the column is never assigned on the intermediate dropna() result (this is
# what can trigger pandas' SettingWithCopyWarning).
train_df = (
    train_df
    .dropna(subset=['Content', 'Label'])
    .assign(Content=lambda frame: frame['Content'].str.lower())
)

# Every test row has to be kept so that each Id still receives a prediction.
# A missing Content is therefore turned into an empty document rather than
# dropped; left as NaN it would make the vectorizer fail later on.
test_df = test_df.assign(Content=test_df['Content'].fillna('').str.lower())

print("Cleanup done...")

Cleanup done...


### Notes for max features and lemmatization

#### Max Features
* 1000: Lower 90s percentage, good performance
* 5000: Best results
* 10000: accuracy declines again

#### Lemmatization
Lemmatization did not improve accuracy for any of the feature combinations tested. It only doubled the total computation time, including the fitting of the SVM and Random Forest models.

In [None]:
# --- Bag of words, 1000-term vocabulary ---
# The vocabulary is learned on the training text only and then applied to the test text.
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
count_vect = vectorizer.fit_transform(train_df['Content'])
count_vect_test = vectorizer.transform(test_df['Content'])

# --- Bag of words, 5000-term vocabulary ---
vectorizer_big = CountVectorizer(stop_words='english', max_features=5000)
count_vect_big = vectorizer_big.fit_transform(train_df['Content'])
count_vect_test_big = vectorizer_big.transform(test_df['Content'])

# --- Scaling ---
# A single MaxAbsScaler object is fitted twice. The order below matters: the
# 1000-feature test matrix must be transformed BEFORE the scaler is re-fitted on
# the 5000-feature matrix. After this cell `scaler` holds the 5000-feature fit.
scaler = MaxAbsScaler()

count_vect_scaled = scaler.fit_transform(count_vect)
count_vect_test_scaled = scaler.transform(count_vect_test)

count_vect_big_scaled = scaler.fit_transform(count_vect_big)
count_vect_test_big_scaled = scaler.transform(count_vect_test_big)

print("Bag of words and scaling done...")

Bag of words and scaling done...


## SVM

In [None]:
# Linear-kernel Support Vector Machine.
# LinearSVC is the faster linear-kernel implementation of support vector
# classification and handles multiple classes with a one-vs-the-rest strategy.
svm_model = LinearSVC(max_iter=10000)

# Report header -> feature matrix to evaluate, in the order the reports are printed
svm_runs = (
    ("========== SVM Classification Report ==========\n", count_vect_scaled),
    ("========== SVM Classification Report - More Features ==========\n", count_vect_big_scaled),
)

for report_header, feature_matrix in svm_runs:
    # 5-fold cross-validation: each row is predicted by a model that did not train on it
    prediction = cross_val_predict(svm_model, feature_matrix, train_df['Label'], cv=5)

    # Per-category precision / recall / F1
    print(report_header)
    print(classification_report(train_df['Label'], prediction))


               precision    recall  f1-score   support

     Business       0.80      0.76      0.78     24834
Entertainment       0.84      0.94      0.89     44834
       Health       0.83      0.68      0.75     12020
   Technology       0.85      0.80      0.82     30107

     accuracy                           0.83    111795
    macro avg       0.83      0.79      0.81    111795
 weighted avg       0.83      0.83      0.83    111795


               precision    recall  f1-score   support

     Business       0.88      0.88      0.88     24834
Entertainment       0.96      0.97      0.97     44834
       Health       0.92      0.89      0.90     12020
   Technology       0.90      0.91      0.90     30107

     accuracy                           0.92    111795
    macro avg       0.92      0.91      0.91    111795
 weighted avg       0.92      0.92      0.92    111795



## Random Forest

In [None]:
# Random Forest
# A random forest is a meta estimator that fits a number of decision tree
# classifiers on various sub-samples of the dataset and averages them to improve
# predictive accuracy and control over-fitting.
# n_jobs=-1 uses all available processors. random_state is fixed so the
# bootstrap samples and feature subsets, and therefore the reports below, are the
# same on every run of the notebook.
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)

# Report header -> feature matrix to evaluate, in the order the reports are printed
rf_runs = (
    ("========== Random Forest Classification Report ==========\n", count_vect_scaled),
    ("========== Random Forest Classification Report - More Features ==========\n", count_vect_big_scaled),
)

for report_header, feature_matrix in rf_runs:
    # 5-fold cross-validation: each row is predicted by a model that did not train on it
    prediction = cross_val_predict(rf_model, feature_matrix, train_df['Label'], cv=5)

    # Per-category precision / recall / F1
    print(report_header)
    print(classification_report(train_df['Label'], prediction))

## Predict Test Set with the best performing model

In [None]:
# Final model: LinearSVC on the 5000-feature bag of words, which was the best
# performing configuration in the cross-validation above.
# NOTE(review): this cell originally quoted 95% accuracy, but the saved SVM
# report for the 5000-feature run shows 0.92 - confirm which figure is current.

# fit() returns the fitted estimator, so construction and training are chained
svm_model = LinearSVC(max_iter=10000).fit(count_vect_big_scaled, train_df['Label'])
print("Training complete...")

# Label every test document with the model trained on the full training set
prediction = svm_model.predict(count_vect_test_big_scaled)
print("Prediction of test set complete...")

In [None]:
# Write the submission file: a header row followed by one (Id, Predicted) row per
# test document.
# Ids and predictions are paired by position. The previous label lookup
# test_df['Id'][i] only worked while test_df kept its default 0..n-1 index and
# would fail or mis-pair rows if the frame were ever filtered or re-indexed.
assert len(prediction) == len(test_df), "prediction and test_df are out of sync"

with open('testSet_categories.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Id", "Predicted"])
    writer.writerows(zip(test_df['Id'], prediction))

print("CSV created...")

## Nearest Neighbours w/ Jaccard - Brute

In [None]:
# Hold-out variant, used only to measure the brute-force F1 score for Question 1.
# Enable the split below (and disable the default assignment) when that score is wanted.
# train_x, test_x, train_y, test_y = train_test_split(train_df['Content'], train_df['Label'], test_size=0.1, random_state=0)

# Default: the assignment's original train / test documents.
# Disable these two lines when the hold-out split above is enabled.
train_x = train_df['Content']
test_x = test_df['Content']

# Vectorize again for the nearest-neighbour experiments with a 5000-term vocabulary.
# Note: this rebinds `vectorizer` and `count_vect_test` from the classification section.
vectorizer = CountVectorizer(max_features=5000)
count_vect_train = vectorizer.fit_transform(train_x)
count_vect_test = vectorizer.transform(test_x)

# Analyzer callable built from the vectorizer; the MinHash LSH code uses it to
# split documents into tokens.
analyzer = vectorizer.build_analyzer()

print("KNN pre-processing done...")

In [None]:
# Binarise first, densify second.
# The Jaccard metric only needs term presence/absence. Casting the sparse count
# matrices to bool BEFORE calling toarray() means the dense arrays are allocated
# with one-byte booleans directly, instead of first materialising a dense array
# of integer counts (int64 by default for CountVectorizer) and then copying it.
# The resulting arrays are identical to densifying first.
count_vect_train = count_vect_train.astype(bool)
count_vect_test = count_vect_test.astype(bool)

print("Vects became bool for jaccard...")

# Dense arrays are what the brute-force Jaccard search below is fitted on
count_vect_train = count_vect_train.toarray()
count_vect_test = count_vect_test.toarray()

print("Vects became dense...")

In [None]:
# Timings use time.perf_counter(): a monotonic, high-resolution clock intended for
# measuring intervals, unaffected by system clock adjustments (unlike time.time()).

# ----- Build: fit the exact (brute-force) Jaccard nearest-neighbour index -----
start_time = time.perf_counter()

knn_model = NearestNeighbors(n_neighbors=7, algorithm='brute', metric='jaccard', n_jobs=-1)
knn_model.fit(count_vect_train)

build_time = time.perf_counter() - start_time

print("KNN fitting completed...")
print(f"Build time: {build_time} seconds")

# ----- Query: the 7 nearest training documents for every test document -----
start_time = time.perf_counter()

# `indices` holds row positions into count_vect_train; it is the ground truth the
# LSH experiments further down are compared against.
distances, indices = knn_model.kneighbors(count_vect_test)

# Stop the clock before printing so console output is not counted as query time
query_time = time.perf_counter() - start_time

print ("Finished querying...")
print(f"Query time: {query_time} seconds")

# Total time
total_time = build_time + query_time
print(f"Total time: {total_time} seconds")


#### Calculate F1 Metric

In [None]:
# Comment out when running with the assignment's original datasets

# # Convert neighbor indices to labels using majority voting
# predicted_labels = []
# for i in range(len(indices)):
#     neighbor_labels = train_y.iloc[indices[i]]  # Get the labels of nearest neighbors
#     predicted_label = max(set(neighbor_labels), key=list(neighbor_labels).count)  # Majority voting
#     predicted_labels.append(predicted_label)

# # Compute F1-score
# f1 = f1_score(test_y, predicted_labels, average='weighted')
# print(f"KNN F1 Score: {f1:.4f}")

## Nearest Neighbours w/ Jaccard - MinHash LSH

In [None]:
# Create MinHash functions
def get_minhash(content, num_perm):
    """Build a MinHash signature for a single document.

    Args:
        content: Raw document text. It is tokenised with the notebook-level
            `analyzer` (built from the CountVectorizer in the KNN pre-processing cell).
        num_perm: Number of permutation functions used for the signature. It must
            match the `num_perm` of the LSH index the signature is used with.

    Returns:
        A `MinHash` updated with every distinct token of `content`.
    """
    # NOTE(review): the analyzer yields every token of the document, whereas the
    # brute-force vectors only cover the vectorizer's 5000 most frequent terms, so
    # the two Jaccard similarities are computed over different token sets - confirm
    # this is intended.
    m = MinHash(num_perm=num_perm)

    # A MinHash keeps only the minimum hash value per permutation, so feeding the
    # same token again cannot change the signature. Hashing each distinct token
    # once therefore gives the same result with less work on long documents.
    for word in set(analyzer(content)):
        m.update(word.encode('utf8'))
    return m

# Query the LSH index for similar content
def query_lsh_index(content, lsh, minhashes, num_perm, num_candidates=7):
    """Return the most similar indexed documents for `content`.

    Args:
        content: Raw text of the query document.
        lsh: `MinHashLSH` index holding the training documents.
        minhashes: Mapping from document key (as inserted into `lsh`) to that
            document's `MinHash`.
        num_perm: Number of permutations; must match the one used to build `lsh`.
        num_candidates: Maximum number of neighbours to return (7 for the assignment).

    Returns:
        A list of at most `num_candidates` `(key, MinHash)` tuples, ordered from
        the highest to the lowest estimated Jaccard similarity to `content`.
        The list is empty when the index returns no candidate.
    """
    # Create MinHash for the content
    content_minhash = get_minhash(content, num_perm)

    # Keys of all indexed documents that share at least one LSH bucket with the
    # query. The index returns them in no particular order.
    candidate_keys = lsh.query(content_minhash)

    # Rank the candidates by estimated Jaccard similarity before truncating.
    # Slicing the unordered candidate list directly would keep an arbitrary subset
    # rather than the closest documents. sorted() is stable, so ties keep the
    # order in which the index returned them.
    ranked_keys = sorted(
        candidate_keys,
        key=lambda key: content_minhash.jaccard(minhashes[key]),
        reverse=True,
    )

    # Pair each of the best keys with its stored MinHash
    return [(key, minhashes[key]) for key in ranked_keys[:num_candidates]]

### Different params cells

In [None]:
#===== EXPERIMENT RUNNER =====#

def run_lsh_experiment(threshold, num_perm, train_docs, test_docs, brute_indices, num_candidates=7):
    """Build a MinHash LSH index, query it with every test document and score it.

    This replaces the per-configuration cells, which were identical except for
    the values of `threshold` and `num_perm`.

    Args:
        threshold: Jaccard similarity threshold passed to `MinHashLSH`.
        num_perm: Number of permutations for the MinHash signatures and the index.
        train_docs: Iterable of training documents. Each is indexed under its
            position (0, 1, 2, ...), i.e. the same numbering the brute-force
            search returns.
        test_docs: Iterable of query documents.
        brute_indices: One sequence of exact nearest-neighbour positions per query
            document, in the same order as `test_docs`.
        num_candidates: Maximum number of neighbours kept per query.

    Returns:
        dict with keys `threshold`, `num_perm`, `build_time`, `query_time`,
        `total_time`, `precision`, and the built objects `lsh`, `minhashes`
        and `lsh_results`.

    Raises:
        ValueError: If the number of queries differs from the number of rows in
            `brute_indices` (the two were computed on different data).
    """
    #===== BUILD =====#
    start_time = time.perf_counter()

    # Create the LSH index and insert one MinHash per training document
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}
    for i, content in enumerate(train_docs):
        minhashes[i] = get_minhash(content, num_perm)
        lsh.insert(i, minhashes[i])

    # Build time covers signature creation plus insertion into the index
    build_time = time.perf_counter() - start_time

    print(f"Finished building LSH Index and Minhashes (threshold={threshold}, num_perm={num_perm})...")
    print(f"Build time: {build_time} seconds")

    #===== QUERY =====#
    start_time = time.perf_counter()

    lsh_results = [
        query_lsh_index(content, lsh, minhashes, num_perm, num_candidates=num_candidates)
        for content in test_docs
    ]

    # Stop the clock before printing so console output is not counted as query time
    query_time = time.perf_counter() - start_time

    print ("Finished querying...")
    print(f"Query time: {query_time} seconds")

    total_time = build_time + query_time
    print(f"Total time: {total_time} seconds")

    #===== ASSESS =====#
    # The comparison is positional, so both result sets must describe the same queries
    if len(lsh_results) != len(brute_indices):
        raise ValueError(
            f"Got {len(lsh_results)} LSH results but {len(brute_indices)} brute-force results; "
            "re-run the brute-force cell on the same test documents."
        )

    correct_matches = 0
    total_retrieved = 0

    for lsh_hits, brute_neighbors in zip(lsh_results, brute_indices):
        lsh_neighbors = [doc_id for doc_id, _ in lsh_hits]  # Extract only document IDs
        # int() normalises numpy integers and Python ints before intersecting
        correct_matches += len(set(map(int, lsh_neighbors)) & set(map(int, brute_neighbors)))
        total_retrieved += len(lsh_neighbors)

    # Share of LSH-retrieved neighbours that are also exact nearest neighbours;
    # defined as 0 when the index returned nothing at all
    precision = correct_matches / total_retrieved if total_retrieved > 0 else 0

    print(f"Precision for threshold={threshold} and num_perm={num_perm}: {precision}")

    return {
        "threshold": threshold,
        "num_perm": num_perm,
        "build_time": build_time,
        "query_time": query_time,
        "total_time": total_time,
        "precision": precision,
        "lsh": lsh,
        "minhashes": minhashes,
        "lsh_results": lsh_results,
    }


#===== PARAMETER GRID =====#

# (threshold, num_perm) pairs, in the order they were originally evaluated
LSH_CONFIGS = [
    (0.8, 16),
    (0.9, 32),
    (0.9, 64),
    (0.7, 32),
    (0.7, 64),
    (0.5, 16),
    (0.5, 64),
]

# Only the scalar metrics of each run are collected; the index and signatures of a
# finished run are released when `experiment` is rebound, as before.
SUMMARY_KEYS = ("threshold", "num_perm", "build_time", "query_time", "total_time", "precision")
lsh_summary = []

for threshold, num_perm in LSH_CONFIGS:
    experiment = run_lsh_experiment(threshold, num_perm, train_x, test_x, indices)
    lsh_summary.append({key: experiment[key] for key in SUMMARY_KEYS})

# Keep the notebook-level names the per-configuration cells used to leave behind
# (they hold the state of the last configuration), in case later cells rely on them.
lsh = experiment["lsh"]
minhashes = experiment["minhashes"]
lsh_results = experiment["lsh_results"]
build_time = experiment["build_time"]
query_time = experiment["query_time"]
total_time = experiment["total_time"]
precision = experiment["precision"]

# Side-by-side comparison of all configurations
pd.DataFrame(lsh_summary)