In [39]:
import shutil
shutil.unpack_archive('/content/BBC News Test.csv.zip')
shutil.unpack_archive('/content/BBC News Train.csv.zip')

In [40]:
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string
import nltk
nltk.download('stopwords')
import warnings
warnings.filterwarnings('ignore')

# Load the data
df_train = pd.read_csv('/content/BBC News Train.csv')
df_test = pd.read_csv('/content/BBC News Test.csv')

# Combine train and test data for consistent preprocessing
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Text preprocessing
stop_words = stopwords.words('english')
punctuation = string.punctuation

df_combined['Text'] = df_combined['Text'].apply(lambda x: ''.join([i for i in x if i not in punctuation]).split())
df_combined['Text'] = df_combined['Text'].apply(lambda x: ' '.join([i for i in x if i not in stop_words]))

# Encode categories consistently
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df_combined['Category'])

# Split data back into train and test
X_train, X_test, y_train, y_test = train_test_split(df_combined['Text'], y_encoded, test_size=0.2, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=10000, max_df=0.8, min_df=5, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fine-tune NMF parameters
n_latent_factors = 50
nmf_model = NMF(n_components=n_latent_factors, init='nndsvd', random_state=42)

# Fit and transform on training data
W_train = nmf_model.fit_transform(X_train_tfidf)
H_train = nmf_model.components_

# Transform test data
W_test = nmf_model.transform(X_test_tfidf)

# Predict category labels for test data
predicted_categories = np.argmax(W_test, axis=1)

# Create a DataFrame with index as document IDs and predicted categories
result_df = pd.DataFrame({'Predicted_Category': predicted_categories})

# Save the DataFrame to a CSV file
result_df.to_csv('/content/submission.csv', index=False)

print("Predictions saved to predicted_categories.csv")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Predictions saved to predicted_categories.csv
