In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Fill in 'uncategorized' rows of the dataset by training a TF-IDF + KNN text
# classifier on the rows that already have a category, then save the result.

# Load the dataset
file_path_new = '/path/to/your/updated_single_category.csv'
df_updated = pd.read_csv(file_path_new)

# Rows without a Description cannot be vectorized (TfidfVectorizer rejects NaN
# documents), so they are kept out of both training and prediction.
has_description = df_updated['Description'].notna()
is_uncategorized = df_updated['Single_Category'] == 'uncategorized'
# A missing label compares unequal to 'uncategorized', so it has to be
# excluded explicitly; otherwise NaN would be fed to the classifier as if it
# were a real category.
has_label = df_updated['Single_Category'].notna() & ~is_uncategorized

train_mask = has_description & has_label
predict_mask = has_description & is_uncategorized

# Prepare the data: categorized rows (training data) and uncategorized rows (to predict)
categorized_df = df_updated[train_mask]
uncategorized_df = df_updated[predict_mask]

if categorized_df.empty:
    raise ValueError(
        'No categorized rows with a description are available to train on.'
    )

# Using 'Description' as the feature and 'Single_Category' as the label for training.
# astype(str) guards against descriptions that pandas parsed as numbers.
X_train = categorized_df['Description'].astype(str)
y_train = categorized_df['Single_Category']
X_test = uncategorized_df['Description'].astype(str)

# Convert text data to TF-IDF feature vectors
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency
X_train_tfidf = vectorizer.fit_transform(X_train)

# Train a K-Nearest Neighbors (KNN) classifier.
# n_neighbors may not exceed the number of training samples, so clamp it for
# very small training sets.
knn = KNeighborsClassifier(n_neighbors=min(5, X_train_tfidf.shape[0]))
knn.fit(X_train_tfidf, y_train)

if uncategorized_df.empty:
    # Nothing to predict: transform/predict reject zero-sample input, so skip them.
    X_test_tfidf = None
    y_pred = y_train.iloc[:0].to_numpy()
else:
    # Predict the categories for uncategorized rows
    X_test_tfidf = vectorizer.transform(X_test)
    y_pred = knn.predict(X_test_tfidf)

    # Assign the predicted categories back to the same rows they were computed
    # for; y_pred is in the row order selected by predict_mask.
    df_updated.loc[predict_mask, 'Single_Category'] = y_pred

# Save the fully updated dataset with new categories assigned
final_output_knn_path = '/path/to/your/final_updated_with_categories_knn.csv'
df_updated.to_csv(final_output_knn_path, index=False)

# Check how many rows are still labeled as 'uncategorized' after the prediction
# (rows with a missing Description are never predicted and remain counted here).
remaining_uncategorized_final_knn = df_updated[df_updated['Single_Category'] == 'uncategorized']
print(f'Remaining uncategorized rows: {remaining_uncategorized_final_knn.shape[0]}')
