In [12]:
import re
import nltk
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score, classification_report


 #Download NLTK data files (run once)
# nltk.download('reuters')
# nltk.download('punkt')
# nltk.download('stopwords')

# Get a list of file IDs in the Reuters dataset
file_ids = reuters.fileids()


#Extract articles 
num_articles = 10000
articles = [reuters.raw(file_id) for file_id in file_ids[:num_articles]]

# Define preprocessing function
def preprocess(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = word_tokenize(text)  # Tokenize text
    stop_words = set(stopwords.words('english'))  # Get English stop words
    words = [word for word in words if word not in stop_words]  # Remove stop words
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)


# Preprocess articles with stemming
processed_articles = [preprocess(article) for article in articles]


# Split data into training, validation, and testing sets
train_articles, test_articles = train_test_split(processed_articles, test_size=0.5, random_state=42)
valid_articles, test_articles = train_test_split(test_articles, test_size=0.7, random_state=42)

# Create TF-IDF vectorizer and fit on training data
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(articles)

# Transform validation and test data
X_valid = vectorizer.transform(valid_articles)
X_test = vectorizer.transform(test_articles)

# Get feature names (keywords)
keywords = vectorizer.get_feature_names_out()

# Sum the TF-IDF scores for each word across all articles in the training set
tfidf_scores = np.sum(X_train.toarray(), axis=0)

# Get keywords and their scores
keyword_scores = dict(zip(keywords, tfidf_scores))

# Sort keywords by their scores in descending order
sorted_keywords = sorted(keyword_scores.items(), key=lambda item: item[1], reverse=True)

# Get top N keywords (e.g., top 10)
top_n = 10
hot_keywords = sorted_keywords[:top_n]


# Print sizes of the splits
print(f"Training set size: {len(train_articles)} articles")
print(f"Validation set size: {len(valid_articles)} articles")
print(f"Testing set size: {len(test_articles)} articles")


# Convert categories to numerical format
from sklearn.preprocessing import LabelEncoder
categories = [reuters.categories(file_id)[0] for file_id in file_ids[:num_articles]]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(categories)


from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(X):
    return cosine_similarity(X)

# Compute cosine similarity matrix
cosine_sim_matrix = compute_cosine_similarity(X_train)

# Example: Similarity between first article and others
def get_similar_articles(index, sim_matrix, top_n=500):
    sim_scores = list(enumerate(sim_matrix[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Exclude the article itself
    return sim_scores

# Example: Get top 5 similar articles to the first article
similar_articles = get_similar_articles(0, cosine_sim_matrix, top_n=500)
#print(f"Top similar articles to article 0: {similar_articles}")


from sklearn.metrics import accuracy_score

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.5, random_state=42)

# Simple classification based on closest similarity (K-Nearest Neighbors for simplicity)
from sklearn.neighbors import KNeighborsClassifier

# Use cosine similarity as distance metric
knn = KNeighborsClassifier(n_neighbors=10, metric='cosine')
knn.fit(X_train, y_train)

# Predict on test set
y_test_pred = knn.predict(X_test)

# Evaluate model accuracy
KNN_test_accuracy = accuracy_score(y_test, y_test_pred)

# apply RandomForest
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=180, random_state=42)
rf_model.fit(X_train, y_train)
y_test_pred = rf_model.predict(X_test)
RANDOM_test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy with Random Forest: {RANDOM_test_accuracy:.2f}")



# Print accuracy
print(f"KNN_Test_Accuracy: {KNN_test_accuracy:.2f}")
from sklearn.metrics import classification_report, confusion_matrix

# print(confusion_matrix(y_test, y_test_pred))



print(classification_report(y_test, y_test_pred))



Training set size: 5000 articles
Validation set size: 1500 articles
Testing set size: 3500 articles
Test Accuracy with Random Forest: 0.84
KNN_Test_Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.78      0.98      0.87      1076
           1       1.00      0.04      0.07        28
           2       0.29      0.10      0.15        20
           3       0.86      0.44      0.58        43
           4       0.93      0.47      0.62        30
           5       0.00      0.00      0.00         2
           6       1.00      0.83      0.91        41
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         3
           9       0.91      0.92      0.92        65
          10       0.79      0.50      0.61        30
          12       0.85      0.80      0.83       107
          13       1.00      0.27      0.43        22
          15       0.83      0.68      0.75        50
          16       0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
