# In [None]:
import pandas as pd
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt

# Read the processed article table: first CSV column becomes the index,
# and at most 19351 data rows are loaded
df = pd.read_csv('cleandata_processed.csv', nrows=19351, index_col=0)

# Fetch the NLTK corpora used below (stopword list and WordNet)
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared normaliser instances reused by the helper functions further down
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Extra stopwords layered on top of NLTK's English list.
# Each string literal below is a whitespace-separated group of words; adjacent
# literals are concatenated, which is why every group ends with a space.
additional_stopwords = set(
    (
        # Hedges, connectives and other low-content filler
        "according actually almost already although always another "
        "anything around away believe better business certain "
        "comes concerning consider different enough especially "
        "everyone everything exactly finally following happens however "
        "important includes including information instead involves "
        "least maybe might much often once others perhaps "
        "possible probably provides rather recent seems several "
        "something sometimes "
        # URL residue
        "https com "
        # Number words
        "one two three four five six seven eight nine ten "
        # Generic article vocabulary
        "read new old also people person "
        "comment first last time said like says could social media "
        # Month names
        "january february march april may june july august "
        "september october november december "
        # Weekday names
        "monday tuesday wednesday thursday friday saturday sunday "
        # Calendar units and relative-day words
        "day week month year today tomorrow yesterday "
        # Common verbs and generic nouns
        "get go back make way come keep take put "
        "thing think look see know use want need "
        # Evaluative and size words, plus 'experience'
        "good bad great best worst well "
        "little big small large young experience"
    ).split()
)

# Full stopword set: the custom words merged with NLTK's English stopword list
all_stopwords = additional_stopwords | set(stopwords.words('english'))

# Turn one raw article body into a list of cleaned, stopword-free tokens
def preprocess_text(text):
    """Lowercase, strip noise from, and tokenize *text*.

    The cleaning steps run in a fixed order: digits are deleted, then
    ``http(s)://`` and ``www.`` URLs, then every run of non-word characters
    is collapsed to a single space (underscores count as word characters
    and are kept). The result is split on whitespace and any token found
    in ``all_stopwords`` is dropped.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example a NaN
            cell) is treated as having no content.

    Returns:
        list: The remaining tokens in their original order; ``[]`` for
        non-string input.
    """
    if not isinstance(text, str):
        return []

    cleaned = text.lower()
    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = (
        (r'\d+', ''),                     # digits
        (r'https?://\S+|www\.\S+', ''),   # URLs
        (r'\W+', ' '),                    # punctuation / non-word runs
        (r'\s+', ' '),                    # repeated whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return [token for token in cleaned.split() if token not in all_stopwords]

# Tokenize every article body; each 'Processed_Words' cell holds a list of tokens
df['Processed_Words'] = df['Article_Body'].map(preprocess_text)

# Stem a token list with the shared module-level stemmer
def stem_words(words):
    """Return a new list with ``stemmer.stem`` applied to each item of *words*.

    Args:
        words: Iterable of word strings.

    Returns:
        list: One stem per input word, in input order.
    """
    return [stemmer.stem(token) for token in words]

# Lemmatize a token list with the shared module-level lemmatizer
def lemmatize_words(words):
    """Return a new list with ``lemmatizer.lemmatize`` applied to each item.

    No part-of-speech argument is passed, so the lemmatizer's default applies.

    Args:
        words: Iterable of word strings.

    Returns:
        list: One lemma per input word, in input order.
    """
    return [lemmatizer.lemmatize(token) for token in words]

# Normalize the tokens: stem first, then lemmatize the resulting stems.
# NOTE(review): lemmatizing already-stemmed tokens may have little effect —
# confirm that running both passes is intended.
df['Processed_Words'] = (
    df['Processed_Words']
    .apply(stem_words)
    .apply(lemmatize_words)
)

# The TF-IDF vectorizer takes one string per document, so re-join with spaces
df['Processed_Words_Str'] = df['Processed_Words'].apply(' '.join)

# Compute TF-IDF scores over a vocabulary capped at 1000 terms
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(df['Processed_Words_Str'])
feature_names = vectorizer.get_feature_names_out()

# Dense view of the scores as a DataFrame. It reuses df's index: df was loaded
# with index_col=0, so its labels are not guaranteed to be 0..n-1, and a
# default RangeIndex here would misalign any later label-based join/assignment.
tfidf_dense = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(tfidf_dense, columns=feature_names, index=df.index)


def _top_terms(scores, limit=10):
    """Return up to *limit* vocabulary terms with the highest positive score.

    Args:
        scores: 1-D array of TF-IDF weights for one document, ordered like
            ``feature_names``.
        limit: Maximum number of terms to return.

    Returns:
        list: Terms sorted from highest to lowest score. Terms whose score is
        zero do not occur in the document and are left out, so the list may
        be shorter than *limit* (or empty).
    """
    ranked = np.argsort(scores)[::-1][:limit]
    return [feature_names[j] for j in ranked if scores[j] > 0]


# Identify the most relevant words for each article. The Series carries df's
# own index, so each list lands on the row it was computed from.
df['Top_Words'] = pd.Series(
    [_top_terms(row) for row in tfidf_dense],
    index=df.index,
    dtype=object,
)

# Print the top words for the first few articles
print("\nTop words for each article:")
print(df[['Article_Body', 'Top_Words']].head())

# Corpus-level importance: sum each term's TF-IDF weight over all articles
tfidf_scores = tfidf_matrix.sum(axis=0).A1

# One row per vocabulary term, ranked from highest to lowest total weight
tfidf_scores_df = (
    pd.DataFrame({'Word': feature_names, 'TF-IDF Score': tfidf_scores})
    .sort_values(by='TF-IDF Score', ascending=False)
)
