In [1]:
import pandas as pd
import numpy as np
import sklearn
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
%pip install scikit-learn
!pip install numpy==1.23.5
!pip install gensim
nltk.download('punkt_tab')
nltk.download('stopwords')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### **1. Load the dataset into a DataFrame.**

---



In [2]:
df = pd.read_csv('isw_data.csv')

### **2. Clean the text data**

---


In [None]:
def clean_text(text):
    if pd.isnull(text):
        return ''

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'isw.?s interactive map of the russian invasion of ukraine.*?static maps present in this report', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Click.*?(?=[A-Z]|$)', '', text, flags=re.DOTALL)
    text = re.sub(r'^.*?\b(am|pm)\b\s*(et)?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove punctuation and extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()



df['Bold Text'] = df['Bold Text'].apply(clean_text)

def cut_before_pm(text):
     if isinstance(text, str):
         lowered = text.lower()
         index = lowered.find("pm")
         if index != -1:
             return text[index + 7:]
     return text
df["Bold Text"][0:14] = df["Bold Text"][0:14].apply(cut_before_pm)

def cut_before_pm_2(text):
     if isinstance(text, str):
         lowered = text.lower()
         index = lowered.find("pm")
         if index != -1:
             return text[index + 6:]
     return text
df["Bold Text"][14:] = df["Bold Text"][14:].apply(cut_before_pm_2)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["Bold Text"][0:14] = df["Bold Text"][0:14].apply(cut_before_pm)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

### **3. Define a set of stop words to be removed from the text.**
---

In [None]:
stop_words = set([
    "a", "an", "the", "and", "or", "but", "if", "so", "then",
    "to", "of", "in", "on", "at", "by", "with", "from", "into", "over",
    "under","is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "can",
    "could", "should", "may", "might", "must",
    "this", "that", "these", "those",
    "very", "too", "just", "only", "still",
    "some", "any", "every", "each",
    "what", "who", "where", "when", "why", "how"
])

def clean_text(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df['Bold Text'] = df['Bold Text'].apply(clean_text)

###**The function processes a given text by tokenizing it, removing stop words, filtering out non-alphabetic characters, and then reducing the remaining words to their root forms using stemming.**

In [None]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_and_stem(text):
    words = nltk.word_tokenize(text)
    filtered_words = [
        stemmer.stem(word)
        for word in words
        if word not in stop_words and word.isalpha()
    ]
    return ' '.join(filtered_words)

df['Bold Text'] = df['Bold Text'].apply(clean_and_stem)

### **4. Remove numbers.**
---

In [None]:
def remove_numbers(text):
        if isinstance(text, str):
          return re.sub(r'\\d+', '', text)
        return text

        df['Bold Text'] = df['Bold Text'].apply(remove_numbers)

### **5. Convert the cleaned text into TF-IDF features.**
---

In [None]:
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(df['Bold Text'])
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df.tail()


Unnamed: 0,abil,abl,accept,access,accus,achiev,acknowledg,across,act,action,...,world,worth,yar,year,yet,yevgeni,zaporizhia,zaporizhzhia,zelenski,znpp
1075,0.0,0.0,0.0,0.0,0.0,0.135618,0.0,0.0,0.0,0.0,...,0.0,0.0,0.057641,0.0,0.0,0.0,0.017323,0.0,0.058664,0.0
1076,0.0,0.0,0.07849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.029126,0.0,0.0,0.0,0.035012,0.0,0.0,0.0
1077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02745,0.0,0.0,0.0,0.049498,0.0,0.055875,0.0
1078,0.0,0.0,0.0,0.0,0.0,0.121909,0.0,0.0,0.0,0.0,...,0.076181,0.0,0.025907,0.0,0.0,0.0,0.0,0.0,0.052734,0.0
1079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.025946,0.0,0.0,0.0,0.015595,0.0,0.0,0.0


### **6. Calculate the importance of each word based on TF-IDF scores.**
---

In [None]:
word_importance = np.sum(X_tfidf.toarray(), axis=0)
word_importance_df = pd.DataFrame({
    'word': vectorizer.get_feature_names_out(),
    'importance': word_importance
})

word_importance_df = word_importance_df.sort_values(by='importance', ascending=False)
word_importance_df.head(10)

Unnamed: 0,word,importance
784,russian,446.05406
931,ukrainian,178.216601
371,forc,129.176451
608,oblast,118.726785
930,ukrain,112.421747
300,effort,99.342007
756,report,93.264992
563,militari,90.65317
607,object,87.980492
194,continu,81.523092


### **7. Calculate the mean vector for each article.**
---

In [None]:
article_vectors = X_tfidf.mean(axis=1)
article_vectors_df = pd.DataFrame(article_vectors, columns=[f'feat_{i}' for i in range(article_vectors.shape[1])])
article_vectors_df.head()

Unnamed: 0,feat_0
0,0.007762
1,0.007433
2,0.006165
3,0.007813
4,0.006585


In [None]:
df['text_vector'] = article_vectors_df.astype('float64').values

df[['text_vector']].head()

Unnamed: 0,text_vector
0,0.007762
1,0.007433
2,0.006165
3,0.007813
4,0.006585


In [None]:
df = df.drop('Bold Text', axis=1)
df

Unnamed: 0,Year,Month,Day,text_vector
0,2022,2,28,0.007762
1,2022,3,1,0.007433
2,2022,3,2,0.006165
3,2022,3,3,0.007813
4,2022,3,4,0.006585
...,...,...,...,...
1075,2025,2,26,0.010409
1076,2025,2,27,0.009256
1077,2025,2,28,0.009872
1078,2025,3,1,0.010219


In [None]:
df.loc[250:260]

Unnamed: 0,Year,Month,Day,text_vector
250,2022,11,8,0.010252
251,2022,11,9,0.00968
252,2022,11,10,0.010192
253,2022,11,11,0.008848
254,2022,11,12,0.010554
255,2022,11,13,0.008273
256,2022,11,14,0.010267
257,2022,11,15,0.008173
258,2022,11,16,0.009345
259,2022,11,17,0.00879


In [None]:
df.to_csv('isw_data_without_num_stopword.csv', index=False)