In [1]:
import pandas as pd
from pathlib import Path

In [2]:
current_dir = Path.cwd()

In [3]:
data_path = current_dir.parent.parent / 'Data_Collection' / 'isw_scraper' / 'isw_data.txt'

In [4]:
df_test = pd.read_csv(data_path, delimiter="\t")
print(df_test.head())

          ISW RUSSIAN OFFENSIVE CAMPAIGN ASSESSMENTS
0  SOURCE: https://www.understandingwar.org/backg...
1  Russian Offensive Campaign Assessment, March 2...
2  ----------------------------------------------...
3  SOURCE: https://www.understandingwar.org/backg...
4  Russian Offensive Campaign Assessment, March 2...


In [3]:
df_test

Unnamed: 0,ISW RUSSIAN OFFENSIVE CAMPAIGN ASSESSMENTS
0,SOURCE: https://www.understandingwar.org/backg...
1,"Russian Offensive Campaign Assessment, March 2..."
2,----------------------------------------------...
3,SOURCE: https://www.understandingwar.org/backg...
4,"Russian Offensive Campaign Assessment, March 2..."
...,...
2392,"Russian Offensive Campaign Assessment, January..."
2393,----------------------------------------------...
2394,SOURCE: https://www.understandingwar.org/backg...
2395,"Russian Offensive Campaign Assessment, Februar..."


In [4]:
with open("isw_data.txt", "r", encoding="utf-8") as file:
    lines = [line.strip() for line in file if line.strip()] 

lines = lines[1:]

while len(lines) % 3 != 0:
    lines.append(None)

data = [lines[i:i+3] for i in range(0, len(lines), 3)]

df = pd.DataFrame(data, columns=["Col1", "Col2", "Col3"])

df

Unnamed: 0,Col1,Col2,Col3
0,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 2...",----------------------------------------------...
1,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 2...",----------------------------------------------...
2,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 2...",----------------------------------------------...
3,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 1...",----------------------------------------------...
4,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 1...",----------------------------------------------...
...,...,...,...
794,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January...",----------------------------------------------...
795,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January...",----------------------------------------------...
796,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January...",----------------------------------------------...
797,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January...",----------------------------------------------...


In [5]:
df = df.drop(columns=["Col3"])
df

Unnamed: 0,Col1,Col2
0,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 2..."
1,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 2..."
2,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 2..."
3,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 1..."
4,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 1..."
...,...,...
794,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January..."
795,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January..."
796,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January..."
797,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January..."


In [6]:
split_sentence = "References to all sources used are provided in the endnotes of each update."

df[["Col2", "Col3"]] = df["Col2"].str.split(split_sentence, n=1, expand=True)

df["Col3"] = split_sentence + df["Col3"].fillna("")

df

Unnamed: 0,Col1,Col2,Col3
0,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 2...",References to all sources used are provided in...
1,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 2...",References to all sources used are provided in...
2,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 2...",References to all sources used are provided in...
3,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 1...",References to all sources used are provided in...
4,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, March 1...",References to all sources used are provided in...
...,...,...,...
794,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January...",References to all sources used are provided in...
795,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January...",References to all sources used are provided in...
796,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January...",References to all sources used are provided in...
797,SOURCE: https://www.understandingwar.org/backg...,"Russian Offensive Campaign Assessment, January...",References to all sources used are provided in...


In [7]:
import re
from collections import Counter

In [8]:
url_pattern = re.compile(r"https?://[^\s]+|www\.[^\s]+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}/[^\s]*")

In [9]:
all_urls = df["Col3"].dropna().apply(lambda x: url_pattern.findall(x)).sum()
url_counts = dict(Counter(all_urls))

In [10]:
top_urls = dict(Counter(all_urls).most_common(10))
print(top_urls)

{'https://suspilne': 2314, 'https://armyinform.com': 1785, 'https://tass': 1527, 'https://armyinform': 1501, 'https://meduza': 1025, 'http://kremlin': 802, 'https://ria': 500, 'https://www.rbc': 486, 'https://www.kommersant': 265, 'https://www.interfax': 233}


In [11]:
df_urls = pd.DataFrame(list(url_counts.items()), columns=["URL", "Count"])

In [12]:
df_urls

Unnamed: 0,URL,Count
0,https://www.youtube.com/watch?v=acvu2LBumGo,1
1,https://eur-lex.europa,3
2,https://static.rusi.org/201907_op_surkov_leaks...,1
3,https://isw.pub/UkrWar022725,7
4,http://www.kremlin,125
...,...,...
85376,https://t.me/boris_rozhin/74205;,1
85377,https://t.me/luhanskaVTSA/7772,1
85378,https://t.me/RtrDonetsk/13635,2
85379,https://t.me/andriyshTime/5472,1


In [13]:
telegram_df = df_urls[df_urls["URL"].str.contains("t.me", na=False)]
top_20_telegram = telegram_df.nlargest(20, "Count")
top_20_telegram

Unnamed: 0,URL,Count
2390,https://www.moscowtimes,42
2524,https://www.themoscowtimes,33
129,https://www.currenttime,13
40176,https://t.me/osirskiy/670,13
64870,https://t.me/wargonzo/14641,12
73922,https://t.me/superdolgov/9446,12
3182,https://t.me/GeneralStaffZSU/21384;,11
3183,https://t.me/GeneralStaffZSU/21352;,11
3184,https://t.me/GeneralStaffZSU/21349;,11
8116,https://t.me/Khortytsky_wind/4075,11


In [14]:
def extract_channel(url):
    if "t.me/" in url:
        return url.split("t.me/")[1].split("/")[0] 
    return None

df_urls["Channel"] = df_urls["URL"].apply(extract_channel)
channel_counts = df_urls.groupby("Channel")["Count"].sum().reset_index()
top_20_channels = channel_counts.nlargest(20, "Count")

top_20_channels

Unnamed: 0,Channel,Count
782,dva_majors,5005
1614,rybar,4694
1191,mod_russia,4455
1968,wargonzo,4383
312,RVvoenkor,3765
115,DnevnikDesantnika,3581
635,boris_rozhin,3494
1779,tass_agency,3187
1921,voenkorKotenok,2373
1505,readovkanews,1396
