***Text datasets***

In [1]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m32.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=3a55acd4129088f87669f91d123c1c168c0aa12d3a96e2e29b42098ac73d118e
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


***Train First Dataset***

In [2]:
import pandas as pd
import re
from langdetect import detect

file1="/content/row_data.csv"

df1 = pd.read_csv(file1)

#Combine title + description
df1['text'] = df1['title'].fillna('') + ' ' + df1['description'].fillna('')

def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

df1['text_clean'] = df1['text'].apply(clean_text)

def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

df1 = df1[df1['text_clean'].apply(is_english)].copy()

df1.reset_index(drop=True, inplace=True)

print(f"dataset size: {len(df1)}")
print(df1[['text', 'text_clean']].head())


dataset size: 76172
                                                text  \
0  Chinese delegation to Sri Lanka inks SME pacts...   
1  Minister invites UAE for investments in Sri La...   
2  List state - owned enterprises to increase tra...   
3  Sri Lanka plans to set up fabric processing zo...   
4    Boosting exports , counter - terror joint work    

                                          text_clean  
0  chinese delegation to sri lanka inks sme pacts...  
1  minister invites uae for investments in sri lanka  
2  list state owned enterprises to increase trans...  
3  sri lanka plans to set up fabric processing zo...  
4         boosting exports counter terror joint work  


In [3]:
df1.head()

Unnamed: 0,title,description,source,published_at,url,group_name,year,month,day_of_week,is_festival,text,text_clean
0,Chinese delegation to Sri Lanka inks SME pacts...,,,2019-12-10 14:30:00+00:00,https://www.lankabusinessonline.com/chinese-de...,Economy,2019,12,Tuesday,0,Chinese delegation to Sri Lanka inks SME pacts...,chinese delegation to sri lanka inks sme pacts...
1,Minister invites UAE for investments in Sri Lanka,,,2019-12-04 21:30:00+00:00,http://www.dailynews.lk/2019/12/05/local/20483...,Economy,2019,12,Wednesday,0,Minister invites UAE for investments in Sri La...,minister invites uae for investments in sri lanka
2,List state - owned enterprises to increase tra...,,,2019-12-12 00:15:00+00:00,http://www.dailynews.lk/2019/12/12/finance/205...,Economy,2019,12,Thursday,0,List state - owned enterprises to increase tra...,list state owned enterprises to increase trans...
3,Sri Lanka plans to set up fabric processing zo...,,,2019-12-05 08:45:00+00:00,http://www.xinhuanet.com/english/2019-12/05/c_...,Economy,2019,12,Thursday,0,Sri Lanka plans to set up fabric processing zo...,sri lanka plans to set up fabric processing zo...
4,"Boosting exports , counter - terror joint work",,,2019-12-16 00:00:00+00:00,http://www.dailynews.lk/2019/12/16/local/20577...,Economy,2019,12,Monday,0,"Boosting exports , counter - terror joint work",boosting exports counter terror joint work


In [4]:
cols=['title','description','source','published_at','url','group_name','year','month','day_of_week','is_festival','text','text_clean']
for i in cols:
  print(i,len(df1[i].unique()))
  print(df1[i].isnull().sum())

title 56045
0
description 1
76172
source 1
76172
published_at 46365
0
url 76172
0
group_name 4
0
year 7
0
month 12
0
day_of_week 7
0
is_festival 2
0
text 56045
0
text_clean 54287
0


In [5]:
df1 = df1.drop(columns=["description", "source", "url"])

In [6]:
df1.head()

Unnamed: 0,title,published_at,group_name,year,month,day_of_week,is_festival,text,text_clean
0,Chinese delegation to Sri Lanka inks SME pacts...,2019-12-10 14:30:00+00:00,Economy,2019,12,Tuesday,0,Chinese delegation to Sri Lanka inks SME pacts...,chinese delegation to sri lanka inks sme pacts...
1,Minister invites UAE for investments in Sri Lanka,2019-12-04 21:30:00+00:00,Economy,2019,12,Wednesday,0,Minister invites UAE for investments in Sri La...,minister invites uae for investments in sri lanka
2,List state - owned enterprises to increase tra...,2019-12-12 00:15:00+00:00,Economy,2019,12,Thursday,0,List state - owned enterprises to increase tra...,list state owned enterprises to increase trans...
3,Sri Lanka plans to set up fabric processing zo...,2019-12-05 08:45:00+00:00,Economy,2019,12,Thursday,0,Sri Lanka plans to set up fabric processing zo...,sri lanka plans to set up fabric processing zo...
4,"Boosting exports , counter - terror joint work",2019-12-16 00:00:00+00:00,Economy,2019,12,Monday,0,"Boosting exports , counter - terror joint work",boosting exports counter terror joint work


In [7]:
import pandas as pd
from collections import Counter
import re

all_text = " ".join(df1['text_clean'].astype(str))

words = re.findall(r'\b[a-z]{3,}\b', all_text)

word_counts = Counter(words)

top_words = word_counts.most_common(200)

print("Top 200 most repeated words:")
for w, c in top_words:
    print(w, ":", c)


Top 200 most repeated words:
sri : 45398
lanka : 41547
the : 14886
for : 11478
and : 9884
with : 6241
india : 6194
covid : 5339
news : 4755
lankan : 4613
new : 4149
from : 4110
president : 3236
crisis : 3159
china : 3120
asia : 2997
economic : 2956
latest : 2806
world : 2737
colombo : 2509
over : 2310
minister : 2090
island : 2073
daily : 2044
says : 1962
after : 1946
indian : 1864
cases : 1696
today : 1679
mirror : 1648
first : 1644
imf : 1616
breaking : 1526
xinhua : 1480
amid : 1469
schools : 1465
will : 1395
debt : 1376
port : 1354
virus : 1326
trade : 1320
economy : 1313
south : 1306
bank : 1292
chinese : 1288
business : 1259
foreign : 1248
day : 1235
international : 1195
its : 1186
coronavirus : 1176
year : 1156
education : 1142
lankaweb : 1133
tourism : 1123
more : 1107
hit : 1107
english : 1102
not : 1060
pakistan : 1059
health : 1056
government : 1037
support : 1029
biz : 1006
national : 994
global : 984
visit : 971
city : 970
adaderana : 965
against : 931
people : 929
govt : 

In [8]:
UP_WORDS = [
    'achievement', 'agreement reached', 'aid received', 'awards', 'assistance',
    'better', 'boost', 'budget', 'business', 'construction begins',
    'deal signed', 'debt restructuring progress', 'development', 'economy',
    'employment rise', 'expand', 'expansion', 'exports', 'exports rise',
    'fdi', 'friendship', 'foreign', 'foreign investment', 'gain', 'growth',
    'imf approval', 'imf tranche released', 'improvement', 'inauguration',
    'increase', 'ties', 'inflation decreases', 'infrastructure development',
    'interest rate cut', 'investment', 'jobs created', 'launched',
    'loan granted', 'market', 'new project', 'opening ceremony', 'positive',
    'project', 'prosperity', 'rate cut', 'record', 'recovery', 'reform',
    'relief', 'remittances increase', 'resolution', 'rise', 'rupee strengthens',
    'salary increase', 'settlement', 'stability', 'strengthen', 'successful',
    'support', 'support package', 'surplus', 'tourism', 'tourism recovery',
    'tourist arrivals', 'trade', 'upgrade', 'win'
]


DOWN_WORDS = [
    'arrest', 'bankruptcy', 'collapse', 'controversy', 'corruption',
    'covid', 'coronavirus', 'crackdown', 'crisis', 'currency depreciation',
    'debt', 'debt crisis', 'death', 'deaths', 'dead', 'decline', 'default',
    'deficit', 'disaster', 'disruption', 'drop', 'export drop', 'fall',
    'fiscal', 'flooding', 'cases', 'flood', 'fraud', 'fuel', 'fuel shortage',
    'hit', 'import restrictions', 'inflation', 'inflation high', 'instability',
    'interest rate hike', 'landslide', 'loss', 'losses', 'mismanagement',
    'outage', 'pandemic', 'power cut', 'power failure', 'pressure',
    'price hike', 'protest', 'protests', 'resignation', 'rupee weakens',
    'shortage', 'storm', 'slowdown', 'storm warning', 'strike', 'tension',
    'unemployment', 'unrest', 'violence', 'warn'
]



In [9]:
def classify_keywords(text):
    text = text.lower()
    if any(word in text for word in UP_WORDS):
        return "up"
    elif any(word in text for word in DOWN_WORDS):
        return "down"
    else:
        return "neutral"

df1["label"] = df1["text_clean"].apply(classify_keywords)


In [10]:
df1['published_at'] = pd.to_datetime(df1['published_at'], errors='coerce')

# Create a new column with only the day of the month
df1['day_only'] = df1['published_at'].dt.day

df1.drop(columns=['published_at'], inplace=True)

In [11]:
df1['month-day'] = df1['month'].apply(lambda x: f"{int(x):02d}") + '-' + df1['day_only'].apply(lambda x: f"{int(x):02d}")
df1.drop(columns=['month', 'day_only'], inplace=True)

In [12]:
df1 = df1.drop(['is_festival','text'], axis=1)
df1.head()

Unnamed: 0,title,group_name,year,day_of_week,text_clean,label,month-day
0,Chinese delegation to Sri Lanka inks SME pacts...,Economy,2019,Tuesday,chinese delegation to sri lanka inks sme pacts...,neutral,12-10
1,Minister invites UAE for investments in Sri Lanka,Economy,2019,Wednesday,minister invites uae for investments in sri lanka,up,12-04
2,List state - owned enterprises to increase tra...,Economy,2019,Thursday,list state owned enterprises to increase trans...,up,12-12
3,Sri Lanka plans to set up fabric processing zo...,Economy,2019,Thursday,sri lanka plans to set up fabric processing zo...,neutral,12-05
4,"Boosting exports , counter - terror joint work",Economy,2019,Monday,boosting exports counter terror joint work,up,12-16


In [13]:
tourism_season = ["12","01","02","03","04"]

def seasonal_rules(row):
    month = row["month-day"].split("-")[0]

    if row["label"] == "neutral" and month in tourism_season and "tourism" in row["text_clean"]:
        return "up"

    return row["label"]

df1['label'] = df1.apply(seasonal_rules, axis=1)

In [14]:
crisis_months = {
    2019: {"04","05","06","07"},
    2020: {"03","04","05","06","07","08"},
    2022: {"03","04","05","06","07","08"},
    2023: {"08","09","10","11","12"}
}

def crisis_rule(row):
    if row["label"] != "neutral":
        return row["label"]

    year = int(row["year"])
    month = row["month-day"].split("-")[0]
    group = str(row["group_name"]).lower()

    if year == 2019 and month in crisis_months[2019]:
        if "Education" in group or "Culture" in group:
            return "down"

    if year == 2020 and month in crisis_months[2020]:
        if "health" in group or "economy" in group or "Culture" in group:
            return "down"

    if year == 2022 and month in crisis_months[year]:
        if "economy" in group:
            return "down"

    if year == 2023 and month in crisis_months[year]:
        if "Education" in group or "economy" in group or "Culture" in group:
            return "down"

    return "neutral"

df1['label'] = df1.apply(crisis_rule, axis=1)

In [15]:
df1 = df1.drop(['title'],axis=1)

In [16]:
df1.head()

Unnamed: 0,group_name,year,day_of_week,text_clean,label,month-day
0,Economy,2019,Tuesday,chinese delegation to sri lanka inks sme pacts...,neutral,12-10
1,Economy,2019,Wednesday,minister invites uae for investments in sri lanka,up,12-04
2,Economy,2019,Thursday,list state owned enterprises to increase trans...,up,12-12
3,Economy,2019,Thursday,sri lanka plans to set up fabric processing zo...,neutral,12-05
4,Economy,2019,Monday,boosting exports counter terror joint work,up,12-16


In [17]:
df1.rename(columns={
    'text_clean': 'Content',
    'group_name': 'Catagory'
}, inplace=True)

df1[['Month', 'Day']] = df1['month-day'].str.split('-', expand=True)

df1 = df1[['Content', 'Catagory', 'year', 'Month', 'Day', 'day_of_week', 'label']]

df1.rename(columns={'group_name': 'GroupName', 'label': 'Label','year': 'Year'}, inplace=True)

df1.head()

Unnamed: 0,Content,Catagory,Year,Month,Day,day_of_week,Label
0,chinese delegation to sri lanka inks sme pacts...,Economy,2019,12,10,Tuesday,neutral
1,minister invites uae for investments in sri lanka,Economy,2019,12,4,Wednesday,up
2,list state owned enterprises to increase trans...,Economy,2019,12,12,Thursday,up
3,sri lanka plans to set up fabric processing zo...,Economy,2019,12,5,Thursday,neutral
4,boosting exports counter terror joint work,Economy,2019,12,16,Monday,up


In [18]:
df1.to_csv("df1.csv", index=False)

***Train Second Dataset***

In [19]:
import re
from langdetect import detect
import pandas as pd
import numpy as np

In [20]:
file="/content/news.csv"
df2=pd.read_csv(file)
df2.head()

Unnamed: 0,heading,source,published_date,published_time,content
0,Mobiles to aid straying fisherman,dailymirror,1999-11-30,00:00,
1,Education and Health policies must be formulated,dailymirror,1999-11-30,00:00,
2,Video: Granting US$1 Mn unconstitutional: Ex-CJ,dailymirror,1999-11-30,00:00,
3,UNP MP Ekanayake crosses over to government bench,dailymirror,2009-12-09,10:04,United National Party MP for the Matale distri...
4,Under-Secretary of State Blake arrives in Sri ...,dailymirror,2009-12-09,10:08,The United States Assistant-Secretary of State...


In [21]:
df2.drop(['published_time',],axis=1,inplace=True)

In [22]:
df2['text'] = df2['heading'].fillna('') + ' ' + df2['content'].fillna('')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df2['text_clean'] = df2['text'].apply(clean_text)

def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

df2 = df2[df2['text_clean'].apply(is_english)].copy()

df2['published_date'] = pd.to_datetime(df2['published_date'], errors='coerce')
df2['year'] = df2['published_date'].dt.year
df2['month'] = df2['published_date'].dt.month
df2['day_of_week'] = df2['published_date'].dt.day_name()

df2['day_only'] = df2['published_date'].dt.day
df2['month-day'] = df2['month'].apply(lambda x: f"{int(x):02d}") + '-' + df2['day_only'].apply(lambda x: f"{int(x):02d}")

# Drop columns
df2 = df2.drop(columns=['heading', 'content', 'source', 'published_date', 'month', 'day_only', 'text'])

df2.head()

Unnamed: 0,text_clean,year,day_of_week,month-day
0,mobiles to aid straying fisherman,1999,Tuesday,11-30
1,education and health policies must be formulated,1999,Tuesday,11-30
3,unp mp ekanayake crosses over to government be...,2009,Wednesday,12-09
4,undersecretary of state blake arrives in sri l...,2009,Wednesday,12-09
5,parliament extends emergency by another month ...,2009,Wednesday,12-09


In [23]:
import pandas as pd
from collections import Counter
import re

all_text = " ".join(df2['text_clean'].astype(str))

words = re.findall(r'\b[a-z]{3,}\b', all_text)

word_counts = Counter(words)

top_words = word_counts.most_common(200)

print("Top 200 most repeated words:")
for w, c in top_words:
    print(w, ":", c)

Top 200 most repeated words:
the : 1017736
and : 286470
said : 154954
that : 131429
for : 114635
was : 105419
sri : 80257
with : 78607
from : 70828
had : 67157
lanka : 57790
have : 54773
will : 54184
minister : 54159
were : 53940
has : 53618
this : 52617
not : 49743
are : 49677
been : 48624
government : 44277
who : 41135
police : 38066
president : 33884
also : 33870
they : 33044
today : 30591
would : 30568
colombo : 30453
his : 29111
which : 28932
their : 27679
people : 27449
country : 26517
after : 25646
over : 25499
all : 24835
covid : 23600
there : 22289
two : 21891
new : 21019
against : 19905
out : 19315
its : 19182
should : 18962
during : 18746
when : 18683
but : 18620
court : 18440
general : 18172
national : 17920
ministry : 17880
while : 17326
under : 17176
health : 16979
one : 16492
other : 16310
made : 16245
into : 16021
rajapaksa : 15644
lankan : 15229
former : 15210
public : 15199
arrested : 15141
state : 14967
more : 14940
any : 14594
prime : 14474
yesterday : 14250
such : 

In [24]:
UP_WORDS = [
    'achievement', 'agreement reached', 'aid received', 'awards', 'assistance',
    'better', 'boost', 'budget', 'business', 'construction begins',
    'deal signed', 'debt restructuring progress', 'development', 'economy',
    'employment rise', 'expand', 'expansion', 'exports', 'exports rise',
    'fdi', 'friendship', 'foreign', 'foreign investment', 'gain', 'growth',
    'imf approval', 'imf tranche released', 'improvement', 'inauguration',
    'increase', 'ties', 'inflation decreases', 'infrastructure development',
    'interest rate cut', 'investment', 'jobs created', 'launched',
    'loan granted', 'market', 'new project', 'opening ceremony', 'positive',
    'project', 'prosperity', 'rate cut', 'record', 'recovery', 'reform',
    'relief', 'remittances increase', 'resolution', 'rise', 'rupee strengthens',
    'salary increase', 'settlement', 'stability', 'strengthen', 'successful',
    'support', 'support package', 'surplus', 'tourism', 'tourism recovery',
    'tourist arrivals', 'trade', 'upgrade', 'win'
]


DOWN_WORDS = [
    'arrest', 'bankruptcy', 'collapse', 'controversy', 'corruption',
    'covid', 'coronavirus', 'crackdown', 'crisis', 'currency depreciation',
    'debt', 'debt crisis', 'death', 'deaths', 'dead', 'decline', 'default',
    'deficit', 'disaster', 'disruption', 'drop', 'export drop', 'fall',
    'fiscal', 'flooding', 'cases', 'flood', 'fraud', 'fuel', 'fuel shortage',
    'hit', 'import restrictions', 'inflation', 'inflation high', 'instability',
    'interest rate hike', 'landslide', 'loss', 'losses', 'mismanagement',
    'outage', 'pandemic', 'power cut', 'power failure', 'pressure',
    'price hike', 'protest', 'protests', 'resignation', 'rupee weakens',
    'shortage', 'storm', 'slowdown', 'storm warning', 'strike', 'tension',
    'unemployment', 'unrest', 'violence', 'warn', 'food insecurity', 'famine',
    'hunger', 'accident', 'explosion', 'attack', 'terror', 'riot', 'conflict',
    'violations', 'lawsuit', 'scandal', 'sabotage', 'strike action', 'recession',
    'deflation', 'economic slowdown', 'power outage', 'health crisis',
    'environmental disaster', 'disease outbreak', 'earthquake', 'tsunami',
    'cyclone', 'pandemic alert', 'epidemic', 'corruption scandal', 'civil unrest'
]

In [25]:
def classify_keywords(text):
    text = text.lower()
    if any(word in text for word in UP_WORDS):
        return "up"
    elif any(word in text for word in DOWN_WORDS):
        return "down"
    else:
        return "neutral"

df2["label"] = df2["text_clean"].apply(classify_keywords)

In [26]:
df2['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
up,59389
neutral,53598
down,23157


In [27]:
real_crisis_periods = {
    2017: {"05", "06", "07"},        # Major flood
    2019: {"04"},                    # Easter bomb attack (April 2019)
    2022: {"07", "08"},              # Galle Face protest + Fuel shortage
    2020: {"03", "04", "05"},        # COVID first wave
}

def fallback_crisis_rule(row):
    if row["label"] != "neutral":
        return row["label"]

    year = int(row["year"])
    month = row["month-day"].split("-")[0].zfill(2)

    if year in real_crisis_periods and month in real_crisis_periods[year]:
        return "down"
    return row["label"]


df2["label"] = df2.apply(fallback_crisis_rule, axis=1)
df2["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
up,59389
neutral,51497
down,25258


In [28]:
indicator_groups = {
    "Economy": [
        "economy", "business", "finance", "trade", "exports", "imports",
        "budget", "bank", "market", "fdi", "growth", "investment",
        "development", "financial", "economic", "recovery", "inflation",
        "debt", "deficit", "currency", "rate", "loan", "project"
    ],
    "Education": [
        "education", "schools", "students", "teachers", "learning",
        "university", "curriculum", "college", "exam", "literacy",
        "study", "research", "scholar", "training", "program"
    ],
    "Health": [
        "health", "hospital", "medicine", "disease", "covid", "medical",
        "pandemic", "cases", "vaccination", "treatment", "healthcare",
        "infection", "doctor", "nurse", "emergency", "clinic", "patient",
        "epidemic", "mental health"
    ],
    "Culture": [
        "culture", "religion", "festival", "sports", "celebration",
        "arts", "heritage", "tradition", "ceremony", "tourism",
        "tourist", "travel", "holiday", "trip", "event", "cultural",
        "performance", "exhibition", "museum"
    ]
}

def assign_group(text):
    text_lower = str(text).lower()
    for group, keywords in indicator_groups.items():
        if any(keyword in text_lower for keyword in keywords):
            return group
    return np.nan

df2['group_name'] = df2['text_clean'].apply(assign_group)


df2['group_name'].value_counts()

Unnamed: 0_level_0,count
group_name,Unnamed: 1_level_1
Economy,32292
Health,13460
Education,8755
Culture,7124


In [29]:
df2.rename(columns={
    'text_clean': 'Content',
    'group_name': 'Catagory'
}, inplace=True)

df2[['Month', 'Day']] = df2['month-day'].str.split('-', expand=True)

df2 = df2[['Content', 'Catagory', 'year', 'Month', 'Day', 'day_of_week', 'label']]

df2.rename(columns={'group_name': 'GroupName', 'label': 'Label','year': 'Year'}, inplace=True)

df2.head()

Unnamed: 0,Content,Catagory,Year,Month,Day,day_of_week,Label
0,mobiles to aid straying fisherman,,1999,11,30,Tuesday,neutral
1,education and health policies must be formulated,Education,1999,11,30,Tuesday,neutral
3,unp mp ekanayake crosses over to government be...,,2009,12,9,Wednesday,neutral
4,undersecretary of state blake arrives in sri l...,,2009,12,9,Wednesday,neutral
5,parliament extends emergency by another month ...,Health,2009,12,9,Wednesday,up


In [30]:
df2.to_csv("df2.csv", index=False)

***Combine the two sets***

In [31]:
import pandas as pd

df_combined = pd.concat([df1, df2], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_combined.shape)

(212316, 7)


In [32]:
df_combined.head()

Unnamed: 0,Content,Catagory,Year,Month,Day,day_of_week,Label
0,cricket a win for england in encounter ahead,,2014,12,11,Thursday,up
1,lankan tamils lock nigerian officials,,2014,2,26,Wednesday,neutral
2,two unp noncabinet ministers in hot water with...,,2019,9,14,Saturday,neutral
3,pada yathra to begin from galaha junction the ...,,2016,7,27,Wednesday,down
4,bogawanthalawa police constable attacked insid...,,2014,6,30,Monday,down


In [33]:
df_combined['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
neutral,89716
up,80703
down,41897


In [35]:
df_combined.to_csv("final_csv",index=False)

In [36]:
import pandas as pd

df_down = df_combined[df_combined['Label'] == 'down']
df_up = df_combined[df_combined['Label'] == 'up']
df_neutral = df_combined[df_combined['Label'] == 'neutral']

df_up_sampled = df_up.sample(60000, random_state=42)
df_neutral_sampled = df_neutral.sample(60000, random_state=42)

df_down_oversampled = df_down.sample(60000, replace=True, random_state=42)

df_balanced = pd.concat([df_down_oversampled, df_up_sampled, df_neutral_sampled]).reset_index(drop=True)

df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced['Label'].value_counts())


Label
neutral    60000
down       60000
up         60000
Name: count, dtype: int64


In [38]:
df_balanced['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
neutral,60000
down,60000
up,60000


In [39]:
df_balanced.to_csv("final.csv",index=False)

***Clustering to catagories***

In [2]:
!pip install sentence-transformers scikit-learn pandas numpy



In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import pandas as pd
import re
from langdetect import detect

In [4]:
df=pd.read_csv("/content/final.csv")

In [5]:
df.drop(['Catagory'],axis=1,inplace=True)

In [6]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
X_text = df['Content'].tolist()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
X_text_emb = model.encode(
    X_text,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
)

Batches:   0%|          | 0/2813 [00:00<?, ?it/s]

In [8]:
k = 12  #number of clusters you want
kmeans = KMeans(n_clusters=k, random_state=42)
df['cluster'] = kmeans.fit_predict(X_text_emb)

cluster_to_category = {
    0: "Energy",
    1: "Logistics",
    2: "Education",
    3: "Health",
    4: "Finance",
    5: "Government",
    6: "Tourism",
    7: "Agriculture",
    8: "Social",
    9: "Technology",
    10: "Economy",
    11: "Other"
}

In [9]:
df['Category'] = df['cluster'].map(cluster_to_category)
df['Category'] = df['Category'].fillna("Other")

df.drop(['cluster'], axis=1, inplace=True)

print(df[['Content', 'Category']].head())


                                             Content    Category
0   mayor of west harrow london felicitated in kandy      Energy
1  children between years to be jabbed from jan t...   Education
2  india sends food medicine donation to crisis h...     Economy
3  chavakachcheri accident victims bodies airlift...  Technology
4  establishing sri lanka as a hub for universiti...     Economy


In [10]:
df.head()

Unnamed: 0,Content,Year,Month,Day,day_of_week,Label,Category
0,mayor of west harrow london felicitated in kandy,2018,8,7,Tuesday,neutral,Energy
1,children between years to be jabbed from jan t...,2022,1,5,Wednesday,neutral,Education
2,india sends food medicine donation to crisis h...,2022,5,23,Monday,down,Economy
3,chavakachcheri accident victims bodies airlift...,2016,12,18,Sunday,down,Technology
4,establishing sri lanka as a hub for universiti...,2023,12,11,Monday,up,Economy


In [12]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Tourism,25377
Energy,22935
Economy,19277
Government,18420
Finance,16052
Logistics,15121
Agriculture,12625
Technology,11944
Education,11004
Social,10322


In [13]:
df.to_csv("final_clustered.csv",index=False)