## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import nltk
import random
from scipy.spatial import distance
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
import string
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
from keybert import KeyBERT
from transformers import BertModel, BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer
from flair.models import SequenceTagger
from flair.data import Sentence
from google.colab import drive
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Download the NLTK data packages this notebook relies on, one after another.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Download spaCy model
!pip install -U spacy
!python -m spacy download en_core_web_sm
import spacy

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the read_csv calls further down use local Windows paths
# (D:\Sana\...) rather than this mount point - confirm which environment the
# notebook is meant to run in.
from google.colab import drive
drive.mount('/content/drive')

# Importing all Datasets

In [None]:
# Load the five per-newspaper CSV exports as UTF-8, in the order
# dawn, pakistan_today, tribune, daily_times, business_recorder.
dataset_paths = (
    r"D:\Sana\Datasets\dawn (full-data).csv",
    r"D:\Sana\Datasets\pakistan_today(full-data).csv",
    r"D:\Sana\Datasets\tribune(full-data).csv",
    r"D:\Sana\Datasets\daily_times(full-data).csv",
    r"D:\Sana\Datasets\business_recorder(full-data).csv",
)
df1, df2, df3, df4, df5 = [
    pd.read_csv(csv_path, encoding='utf-8') for csv_path in dataset_paths
]

In [None]:
def remove_unnamed(df):
    """Return *df* restricted to the columns whose name does not start with 'Unnamed'.

    The input frame is not modified; the selection is returned as a new object.
    """
    # Boolean array, True for every column label beginning with 'Unnamed'.
    unnamed_columns = df.columns.str.contains('^Unnamed')
    return df.loc[:, ~unnamed_columns]

# Strip the 'Unnamed' columns from every source frame in one pass.
df1, df2, df3, df4, df5 = (
    remove_unnamed(frame) for frame in (df1, df2, df3, df4, df5)
)

# Report the dimensions of each frame once the 'Unnamed' columns are gone.
for shape_label, frame in (
    ("Shape of df1:", df1),
    ("Shape of df2:", df2),
    ("Shape of df3:", df3),
    ("Shape of df4:", df4),
    ("Shape of df5:", df5),
):
    print(shape_label, frame.shape)


##### Merge all five datasets

In [None]:
# Reporting window applied to every source (both bounds inclusive).
start_date = '2020-01-01'
end_date = '2023-05-31'

# Keep only the rows of each frame whose 'date' lies inside the window, then
# stack the filtered frames on a fresh 0..n-1 index.
# NOTE(review): if 'date' holds strings this is a lexicographic comparison,
# which only follows calendar order for ISO-formatted dates - confirm.
merged_df = pd.concat(
    [
        frame[frame['date'].between(start_date, end_date)]
        for frame in (df1, df2, df3, df4, df5)
    ],
    ignore_index=True,
)

# Print the dimensions of the merged DataFrame
print(merged_df.shape)

In [None]:
# Drop any 'Unnamed' columns from the merged frame through the shared helper,
# so the column filter is defined in exactly one place.
merged_df = remove_unnamed(merged_df)

##### Check for missing dates in each dataset

In [None]:
# Group the data by 'source' and collect the distinct 'date' values of each
unique_dates_by_source = merged_df.groupby('source')['date'].unique()

# Calendar days from January 2022 to December 2022 that every source is
# checked against. Built once: it does not depend on the source.
# NOTE(review): this range is narrower than the 2020-01-01..2023-05-31 window
# used when merging - confirm that only 2022 is meant to be audited.
all_dates = pd.date_range(start='2022-01-01', end='2022-12-31')

# Print the unique dates and the missing days for each source
for source, unique_dates in unique_dates_by_source.items():
    print(f"Source: {source}")
    print(f"Unique Dates: {unique_dates}")

    # Convert the observed values to day-resolution timestamps first. Diffing
    # a DatetimeIndex against raw date strings matches nothing, which would
    # report every single day as missing. Unparseable values become NaT.
    # NOTE(review): assumes the dates are timezone-naive - confirm.
    observed_days = pd.to_datetime(unique_dates, errors='coerce').normalize()

    # Days of the range on which this source has no article
    missing_dates = all_dates.difference(observed_days)

    print(f"Missing Dates: {missing_dates}")
    print()

##### Delete null rows and drop duplicates

In [None]:
print(f"Shape Before dropna: {merged_df.shape}")
# dropna() defaults to how='any', which already removes rows that are empty in
# every column, so a separate how='all' pass beforehand is unnecessary.
merged_df = merged_df.dropna()
print(f"Shape After dropna: {merged_df.shape}")

In [None]:
print(f"Shape Before Duplicate Removal: {merged_df.shape}")

# Flag every row whose headline has already appeared earlier in the frame
# (the first occurrence of each headline is not flagged).
duplicates_mask = merged_df.duplicated(subset=['headline'], keep='first')

# Show the rows that are about to be discarded.
duplicates = merged_df.loc[duplicates_mask]
print("Duplicated Rows:", duplicates, sep="\n")

# Remove the repeated headlines in place, keeping the first occurrence.
merged_df.drop_duplicates(subset=['headline'], keep='first', inplace=True)

print(".......................................")
print(f"Shape After Duplicate Removal: {merged_df.shape}")

In [None]:
print(f"Shape Before Duplicate Removal: {merged_df.shape}")
# Keep only the first row for each distinct description, modifying the frame
# in place.
merged_df.drop_duplicates(subset=['description'], keep='first', inplace=True)
print(".......................................")
print(f"Shape After Duplicate Removal: {merged_df.shape}")

##### Handling Non-Ascii Characters

In [None]:
def replace_characters(text):
    """Replace mis-encoded punctuation sequences in *text* with the intended characters.

    String values go through the fixed chain of ``str.replace`` calls below,
    applied in the order written; any non-string value is returned unchanged.

    NOTE(review): the search strings look identical when rendered, so they
    presumably differ only in non-printing characters. Consider spelling them
    with explicit hexadecimal escapes so each replacement can be reviewed.
    """
    if isinstance(text, str):
        #text = text.replace("Ã¢ÂÂ", "“")
        text = text.replace("Ã¢ÂÂ", "”")
        text = text.replace("Ã¢ÂÂ", "—")
        text = text.replace(" Ã¢ÂÂ", "—")
        text = text.replace("Ã¢ÂÂ ", "”")
        text = text.replace("Ã¢ÂÂ", "’")  # replacement is the right single quotation mark
        text = text.replace("Ã¢ÂÂ", "‘")
        text = text.replace(" Ã¢ÂÂ", "“")
        text = text.replace(" Ã¢ÂÂ", "”")
        return text
    else:
        # Non-string cells (numbers, NaN, ...) pass through untouched.
        return text

# Apply the replacement function to every cell of every column in the DataFrame
merged_df = merged_df.applymap(replace_characters)

In [None]:
def replace_more_characters(text):
    """Replace three further mis-encoded sequences in *text*.

    String values have the sequences below swapped, in order, for a left
    double quotation mark, a pound sign and an en dash; any non-string value
    is returned unchanged.

    NOTE(review): the first and third search strings look identical when
    rendered and presumably differ only in non-printing characters - confirm.
    """
    if isinstance(text, str):
        text = text.replace("Ã¢ÂÂ", "“")
        text = text.replace("Ã¢Â£", "£")
        text = text.replace("Ã¢ÂÂ", "–")
        return text
    else:
        # Non-string cells pass through untouched.
        return text

# Apply the replacement function to every cell of every column in the DataFrame
merged_df = merged_df.applymap(replace_more_characters)

In [None]:
import re
import pandas as pd

def clean_non_Ascii(text):
    """Strip non-ASCII characters from *text* and collapse whitespace runs.

    Every run of characters outside the 0x00-0x7F range is deleted, then each
    run of whitespace is replaced by a single space. Values that are not
    strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Delete anything outside the 7-bit ASCII range.
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)
    # Squeeze consecutive whitespace (spaces, tabs, newlines) into one space.
    return re.sub(r'\s+', ' ', ascii_only)

# Clean the 'description' column with clean_non_Ascii and report the index
# labels of the rows whose text changed. This is done column-wise through
# Series.map instead of a per-row iterrows()/.at loop.
original_descriptions = merged_df['description']
cleaned_descriptions = original_descriptions.map(clean_non_Ascii)

# Element-wise comparison between the original and the cleaned text.
changed_mask = (original_descriptions != cleaned_descriptions).to_numpy()
changed_indices = merged_df.index[changed_mask].tolist()

# Write back only the rows that differ; positional values are used so that no
# index alignment takes place during the assignment.
merged_df.loc[changed_mask, 'description'] = cleaned_descriptions[changed_mask].to_numpy()

print(f"Changes have been made in rows with indices: {changed_indices}")

In [None]:
# Distinct values currently present in the 'categories' column, in order of
# first appearance.
main_categories = pd.unique(merged_df['categories'])
print(main_categories)

# Standardizing Categories

In [None]:
import re

# Literal text fragments marking rows to discard: a row is dropped when its
# 'categories' value contains any of them (presumably article text that leaked
# into the column - confirm). Kept as plain text, one entry per line with a
# trailing comma, and de-duplicated. Variants that only added leading or
# trailing spaces are covered by the un-padded entry because matching is by
# containment. Non-printing characters are written as \x escapes.
_category_noise_fragments = [
    '00012. Where the taxable income exceeds Rs 21',
    'should be in accordance with the law and the constitution',
    'no foreign power shall be allowed to topple an elected government through a conspiracy',
    'and also — courtesy the word ‘shall’ — that he is bound to do so. Once the assembly opens',
    'within fourteen days of the receipt of the requisition.""This means two things: that the Speaker has until March 22 to summon the assembly',
    'the constitution say that `` the speaker shall summon the national assembly to meet',
    'the opposition ha also moved to summon it . it’s submitted a requisition ( a fancy word for an official request ) under article 54 ( 3 ) . once a requisition is received',
    'that show president biden is willing to make difficult choices',
    'having said that',
    'which was presented on March 8',
    'Ã\x99Â¾Ã\x98Â§Ã\x9AÂ©Ã\x98Â³Ã\x98ÂªÃ\x98Â§Ã\x99Â\x86',
    'and also Ã¢Â\x80Â\x94 courtesy the word Ã¢Â\x80Â\x98shallÃ¢Â\x80Â\x99 Ã¢Â\x80Â\x94 that he is bound to do so. Once the assembly opens',
    'the opposition ha also moved to summon it . itÃ¢Â\x80Â\x99s submitted a requisition ( a fancy word for an official request ) under article 54 ( 3 ) . once a requisition is received',
]

# Escape every fragment so that parentheses, dots and other regex
# metacharacters are matched literally rather than interpreted as syntax.
patterns_to_remove = [re.escape(fragment) for fragment in _category_noise_fragments]

# Create a combined regular expression pattern (alternation of all fragments)
pattern = '|'.join(patterns_to_remove)

# Keep only the rows whose 'categories' value matches none of the fragments
# (case-insensitive). na=False treats missing values as "no match" so the
# boolean negation cannot fail on NaN.
# NOTE(review): rows containing the parenthesised "requisition" fragment are
# dropped here; the later str.replace on the same fragment then has nothing
# left to strip - confirm that dropping is the intended outcome.
merged_df = merged_df[~merged_df['categories'].str.contains(pattern, case=False, regex=True, na=False)]

In [None]:

# Define the text to remove (with extra spaces)
text_to_remove = '00012.   Where the taxable income exceeds        Rs 21'

# Compare with all whitespace stripped from both sides, so differences in
# spacing do not matter.
compact_target = ''.join(text_to_remove.split())
compact_categories = merged_df['categories'].str.replace(r'\s', '', regex=True)

# Drop the rows whose 'categories' value contains the text. regex=False makes
# the '.' in the text a literal dot instead of a wildcard, and na=False treats
# missing values as "no match".
merged_df = merged_df[~compact_categories.str.contains(compact_target, regex=False, na=False)]


In [None]:
import re

# Sentence fragment to strip out of the 'categories' values. It is escaped so
# that its parentheses and dots are matched as literal characters.
pattern_to_remove = re.escape(
    "the opposition ha also moved to summon it . it’s submitted a requisition ( a fancy word for an official request ) under article 54 ( 3 ) . once a requisition is received"
)

# Delete every case-insensitive occurrence of the fragment from the column.
category_values = merged_df['categories']
merged_df['categories'] = category_values.str.replace(
    pattern_to_remove,
    '',
    case=False,
    regex=True,
)


In [None]:
# Display the collected category values as a one-column table.
# NOTE(review): main_categories was computed before the category clean-up
# cells above ran, so this table may still list values that have since been
# removed or edited - confirm whether it should be recomputed first.
pd.DataFrame(main_categories, columns=['categories'])

In [None]:
# Write the unique category table to a CSV file without the index column.
# NOTE(review): unique_category_source_df is not defined anywhere in the
# visible part of this notebook - confirm where it is created (possibly the
# DataFrame built in the previous cell was meant to be assigned to it).
output_path = r"D:\Sana\Extra Material\Ful_Data_Unique_Cat.csv"
unique_category_source_df.to_csv(output_path, index=False)

print(f'DataFrame saved to: {output_path}')

In [None]:
category_mappings = {
    'Pakistan Today': {
    'Sports': 'Others',
    'NATIONAL & Top Headlines': 'National',
    'NATIONAL & Top Headlines & World': 'National',
    'Sports': 'Others',
    'World': 'International',
    'NATIONAL': 'National',
    'NATIONAL & Top Headlines': 'National',
    'Editorials & Opinion': 'Editorial',
    'Comment & Opinion': 'Others',
    'Opinion': 'Others',
    'Top Headlines & World': 'International',
    'Letters & Opinion': 'Others',
    'ISLAMABAD & NATIONAL': 'National',
    'Business': 'Business',
    'Sports & Top Headlines': 'Others',
    'NATIONAL & World': 'International',
    'HEADLINES': 'Others',
    'NATIONAL & Top Headlines & World': 'International',
    'LAHORE & NATIONAL': 'National',
    'HEADLINES & NATIONAL': 'National',
    'CITY': 'National',
    'NATIONAL & top Featured & World': 'International',
    'KARACHI & NATIONAL': 'National',
    'CITY & LAHORE': 'National',
    'MULTAN & NATIONAL': 'National',
    'HEADLINES & World': 'International',
    'NATIONAL & Sports': 'Others',
    'Uncategorized': 'Others',
    'NATIONAL & top Featured': 'National',
    'NATIONAL & top Featured & Top Headlines': 'National',
    'HEADLINES & Sports': 'Others',
    'CITY & HEADLINES': 'National',
    'LAHORE & NATIONAL & Top Headlines': 'National',
    'NATIONAL & PESHAWAR & Top Headlines': 'National',
    'HEADLINES & NATIONAL & World': 'International',
    'E-papers & Pakistan Today': 'Others',
    'CITY & PESHAWAR': 'National',
    'HEADLINES & NATIONAL & Top Headlines': 'National',
    'LAHORE': 'National',
    'NATIONAL & PESHAWAR': 'National',
    'Sponsored Content': 'Others',
    'Top Headlines': 'National',
    'HEADLINES & Letters & Opinion': 'Others',
    'CITY & LAHORE & Opinion': 'National',
    'CITY & KARACHI & NATIONAL': 'National',
    'HEADLINES & Top Headlines': 'National',
    'ISLAMABAD & Pakistan Today': 'National',
    'ISLAMABAD': 'National',
    'MULTAN': 'National',
    'PESHAWAR': 'National',
    'top Featured & Top Headlines': 'National',
    'top Featured': 'National',
    'KARACHI': 'National',
    'PESHAWAR & Sports': 'National',
    'NATIONAL & Opinion': 'National',
    'Sports & top Featured': 'Others',
    'Comment & Letters': 'Others',
    'Letters': 'Others',
    'CITY & ISLAMABAD': 'National',
    'CITY & ISLAMABAD & NATIONAL': 'National',
    'CITY & KARACHI': 'National',
    'Comment & NATIONAL & Opinion': 'National',
    'NATIVE CONTENT': 'Others',
    'NATIONAL & Top Non Business': 'National',
    'Book Review & NATIONAL': 'Others',
    'Entertainment & NATIONAL': 'Others',
    'Comment': 'Others',
    'Analysis & NATIONAL': 'Others',
    'Editorials & Letters & Opinion': 'Editorial',
    'Comment & HEADLINES & Opinion': 'Others',
    'FEATURED & top Featured' : 'National',
    'FEATURED & Top Non Business': 'National',
    'FEATURED': 'National',
    'GOVERNANCE': 'National',
    'GOVERNANCE & HEADLINES': 'National',
    'Top Non Business & World': 'International',
    'GOVERNANCE & NATIONAL': 'National',
    'HEADLINES & KARACHI': 'National',
    'HEADLINES & LAHORE': 'National',
    'HEADLINES & PESHAWAR': 'National',
    'INTERVIEW & Top Headlines': 'Others',
    'CITY & HEADLINES & LAHORE': 'National',
    'KARACHI & LAHORE & NATIONAL': 'National',
    'KARACHI & LAHORE': 'National',
    'HEADLINES & NATIONAL & Top Non Business': 'National',
    'FEATURED & Top Headlines': 'National',
    'KARACHI & NATIONAL & Top Headlines': 'National',
    'HEADLINES & Top Headlines & World': 'International',
    'Editorials & HEADLINES & Opinion': 'National',
    'HEADLINES & LAHORE & NATIONAL': 'National',
    'Analysis & HEADLINES & NATIONAL & Top Headlines': 'National',
    'Book Review': 'Others',
    'HEADLINES & LAHORE & NATIONAL & Top Headlines': 'National',
    'GOVERNANCE & HEADLINES & NATIONAL': 'National',
    'LAHORE & NATIONAL & PESHAWAR': 'National',
    'Entertainment': 'Others',
    'Comment & Editorials': 'Others',
    'HEADLINES & Sports & Top Headlines': 'Others',
    'SPONSORED': 'Others',
    'CITY & HEADLINES & NATIONAL': 'National',
    'CITY & NATIONAL': 'National',
    'FEATURED & NATIONAL': 'National',
    'ISLAMABAD & KARACHI & LAHORE': 'National',
    'HEADLINES & NATIONAL & Top Headlines & World': 'National',
    'LAHORE & Top Headlines': 'National',
    'HEADLINES & ISLAMABAD & NATIONAL': 'National',
    'HEADLINES & NATIONAL & Sports & Top Headlines': 'Others',
    'NATIONAL & Sports & Top Headlines': 'Others',
    'Editorials':'Editorial',
    'Sports & World': 'Others',
    'HEADLINES & ISLAMABAD & LAHORE & NATIONAL': 'National',
    'CITY & FEATURED & LAHORE': 'National',
    'Entertainment & World': 'Others',
    'HEADLINES & ISLAMABAD & NATIONAL & Pakistan Today': 'National',
    'NATIONAL & Sports & Top Headlines & Top Non Business': 'Others',
    'HEADLINES & KARACHI & NATIONAL': 'National',
    'OIC & World': 'International',
    'Agriculture & NATIONAL': 'National',
    'CITY & Education & MULTAN': 'National',
    'Cartoon & Letters': 'Others',
    'FEATURED & HEADLINES & NATIONAL': 'National',
    'FEATURED & Sports': 'Others',
    'ISLAMABAD & SPONSORED': 'National',
    'top Featured & World': 'International',
    'HEADLINES & NATIONAL & top Featured': 'National',
    'NATIONAL & Pakistan Today': 'National',
    'Cartoon & Editorials': 'Others',
    'CITY & LAHORE & NATIONAL': 'National',
    'Education & NATIONAL': 'National',


},
        'Business Recorder': {
        'Markets ': 'Business',
        'Markets': 'Business',
        'MARKETS': 'Business',
        'MARKETS ': 'Business',
        'Business': 'Business',
        'Business ': 'Business',
        'World ': 'International',
        'World': 'International',
        'Pakistan': 'National',
        'Pakistan ': 'National',
        'Sports ': 'Others',
        'Sports': 'Others',
        'Editorials': 'Editorial',
        'Editorials ': 'Editorial',
        'Business & Finance ': 'Business',
        'Business & Finance': 'Business',
        'Perspectives': 'Others',
        'Perspectives ': 'Others',
        'BR Research': 'Others',
        'BR Research ': 'Others',
        'Life & Style': 'Others',
        'Life & Style ': 'Others',
        'Opinion': 'Opinion',
        'Opinion ': 'Opinion',
        'Technology': 'Others',
        'Technology ': 'Others',
        'Print': 'Print',
        'Print ': 'Print',
        'Top Stories': 'National',
        'Top News': 'National',
        'Budget': 'National',
        'Epaper': 'Others',
        'Brief Recordings': 'Others',
        'Entertainment': 'Others',
        'Supplements': 'Business',
        'Weather': 'Others',
        'US Elections 2020': 'Others',
        'Asia Cup': 'Others',
        'Budget 2022-23': 'Business',
        'Budget 2021-22': 'Business',
        'Top Stories ': 'National',
        'Top News ': 'National',
        'Budget ': 'National',
        'Epaper ': 'Others',
        'Brief Recordings ': 'Others',
        'Entertainment ': 'Others',
        'Supplements ': 'Business',
        'Weather ': 'Others',
        'US Elections 2020 ': 'Others',
        'Asia Cup ': 'Others',
        'Budget 2022-23 ': 'Business',
        'Budget 2021-22 ': 'Business'
    },
    'Dawn': {
        'Pakistan': 'National',
        'Pakistan ': 'National',
        'Business': 'Business',
        'Business ': 'Business',
        'World': 'International',
        'World ': 'International',
        'Prism': 'Others',
        'Prism ': 'Others',
        'Sport': 'Others',
        'Sport ': 'Others'
    },
    'Daily Times': {
'Pakistan':'National',
'Reviews'
'Arts, Culture &amp; Books':'Others',
'Infotainment':'Others',
'Perspectives':'Others',
'Commentary / Insight':'Others',
'Letters':'Others',
'Op-Ed':'Others',
'Editorial':'Editorial',
'Pakistan & Top Stories':'National',
'Business':'Business',
'Blogs':'Others',
'World':'International',
'Pakistan & World':'National',
'Pakistan & Uncategorized':'National',
'Top Stories':'Others',
'Trending & World':'International',
'Top Stories & World':'International',
'Lifestyle':'Others',
'Sports':'Others',
'Trending':'Others',
'Infotainment & Trending':'Others',
'Pakistan & Trending':'National',
'Uncategorized':'Others',
'Islamabad':'National',
'Sindh':'National',
'Punjab':'National',
'Punjab & Trending':'National',
'Blogs & Trending':'Others',
'Pakistan & Top Stories & World':'National',
'Uncategorized & World':'International',
'Islamabad & Pakistan':'National',
'Khyber Pakhtunkhwa & Pakistan':'National',
'Pakistan & Punjab':'National',
'Business & Pakistan':'Business',
'Balochistan & Pakistan':'National',
'Business & World':'Business',
'Sports & Top Stories':'Others',
'Business & Trending':'Business',
'Business & Top Stories':'Business',
'Khyber Pakhtunkhwa':'National',
'Pakistan & Sports':'Others',
'Sports & Uncategorized':'Others',
'Featured':'Others',
'Balochistan & Reviews':'National',
'Sponsored Content':'Others',
'Sports & Trending':'Others',
'Business & Pakistan & Top Stories':'Business',
'Infotainment & Top Stories':'Others',
'Perspectives & World':'International',
'Entertainment':'Others',
'Pakistan & Sponsored Content':'National',
'Business & Uncategorized':'Business',
'Featured & World':'International',
'Lifestyle & Trending':'Others',
'Trending & Trending & World':'International',
'Pakistan & Sindh':'National',
'Infotainment & Pakistan':'Others',
'Pakistan & Trending & Uncategorized':'National',
'Business & Sponsored Content':'Business',
'Arts, Culture &amp; Books & Top Stories':'Others',
'Pakistan & Sports & World':'Others',
'International':'International',
'Lifestyle & Uncategorized':'Others',
'Top Stories & Uncategorized & World':'International',
'Lifestyle & Top Stories':'Others',
'Lifestyle & Pakistan':'Others',
'Op-Ed & Pakistan':'National',
'Cartoons':'Others',
'World & World':'International',
'Pakistan & Pakistan & Uncategorized':'National',
'Entertainment & World':'Others',
'Health & World':'International',
'Health':'Others',
'Blog & Blogs':'Others',
'Pakistan & Uncategorized & World':'National',
'Pakistan & Top Stories & Uncategorized':'National',
'Pakistan & Reviews':'National',
'Film And Drama Reviews':'Others',
'Lahore':'National',
'Entertainment & Lifestyle & Trending':'Others',
'Lahore & Pakistan':'National',
'Sports & Top Stories & Trending':'Others',
'Pakistan & Sindh & Top Stories':'National',
'Top Stories & Trending & World':'International',
'Entertainment & Pakistan & Trending':'Others',
'Pakistan & Top Stories & Trending':'National',
'Pakistan & Sindh & Top Stories & Trending':'National',
'Sports & World':'Others',
'Arts, Culture &amp; Books & Trending':'Others',
'Entertainment & Lifestyle':'Others',
'Pakistan & Sports & Top Stories':'Others',
'Sponsored Content & Trending':'Others',
'Entertainment & Trending':'Others',
'Entertainment & Trending & World':'Others',
'Sports & Top Stories & World':'Others',
'Entertainment & Pakistan & Top Stories':'Others',
'Entertainment & Top Stories':'Others',
'Blog':'Others',
'Culture':'Others',
'Pakistan & Punjab & Top Stories':'National',
'Lahore & Pakistan & Punjab & Top Stories':'National',
'Islamabad & Pakistan & Top Stories':'National',
'International & World':'International',
'International & Top Stories & World':'International',
'Pakistan & Pakistan & Top Stories':'National',
'Pakistan & Pakistan':'National',
'Pakistan & Pakistan & Sindh & Top Stories':'National',
'International & Top Stories':'International',
'International & World & World':'International',
'Business & Perspectives':'Business',
'Entertainment & Music & Uncategorized':'Others',
'Sports & Sports':'Others',
'Featured & Lahore':'National',
'Top Stories & World & World':'International',
'Entertainment & TV':'Others',
'TV':'Others',
'Sports & Sports & Uncategorized':'Others',
'Entertainment & International':'Others',
'Business & Lahore & Pakistan & Punjab':'Business',
'Business & Punjab':'Business',
'Pakistan & Punjab & World':'National',
'Lahore & Pakistan & Pakistan':'National',
'Featured & Gilgit Baltistan & Pakistan & Pakistan & Trending':'National',
'Entertainment & Music':'Others',
'Entertainment & Fashion & Lifestyle':'Others',
'Business & Islamabad & Pakistan':'Business',
'Islamabad & Pakistan & Punjab':'National',
'Pakistan & Pakistan & Punjab & Top Stories':'National',
'Balochistan & Pakistan & Pakistan & Top Stories':'National',
'Lifestyle & TV':'Others',
'Islamabad & Pakistan & Pakistan & Top Stories':'National',
'Pakistan & Pakistan & World':'National',
'Entertainment & Film And Drama Reviews':'Others',
'Business & Khyber Pakhtunkhwa':'Business',
'Balochistan':'National',
'Balochistan & Business':'Business',
'Lahore & Punjab':'National',
'Khyber Pakhtunkhwa & Pakistan & Pakistan':'National',
'Business & Pakistan & Uncategorized':'Business',
'Lahore & Pakistan & Top Stories':'National',
'Entertainment & Lifestyle & TV':'Others',
'Lifestyle & Music':'Others',
'Khyber Pakhtunkhwa & Pakistan & Top Stories':'National',
'Business & Pakistan & Punjab':'Business',
'Business & Pakistan & Sindh & Top Stories':'Business',
'Balochistan & Islamabad & Pakistan & Pakistan':'National',
'Islamabad & Pakistan & Sindh':'National',
'Lifestyle & Movies':'Others',
'Balochistan & Islamabad & Punjab & Sindh':'National',
'Balochistan & Islamabad & Khyber Pakhtunkhwa & Pakistan & Punjab & Sindh & Top Stories':'National',
'Featured & Pakistan & Top Stories':'National',
'Entertainment & Lifestyle & Music':'Others',
'Entertainment & Lifestyle & Movies':'Others',
'Business & Pakistan & Pakistan':'Business',
'Health & Pakistan & Sindh':'National',
'Islamabad & Pakistan & Pakistan':'National',
'Khyber Pakhtunkhwa & Pakistan & Pakistan & Top Stories':'National',
'Business & Islamabad & Pakistan & Pakistan':'Business',
'Islamabad & Pakistan & Pakistan & Sindh & Top Stories':'National',
'Islamabad & Pakistan & Pakistan & Punjab & Sindh & Top Stories':'National',
'Pakistan & Pakistan & Punjab':'Business',
'Business & Islamabad & Pakistan & Pakistan & Top Stories':'Business',
'Business & Pakistan & Pakistan & Top Stories':'Business',
'Commentary / Insight & Trending':'Others',
'Pakistan & Punjab & Trending':'National',
'Health & Pakistan & Punjab':'National',
'Lahore & Pakistan & Punjab':'National',
'Business & Pakistan & Punjab & Top Stories':'Business',
'Balochistan & Islamabad & Khyber Pakhtunkhwa & Pakistan & Pakistan & Punjab & Sindh & Top Stories':'National',
'Pakistan & Pakistan & Sindh':'National',
'Balochistan & Pakistan & Pakistan':'National',
'Islamabad & Pakistan & Pakistan & Punjab & Top Stories':'National',
'Featured & Pakistan & Punjab':'National',
'Business & Pakistan & Pakistan & Punjab':'Business',
'Khyber Pakhtunkhwa & Pakistan & Pakistan & Sindh & Top Stories':'National',
'Featured & Pakistan & Trending & Uncategorized':'National',
'Top Stories & Trending & Trending & World':'International',
'Fashion & Lifestyle':'Others',
'Balochistan & Business & Pakistan & Pakistan & Top Stories':'Business',
'Business & Featured & Pakistan':'Business',
'Business & Pakistan & Sindh':'Business',
'Health & Khyber Pakhtunkhwa & Pakistan':'National',
'Gossip & Lifestyle':'Others',
'Lifestyle & Trending & TV':'Others',
'Lifestyle & Movies & Music & Trending':'Others',
'Pakistan & Pakistan & Top Stories & World':'National',
'Health & Pakistan':'National',
'Gossip & Lifestyle & Trending':'Others',
'Business & Pakistan & Pakistan & Punjab & Top Stories':'Business',
'Health & Pakistan & Top Stories':'National',
'Celebrity Interviews & Featured & Lifestyle':'Others',
'Business & Sindh':'Business',
'Lifestyle & Movies & Pakistan':'Others',
'Gossip & International & Lifestyle & Trending':'Others',
'Fashion & Featured & Gossip & Lifestyle & Trending':'Others',
'Entertainment & Gossip & Lifestyle':'Others',
'Pakistan & Pakistan & Sindh & Top Stories & Trending':'National',
'Khyber Pakhtunkhwa & Pakistan & Pakistan & Punjab & Sindh & Top Stories':'National',
'Business & Pakistan & Pakistan & Sindh':'Business',
'Pakistan & Sindh & Uncategorized':'National',
'Lifestyle & Music & TV':'Others',
'Balochistan & Khyber Pakhtunkhwa & Pakistan & Pakistan & Punjab & Sindh & Top Stories':'National',
'Commentary / Insight & Pakistan':'Others',
'Lifestyle & Movies & Trending':'Others',
'Business & Top Stories & World':'Business',
'Fashion & Lifestyle & Trending':'Others',
'Business & Khyber Pakhtunkhwa & Pakistan':'Business',
'Gilgit Baltistan & Pakistan':'National',
'Islamabad & Pakistan & Punjab & Top Stories':'National',
'Business & Pakistan & Pakistan & Sindh & Top Stories':'Business',
'Balochistan & Business & Pakistan & Pakistan':'Business',
'Featured & Pakistan':'National',
'Pakistan & Pakistan & World & World':'National',
'Business & Health & Pakistan':'Business',
'Top Stories & Uncategorized':'National',
'Khyber Pakhtunkhwa & Pakistan & Punjab':'National',
'International & Pakistan & Top Stories':'National',
'Pakistan & Punjab & Sindh & Top Stories':'National',
'Punjab & Top Stories':'National',
'Balochistan & Khyber Pakhtunkhwa & Pakistan & Punjab & Sindh & Top Stories':'National',
'Punjab & Sindh':'National',
'Balochistan & Pakistan & Top Stories':'National',
'Pakistan & Pakistan & Sports':'Others',
'Khyber Pakhtunkhwa & Pakistan & Sindh & Top Stories':'National',
'Business & Sindh & Top Stories':'Business',
'Business & Islamabad':'Business',
'Khyber Pakhtunkhwa & Pakistan & Sindh':'National',
'Pakistan & Perspectives':'National',
'Pakistan & Sindh & Trending':'National',
'Sindh & Sports':'Others',
'Lahore & Top Stories':'National',
'Sindh & Top Stories':'National',
'Lifestyle & World':'Others',
'Balochistan & Top Stories':'National',
'Celebrity Interviews & Lifestyle':'Others',
'Pakistan & Trending & Trending':'National',
'Gilgit Baltistan & Pakistan & Top Stories':'National',
'Gilgit Baltistan & Pakistan & Punjab':'National',
'Arts, Culture &amp; Books & Lifestyle':'Others',
'International & Pakistan':'National',
'Health & Pakistan & World':'National',
'Featured & Lifestyle':'Others',
'Entertainment & Lifestyle & Pakistan':'Others',
'Lifestyle & Lifestyle':'Others',
'Gilgit Baltistan':'National',
'Islamabad & Top Stories':'National',
'Pakistan & Sports & Sports':'Others',
'Pakistan & Sports & Uncategorized':'Others',
'International & Pakistan & World':'International',
'Business & Sports':'Business',
'Karachi':'National',
'Lifestyle & Sports':'Others',
'Karachi & Pakistan & Top Stories':'National',
'Business & Lifestyle & Pakistan':'Business',
'Karachi & Pakistan & Sindh':'National',
'Blogs & Lifestyle':'Others',
'Karachi & Sindh':'National',
'Islamabad & Lahore & Pakistan':'National',
'Karachi & Pakistan':'National',
'International & Sports':'Others',
'Finance':'Business',
'Featured & Pakistan & World':'National',
'Pakistan & Trending & Trending & World':'National',
'Business & International':'Business',
'Entertainment & Pakistan':'Others',
'Fashion':'Others',
'Balochistan & Business & Pakistan & Top Stories':'Business',
'Khyber Pakhtunkhwa & Top Stories':'National',
'Culture & World':'Others',
'Fashion & Gilgit Baltistan':'Others',
'Science and Technology':'National',
'Travel':'Others',
'Health & Punjab':'National',
'Off-beat & World':'Others',
'Karachi & Sindh & Top Stories':'National',
'Business & Finance & Pakistan':'Business',
'Business & Finance':'Business',
'International & Movies':'Others',
'Kashmir & Pakistan':'National',
'International & Lifestyle':'Others',
'Pakistan & Punjab & Sindh':'National',
'Arts, Culture &amp; Books & Entertainment & World':'Others',
'Featured & International & Lifestyle':'Others',
'Arts, Culture &amp; Books & Pakistan':'Others',
'Entertainment & Infotainment & International & Science and Technology & Social Mania & World':'Others',
'Entertainment & Karachi & Sindh':'Others',
'International & Lifestyle & Pakistan':'Others',
'Arts, Culture &amp; Books & Lifestyle & Pakistan':'Others',
'Off-beat & Top Stories':'Others',
'Entertainment & Lifestyle & World':'Others',
'Infotainment & Lifestyle':'Others',
'Business & Pakistan & Science and Technology':'Business',
'International & Science and Technology':'International',
'Education & Pakistan':'National',
'Kashmir':'National',
'Arts, Culture &amp; Books & Entertainment':'Others',
'Pakistan & Travel':'National',
'Travel & World':'Others',
'Science and Technology & World':'International',
'Fashion & International & Lifestyle':'Others',
'Entertainment & Infotainment & International':'Others',
'Blogs & Perspectives':'Others',
'Infotainment & Lahore & Pakistan':'Others',
'Health & International':'International',
'Infotainment & International & World':'Others',
'Pakistan & Science and Technology':'National',
'Culture & Infotainment & Pakistan':'Others',
'Balochistan & Health':'National',
'Top Stories & Travel':'Others',
'Business & Finance & International':'Business',
'Infotainment & Lifestyle & World':'Others',
'Arts, Culture &amp; Books & Featured & Featured':'Others',
'Film And Drama Reviews & International & Lifestyle':'Others',
'Education & Lahore & Pakistan':'National',
'Education & Pakistan & Top Stories':'National',
'Blog & Lifestyle & Pakistan':'Others',
'Blogs & Culture':'Others',
'Business & Finance & International & Pakistan':'Business',
'Education':'Others',
'Celebrity Interviews & Lifestyle & Pakistan':'Others',
'Infotainment & World':'Others',
'Education & International':'International',
'Lifestyle & Pakistan & Trending':'Others',
'Off-beat & Top Stories & World':'Others',
'Health & Lahore & Pakistan & Punjab & Top Stories':'National',
'Pakistan & Top Stories & Travel':'National',
'Health & Lahore & Pakistan & Top Stories':'National',
'Education & Pakistan & Punjab':'National',
'Business & Pakistan & World':'Business',
'Off-beat':'Others',
'Health & Islamabad & Pakistan & Top Stories':'National',
'International & Lifestyle & Movies':'Others',
'Celebrity Interviews & International & Lifestyle':'Others',
'Off-beat & Pakistan':'Others',
'Business & Reviews':'Business',
'Fashion & Lifestyle & Pakistan':'Others',
'Business & Finance & Top Stories':'Business',
'Health & Lahore & Top Stories':'National',
'Health & Pakistan & Punjab & Top Stories':'National',
'Health & Social Mania':'Others',
'Reviews & Sports':'Others',
'International & Kashmir':'International',
'Business & Finance & Pakistan & Top Stories':'Business',
'Business & International & Pakistan':'Business',
'Islamabad & Lifestyle & Pakistan':'Others',
'Arts, Culture &amp; Books & Health':'Others',
'Lahore & Pakistan & Sports':'Others',
'Islamabad & Pakistan & Sports':'Others',
'Entertainment & Karachi & Pakistan & Sports & Top Stories':'National',
'Lahore & Pakistan & Sports & Top Stories':'National',
'Balochistan & Education':'National',
'Islamabad & Lahore & Pakistan & Sports & Top Stories':'Others',
'Islamabad & Lahore & Sports & Top Stories':'National',
'International & Travel':'International',
'Islamabad & Sports':'National',
'Islamabad & Lahore & Pakistan & Sports':'Others',
'Off-beat & Uncategorized':'National',
'Karachi & Pakistan & Sindh & Sports':'Others',
'Pakistan & World & World':'National',
'Arts, Culture &amp; Books & Culture':'Others',
'Health & Top Stories & World':'International',
'Kashmir & Pakistan & Top Stories':'National',
'Kashmir & Pakistan & World':'National',
'Health & Top Stories':'National',
'Islamabad & Pakistan & Trending':'National',
'Gilgit Baltistan & Pakistan & Sports':'Others',
'Kashmir & Top Stories & World':'National',
'Health & Islamabad & Pakistan & Pakistan':'National',
'Music':'Others',
'Islamabad & Pakistan & World':'National',
'International & Islamabad & Pakistan':'National',
'Pakistan & Sindh & Sports':'Others',
'Health & Islamabad':'National',
'Arts, Culture &amp; Books & Perspectives':'Others',
'Arts, Culture &amp; Books & Featured':'Others',
'Kashmir & Top Stories':'National',
'Karachi & Pakistan & Sindh & Top Stories':'National',
'Lifestyle & Reviews':'Others',
'Health & Islamabad':'National',
'Islamabad & Lifestyle':'Others',
'Business & Finance & Pakistan & Uncategorized':'Business',
'Social Mania & TGIF & Trending & World':'Others',
'Business & Science and Technology & TGIF & World':'Business',
'Culture & Pakistan':'Others',
'Lifestyle & Science and Technology & Sports':'Others',
'Business & Trending & World':'Business',
'Business & Top Stories & Trending & Uncategorized & World':'Business',
'Business & Top Stories & Trending & World':'Business',
'Business & Pakistan & Sports':'Business',
'Business & Pakistan & Trending & World':'Business',
'Business & Education & Pakistan & Trending':'Business',
'Islamabad & Lahore & Pakistan & Top Stories':'National',
'Health & Khyber Pakhtunkhwa':'National',
'Islamabad & Kashmir & Top Stories':'National',
'Lahore & Punjab & Top Stories':'National',
'Punjab & Uncategorized':'National',
'Education & Punjab':'National',
'Education & Pakistan & Trending & World':'National',
'Arts, Culture &amp; Books & Education':'Others',
'Gilgit Baltistan & Top Stories':'National',
'Blogs & Pakistan':'National',
'Arts, Culture &amp; Books & Blog':'Others',
'Education & Top Stories':'Others',
'Featured & Op-Ed':'Others',
'Blogs & Sports':'Others',
'Blogs & Featured':'Others',
'Science and Technology & Top Stories':'Others',
'Lifestyle & Pakistan & Top Stories':'National',
'Blogs & Lifestyle & Uncategorized':'Others',
'Science and Technology & Trending':'Others',
'Education & International & World':'International',
'Infotainment & Science and Technology':'Others',
'Balochistan & Pakistan & Sindh':'National',
'Balochistan & Sports':'Others',
'International & Karachi & Lifestyle':'Others',
'Balochistan & Health & Pakistan':'National',
'Lifestyle & Top Stories & Uncategorized':'Others',
'Blogs & Gilgit Baltistan & Sports':'Others',
'Entertainment & International & Lifestyle':'Others',
'Gossip':'Others',
'Kashmir & Pakistan & Uncategorized':'National',
'Region':'Others',
'International & Science and Technology & World':'International',
'Infotainment & International':'Others',
'Arts, Culture &amp; Books & World':'Others',
'Entertainment & Science and Technology & Top Stories':'Others',
'Featured & Uncategorized':'Others',
'Entertainment & Infotainment':'Others',
'Entertainment & Lifestyle & Lifestyle':'Others',
'sci-tec & World':'Others',
'Health & Lifestyle':'Others',
'Entertainment & Lifestyle & Uncategorized':'Others',
  },
    'Tribune': {
'Pakistan, Khyber Pakhtunkhwa, Dera Ismail Khan ':'National',
'Pakistan, Khyber Pakhtunkhwa, Dera Ismail Khan':'National',
 'Pakistan, Punjab':'National',
 'World':'International',
 'Pakistan, K-P':'National',
 'Sports':'Others',
 'Pakistan, Business':'Business',
 'Business':'Business',
 'Pakistan':'National',
 'Life & Style, Film, Gossip':'Others',
 'Food':'Others',
 'Life & Style, TV':'Others',
 'Technology':'Others',
 'Sindh':'National',
 'Life & Style, Film':'Others',
 'Life & Style, Gossip':'Others',
 'Life & Style, Music':'Others',
 'Punjab':'National',
 'K-P':'National',
 'Opinion':'Others',
 'Editorial':'Editorial',
 'Balochistan':'National',
 'Sindh, Health':'National',
 'Pakistan, Life & Style':'Others',
 'Jammu & Kashmir, Health':'National',
 'Pakistan, Sindh':'National',
 'K-P, Music':'Others',
 'World, Jammu & Kashmir':'International',
 'Life & Style':'Others',
 'Pakistan, Balochistan':'National',
 'Business, World':'National',
 'Pakistan, World':'National',
 'Gilgit Baltistan':'National',
 'Jammu & Kashmir':'National',
 'World, Technology':'International',
 'Life & Style, Art and Books, Music':'Others',
 'Life & Style, Fashion, Gossip':'Others',
 'Life & Style, Music, Gossip':'Others',
 'Pakistan, Jammu & Kashmir':'National',
 'Sindh, Punjab':'National',
 'Life & Style, Fashion':'Others',
 'Life & Style, Film, TV':'Others',
 'Balochistan, Business':'Business',
 'Life & Style, Health':'Others',
 'World, Sports':'Others',
 'Punjab, Business':'Business',
 'Music, Film':'Others',
 'TV':'Others',
 'Life & Style, Music, Food':'Others',
 'Pakistan, Health':'National',
 'World, Gilgit Baltistan':'National',
 'World, Life & Style':'Others',
 'World, Music':'Others',
 'Balochistan, K-P':'National',
 'Sindh, Technology':'National',
 'Film':'Others',
 'Sindh, Life & Style, Music':'Others',
 'Life & Style, Gossip, TV':'Others',
 'Life & Style, Art and Books':'Others',
 'K-P, Technology':'National',
 'Magazine':'Others',
 'Film, Gossip':'Others',
 'Life & Style, Theatre':'Others',
 'Business, Technology':'Business',
 'Balochistan, Gilgit Baltistan':'National',
 'K-P, Health':'National',
 'Pakistan, Gilgit Baltistan':'National',
 'Life & Style, Film, Fashion':'Others',
 'Fashion':'Others',
 'Punjab, World':'National',
 'Pakistan, Sports':'Others',
 'Pakistan, Technology':'National',
 'Balochistan, Health':'National',
 'Pakistan, Sindh, Art and Books':'Others',
 'Life & Style, Fashion, TV':'Others',
 'World, Health':'International',
 'TV, Theatre':'Others',
 'Life & Style, Food':'Others',
 'Pakistan, Film':'Others',
 'Health':'Others',
 'World, K-P':'National',
 'Life & Style, K-P':'Others',
 'Art and Books':'Others',
 'Opinion, Health':'Others',
 'Life & Style, Music, TV':'Others',
 'Sports, Life & Style':'Others',
 'Sindh, Jammu & Kashmir':'National',
 'Sindh, Business':'Business',
 'Life & Style, Film, Theatre':'Others',
 'Sindh, Sports':'Others',
 'archives':'Others',
 'Music, Health':'Others',
 'Punjab, Technology':'National',
 'Punjab, Health':'National',
 'Life & Style, Food, Gossip':'Others',
 'Life & Style, Food, Health':'Others',
 'Sports, K-P':'Others',
 'World, archives':'International',
 'Sindh, Life & Style':'Others',
 'Life & Style, Music, Theatre':'Others',
 'World, Videos':'Others',
 'Sports, Videos':'Others',
 'Opinion, Technology':'Others',
 'Sindh, World':'National',
 'Pakistan, K-P, Health':'National',
 'Pakistan, Sindh, Punjab':'National',
 'Life & Style, Opinion':'Others',
 'Balochistan, Life & Style':'Others',
 'Life & Style, Art and Books, Film':'Others',
 'Pakistan, Opinion':'National',
 'Punjab, Jammu & Kashmir':'National',
 'World, Opinion':'International',
 'Punjab, Sports':'Others',
 'Life & Style, Technology':'Others',
 'Gossip':'Others',
 'Life & Style, Music, Film':'Others',
 'Life & Style, Health, TV':'Others',
 'Technology, Games':'Others',
 'Pakistan, World, Jammu & Kashmir':'National',
 'Life & Style, Music, Fashion':'Others',
 'Life & Style, Art and Books, Health':'Others',
 'Sindh, Videos':'Others',
 'Punjab, Food':'Others',
 'Life & Style, Film, Health':'Others',
 'Sports, Multan, Cities':'Others',
 'Music, Fashion':'Others',
 'World, Fashion':'Others',
 'Videos':'Others',
 'Music, Gossip':'Others',
 'World, Food, Technology':'International',
 'Food, Health':'Others',
 'Gossip, TV':'Others',
 'Business, Jammu & Kashmir':'Business',
 'Sindh, Balochistan':'National',
 'Opinion, Editorial':'Editorial',
 'Pakistan, archives':'National',
 'Jammu & Kashmir, Gilgit Baltistan':'National',
 'Punjab, K-P':'National',
 'Business, K-P':'Business',
 'Life & Style, Fashion, Health':'Others',
 'World, Azad Jammu & Kashmir':'International',
 'Life &amp; Style, TV':'Others',
 'Sindh, Business, Health':'National',
 'Sports, Business':'National',
 'Punjab, Business, Lahore, Cities':'National',
 'Punjab, Life & Style, Gossip':'Others',
 'Life & Style, Art and Books, TV':'Others',
 'Sindh, Karachi, Cities':'National',
 'Sindh, archives':'National',
 'Pakistan, Islamabad':'National',
 'life and style':'Others',
 'life and style, Music':'Others',
 'Islamabad':'National',
 'Pakistan, Health, Food':'Others',
 'Pakistan, Khyber-Pakhtunkhwa':'National',
 'Music, Film, Theatre':'Others',
 'Music':'Others',
 'Cricket':'Others',
 'Film, Gossip, Bollywood':'Others',
 'Khyber-Pakhtunkhwa':'National',
 'Sindh, Karachi':'National',
 'Khyber-Pakhtunkhwa, Pakistan':'National',
 'World, Azad Jammu & Kashmir, Cities':'National',
 'Pakistan, Islamabad, Cities':'National',
 'Gossip, TV, Music':'Others',
 'Music, Pakistan, Life & Style':'Others',
 'Bollywood, Film':'Others',
 'Pakistan, Azad Jammu & Kashmir':'National',
 'Art and Books, Film, Games':'Others',
 'Pakistan, Khyber-Pakhtunkhwa, Cities':'National',
 'Sindh, Hyderabad':'National',
 'Sindh, Cities':'National',
 'Pakistan, Sindh, Cities':'National',
 'Art and Books, Film':'Others',
 'Pakistan, Lahore':'Others',
 'Sports, TV, Gossip':'Others',
 'Punjab, Pakistan, Lahore':'National',
 'Games':'Others',
 'Khyber-Pakhtunkhwa, Swat':'National',
 'Pakistan, Sindh, Karachi, Business':'Business',
 'Pakistan, Punjab, Khyber-Pakhtunkhwa':'National',
 'Football':'Others',
 'Pakistan, Peshawar':'National',
 'TV, Sports':'Others',
 'Khyber-Pakhtunkhwa, Cities, Peshawar':'National',
 'Rawalpindi':'National',
 'Slideshows, World':'Others',
 'Pakistan, Azad Jammu & Kashmir, Cities':'National',
 'Pakistan, Cities, Khyber-Pakhtunkhwa':'National',
 'Gossip, Film':'Others',
 'Health, ADVICE':'Others',
 'TV, Film':'Others',
 'Pakistan, Lahore, Cities':'National',
 'Pakistan, Gwadar, Cities':'National',
 'Rawalpindi, Punjab':'National',
 'Punjab, Rawalpindi':'National',
 'Khyber-Pakhtunkhwa, Peshawar':'National',
 'Punjab, Lahore':'National',
 'Pakistan, Balochistan, Cities':'National',
 'Sindh, khairpur':'National',
 'Sports, TV':'Others',
 'Islamabad, Pakistan, Cities':'National',
 'Pakistan, Khyber-Pakhtunkhwa, Punjab':'National',
 'Pakistan, Sindh, Karachi':'National',
 'Khyber-Pakhtunkhwa, Abbottabad':'National',
 'TV, Gossip':'Others',
 'Pakistan, Khyber-Pakhtunkhwa, Gilgit-Baltistan':'National',
 'World, Pakistan':'National',
 'Khyber-Pakhtunkhwa, Cities':'National',
 'Fashion, Gossip':'Others',
 'Islamabad, World':'National',
 'Punjab, Multan':'National',
 'Punjab, Faisalabad':'National',
 'Pakistan, Islamabad, Sindh':'National',
 'Pakistan, Karachi, Cities':'National',
 'Islamabad, Business':'Business',
 'Sindh, Pakistan':'National',
 'Pakistan, Islamabad, Punjab':'National',
 'Pakistan, Sindh, Karachi, Cities':'National',
 'Pakistan, Cities, Azad Jammu & Kashmir':'National',
 'Health, Life & Style, ADVICE':'Others',
 'Islamabad, Balochistan':'National',
 'Pakistan, Gilgit-Baltistan':'National',
 'Pakistan, Punjab, Cities':'National',
 'Sindh, tharparkar':'National',
 'Business, Gilgit-Baltistan':'Business',
 'Khyber Pakhtunkhwa':'National',
 'World, Bollywood':'Others',
 'Film, TV, Life & Style, life and style':'Others',
 'TV, Film, Life & Style':'Others',
 'Film, Life & Style':'Others',
 'Pakistan, Cities, Lahore':'National',
 'Life & Style, Gossip, Film':'Others',
 'Film, Art and Books':'Others',
 'Food, ADVICE, Health':'Others',
 'Gossip, Fashion':'Others',
 'Gossip, Life & Style':'Others',
 'Pakistan, Khyber-Pakhtunkhwa, Mardan':'National',
 'Pakistan, Punjab, Lahore':'National',
 'Life & Style, Gossip, Fashion':'Others',
 'Pakistan, Cities':'National',
 'Islamabad, Rawalpindi':'National',
 'Islamabad, Pakistan':'National',
 'Pakistan, Islamabad, World':'National',
 'Pakistan, Sindh, Nawabshah':'National',
 'Balochistan, Islamabad, Pakistan':'National',
 'Fashion, Life & Style':'Others',
 'Health, ADVICE, Life & Style':'Others',
 'Music, Life & Style':'Others',
 'Pakistan, Peshawar, Islamabad, Cities':'National',
 'Punjab, Gilgit-Baltistan':'National',
 'Health, World':'Others',
 'Gossip, Life & Style, Music':'Others',
 'Gossip, Music':'Others',
 'Gossip, Film, TV':'Others',
 'Technology, Life & Style':'Others',
 'Sindh, Pakistan, Karachi, Cities':'National',
 'Gossip, TV, Film, Life & Style':'Others',
 'Pakistan, World, Islamabad, Cities':'National',
 'Khyber-Pakhtunkhwa, Mardan':'National',
 'Sindh, Nawabshah':'National',
 'Azad Jammu & Kashmir, Pakistan':'National',
 'TV, Gossip, Life & Style':'Others',
 'Film, TV, Life & Style':'Others',
 'Khyber-Pakhtunkhwa, Peshawar, Sports':'Others',
 'Film, Gossip, Life & Style':'Others',
 'Rawalpindi, Islamabad':'National',
 'Sindh, sukkur':'National',
 'Pakistan, World, Islamabad':'National',
 'Pakistan, Sindh, Karachi, Islamabad':'National',
 'Business, Pakistan':'Business',
 'ADVICE, Life & Style, Film':'Others',
 'Film, Life & Style, TV, Gossip':'Others',
 'Music, TV':'Others',
 'Pakistan, Azad Jammu & Kashmir, Islamabad':'National',
 'Music, Gossip, Life & Style':'Others',
 'Islamabad, Cities, Pakistan':'National',
 'Islamabad, Punjab':'National',
 'World, Business':'Business',
 'Health, Life & Style':'Others',
 'Technology, World':'Others',
 'Islamabad, Khyber-Pakhtunkhwa':'National',
 'Pakistan, Sindh, Karachi, Hyderabad':'National',
 'Life & Style, TV, Gossip':'Others',
 'Khyber Pakhtunkhwa, Mardan':'National',
 'ADVICE, Life & Style':'Others',
 'Pakistan, Sindh, Balochistan':'National',
 'Gossip, Life & Style, TV':'Others',
 'Music, Film, Gossip':'Others',
 'Khyber Pakhtunkhwa, Islamabad':'National',
 'Pakistan, World, Azad Jammu & Kashmir':'National',
 'World, Khyber Pakhtunkhwa, Peshawar, Islamabad, Pakistan':'National',
 'Rawalpindi, Pakistan, Business':'Business',
 'Khyber Pakhtunkhwa, Peshawar':'National',
 'Sindh, dadu':'National',
 'Fashion, Gossip, Life & Style':'Others',
 'Pakistan, Karachi, Sindh':'National',
 'Pakistan, Balochistan, Quetta':'National',
 'TV, Life & Style':'Others',
 'Pakistan, Islamabad, World, Cities':'National',
 'Pakistan, Khyber Pakhtunkhwa, Cities':'National',
 'Pakistan, Karachi, Sindh, Cities':'National',
 'Pakistan, Rawalpindi':'National',
 'Punjab, Film':'Others',
 'Khyber Pakhtunkhwa, Swat':'National',
 'Balochistan, Gwadar':'National',
 'TV, Life & Style, Gossip':'Others',
 'Sindh, Karachi, Pakistan, Cities':'National',
 'Pakistan, Sindh, Islamabad':'National',
 'Pakistan, Punjab, Islamabad, Cities':'National',
 'Khyber Pakhtunkhwa, Nowshera':'National',
 'Balochistan, Quetta':'National',
 'TV, Film, Gossip':'Others',
 'Abbottabad':'National',
 'Pakistan, Karachi, Peshawar':'National',
 'Pakistan, Khyber Pakhtunkhwa':'National',
 'Sports, Hockey':'Others',
 'Art and Books, Life & Style':'Others',
 'Pakistan, World, Balochistan':'National',
 'Theatre':'Others',
 'Gossip, Film, Life & Style':'Others',
 'Pakistan, Business, Khyber Pakhtunkhwa':'Business',
 'Film, TV, Gossip, Life & Style':'Others',
 'Life & Style, Sports':'Others',
 'Music, Art and Books, Life & Style, Film':'Others',
 'Pakistan, Sindh, Karachi, Hyderabad, Cities':'National',
 'Life & Style, ADVICE':'Others',
 'Pakistan, Sindh, Punjab, Azad Jammu & Kashmir, Gilgit-Baltistan, Khyber Pakhtunkhwa, Balochistan':'National',
 'Pakistan, Karachi':'National',
 'Fashion, Music':'Others',
 'Film, TV':'Others',
 'Pakistan, Khyber Pakhtunkhwa, Mansehra, Nowshera, Charsadda':'National',
 'Fashion, Gossip, TV':'Others',
 'Karachi, Pakistan':'National',
 'Pakistan, Azad Jammu & Kashmir, Muzaffarabad':'National',
 'Pakistan, Azad Jammu & Kashmir, World':'National',
 'Film, Music, Gossip':'Others',
 'Pakistan, Punjab, Rawalpindi':'National',
 'Pakistan, Lahore, Punjab':'National',
 'World, Newslab':'International',
 'Karachi':'National',
 'Pakistan, Islamabad, Azad Jammu & Kashmir':'National',
 'Gossip, Fashion, Life & Style':'Others',
 'Pakistan, Rawalpindi, Karachi':'National',
 'Art and Books, Gossip, Life & Style':'Others',
 'Film, Theatre, Life & Style':'Others',
 'Pakistan, Punjab, Islamabad':'National',
 'Gilgit-Baltistan, Pakistan':'National',
 'Peshawar, Khyber Pakhtunkhwa, Pakistan':'National',
 'Khyber Pakhtunkhwa, Pakistan, Cities':'National',
 'Life & Style, TV, Film':'Others',
 'Punjab, Pakistan':'National',
 'Business, Sindh, Karachi':'Business',
 'Lahore, Pakistan':'National',
 'Pakistan, World, Business':'Business',
 'Life & Style, Bollywood, Art and Books':'Others',
 'TV, Fashion':'Others',
 'Khyber Pakhtunkhwa, Pakistan, Peshawar':'National',
 'Pakistan, Punjab, Faisalabad':'National',
 'Life & Style, Bollywood':'Others',
 'Pakistan, Punjab, Gujranwala':'National',
 'Pakistan, Khyber Pakhtunkhwa, Peshawar':'National',
 'Gossip, Bollywood':'Others',
 'Khyber Pakhtunkhwa, Pakistan, Charsadda':'National',
 'Pakistan, World, Technology':'International',
 'Pakistan, Balochistan, Khyber Pakhtunkhwa':'National',
 'Gilgit-Baltistan, Pakistan, Skardu':'National',
 'Punjab, Pakistan, Gujranwala':'National',
 'Sindh, Pakistan, Karachi':'National',
 'Balochistan, Pakistan':'National',
 'Pakistan, Sindh, sukkur':'National',
 'T.Edit':'Others',
'Fashion, Film':'Others',
'Pakistan, Azad Jammu & Kashmir, Rawalpindi':'National',
'Pakistan, Punjab, Multan':'National',
'Khyber Pakhtunkhwa, Pakistan':'National',
'Gilgit-Baltistan':'National',
'Pakistan, Islamabad, Rawalpindi':'National',
'Pakistan, Khyber Pakhtunkhwa, Mansehra':'National',
'World, Pakistan, Azad Jammu & Kashmir':'National',
'Pakistan, Gilgit-Baltistan, gilgit':'National',
'Sindh, Karachi, Pakistan':'National',
'Pakistan, Khyber Pakhtunkhwa, Punjab':'National',
'Pakistan, Khyber Pakhtunkhwa, Swat':'National',
'Pakistan, Islamabad, Karachi':'National',
'Pakistan, Sindh, Balochistan, Karachi':'National',
'Pakistan, Islamabad, Lahore':'National',
'Food, Life & Style':'Others',
'Film, Music':'Others',
'Pakistan, POLITICS':'National',
'Pakistan, World, Multan, Punjab':'National',
'Pakistan, Sindh, Balochistan, Islamabad, Khyber Pakhtunkhwa, Punjab':'National',
'Pakistan, Sindh, Punjab, Balochistan, Khyber Pakhtunkhwa':'National',
'Art and Books, Fashion':'Others',
'Pakistan, Gilgit-Baltistan, Abbottabad':'National',
'World, Pakistan, Islamabad':'National',
'Pakistan, Punjab, Khyber Pakhtunkhwa, Bannu, Dera Ismail Khan':'National',
'Pakistan, Islamabad, Khyber Pakhtunkhwa':'National',
'Pakistan, Sindh, Karachi, Quetta, Balochistan':'National',
'Life & Style, Business':'Others',
'Pakistan, Punjab, Lahore, Islamabad':'National',
'Punjab, Pakistan, Rawalpindi':'National',
'Hockey':'Others',
'Pakistan, Sindh, khairpur':'National',
'Pakistan, Islamabad, Sindh, Punjab, Balochistan, Khyber Pakhtunkhwa':'National',
'Punjab, Lahore, Pakistan':'National',
'Khyber Pakhtunkhwa, Pakistan, Swat':'National',
'Pakistan, Punjab, Sindh, Balochistan, Islamabad, Gilgit-Baltistan, Azad Jammu & Kashmir':'National',
'World, Pakistan, Balochistan':'National',
'Pakistan, Quetta':'National',
'Multan, Pakistan, Punjab':'National',
'Pakistan, Cricket':'Others',
'Gossip, Art and Books':'Others',
'Karachi, Pakistan, Islamabad':'National',
'Sindh, Pakistan, Cities':'National',
'Karachi, Sindh':'National',
'Pakistan, Rawalpindi, Punjab':'National',
'Islamabad, Pakistan, Punjab':'National',
'Lahore':'National',
'Azad Jammu & Kashmir':'National',
'Islamabad, Peshawar':'National',
'Pakistan, Islamabad, Punjab, Khyber Pakhtunkhwa, Balochistan':'National',
'Pakistan, Islamabad, Lahore, Punjab':'National',
'Sindh, Pakistan, sukkur':'National',
'Khyber Pakhtunkhwa, Peshawar, Pakistan':'National',
'Pakistan, Punjab, Sindh':'National',
'Islamabad, Karachi, Lahore, Pakistan':'National',
'Karachi, Pakistan, Sindh':'National',
'Pakistan, Peshawar, Khyber Pakhtunkhwa':'National',
'Pakistan, Punjab, Islamabad, Khyber Pakhtunkhwa, Balochistan, Azad Jammu & Kashmir, Gilgit-Baltistan, Sindh':'National',
'Islamabad, Rawalpindi, Pakistan':'National',
'Khyber Pakhtunkhwa, Gilgit-Baltistan':'National',
'Pakistan, Khyber Pakhtunkhwa, Nowshera':'National',
'Quetta, Pakistan, Balochistan':'National',
'Pakistan, Sindh, Karachi, Punjab, Lahore':'National',
'Life & Style, World':'Others',
'Pakistan, Islamabad, Business':'Business',
'Islamabad, Khyber Pakhtunkhwa':'National',
'Pakistan, Punjab, Khyber Pakhtunkhwa':'National',
'Pakistan, Quetta, Balochistan':'National',
'Pakistan, Islamabad, Mirpur':'National',
'Pakistan, Punjab, Khyber Pakhtunkhwa, Sindh':'National',
'Health, Latest':'Others',
'Pakistan, Balochistan, Zhob':'National',
'Technology, Business':'Business',
'Lahore, Punjab':'National',
'Pakistan, Muzaffarabad':'National',
'Pakistan, Sindh, Opinion':'National',
'Pakistan, Khyber Pakhtunkhwa, Sindh':'National',
'Pakistan, Karachi, World':'National',
'Technology, Business, World':'Business',
'Pakistan, Islamabad, Gilgit-Baltistan':'National',
'Peshawar, Khyber Pakhtunkhwa':'National',
'Khyber Pakhtunkhwa, Pakistan, Islamabad':'National',
'Pakistan, Sindh, Life & Style':'Others',
'Pakistan, Sindh, Hyderabad':'National',
'gilgit':'National',
'Sports, Tennis':'Others',
'Sports, Football':'Others',
'Pakistan, Gilgit-Baltistan, Islamabad':'National',
'TV, Music, Fashion':'Others',
'Pakistan, Business, Islamabad':'Business',
'Pakistan, Karachi, Sindh, Balochistan':'National',
'Fashion, TV, Music':'Others',
'World, Pakistan, Sports':'Others',
'Sports, World':'Others',
'Pakistan, Sports, Cricket':'Others',
'Sports, Pakistan':'Others',
'Technology, Pakistan':'Others',
'Pakistan, gilgit':'National',
'Film, Fashion':'Others',
'TV, Film, Music':'Others',
'TV, Music':'Others',
'Film, TV, Art and Books':'Others',
'Karachi, Life & Style':'Others',
'Music, Film, TV':'Others',
'Azad Jammu & Kashmir, Khyber Pakhtunkhwa':'National',
'POLITICS, World':'International',
'Pakistan, Punjab, Opinion':'National',
'Pakistan, Gwadar, Balochistan':'National',
'POLITICS':'National',
'Film, Music, TV, Art and Books':'Others',
'Gossip, TV, Film':'Others',
'Multimedia':'Others',
'Azad Jammu & Kashmir, World':'International',
'Pakistan, Technology, Sindh, Karachi':'National',
'Pakistan, Sindh, tharparkar':'National',
'Pakistan, Sindh, Sanghar':'National',
'Fashion, TV':'Others',
'Trends':'Others',
'POLITICS, Pakistan':'National',
'Life & Style, Spotlight':'Others',
'Spotlight':'Others',
'Film, Art and Books, TV, Music':'Others',
'POLITICS, Life & Style':'Others',
'Spotlight, Music':'Others',
'Music, Spotlight':'Others',
'Gossip, Spotlight':'Others',
'Spotlight, TV':'Others',
'Spotlight, Gossip':'Others',
'Pakistan, Khyber Pakhtunkhwa, Mardan':'National',
'Spotlight, Film':'Others',
'Health, Spotlight':'Others',
'Spotlight, Fashion':'Others',
'Technology, Food':'Others',
'Art and Books, Spotlight':'Others',
'Pakistan, Balochistan, Derabugti':'National',
'Sports, Pakistan, Cricket':'Others',
'Pakistan, Balochistan, Gwadar':'National',
'Pakistan, Life & Style, MOVIES':'Others',
'Film, Spotlight':'Others',
'Business, Life & Style':'Business',
'Pakistan, Cricket, Sports':'Others',
'Pakistan, Balochistan, Football, Gwadar':'Others',
'Pakistan, Sindh, Thatta':'National',
'Karachi, Sindh, Pakistan':'National',
'Pakistan, Faisalabad, Punjab':'National',
'TV, Spotlight':'Others',
'Technology, Health':'Others',
'Health, Technology':'Others',
'Pakistan, Swat, Khyber Pakhtunkhwa':'National',
'Sports, Technology':'Others',
'Pakistan, Hyderabad, Sindh':'National',
'Spotlight, Food':'Others',
'Bollywood, Gossip':'Others',
'Technology, Sports':'Others',
'Pakistan, Larkana':'National',
'Pakistan, Jacobabad':'National',
'Art and Books, Theatre':'Others',
'World, Azad Jammu & Kashmir, Pakistan':'National',
'Sports, Spotlight':'Others',
'Spotlight, Sports':'Others',
'Latest':'Others',
'Punjab, Islamabad':'National',
'Tennis':'Others',
'Football, Sports':'Others',
'Pakistan, Business, World':'Business',
'Pakistan, Business, Life & Style':'Business',
'Fashion, Spotlight':'Others',
    }
}
# Function to map categories based on the category_mappings dictionary
def map_categories(row):
    """Translate one article's source-specific category into a main category.

    Reads ``row['source']`` and ``row['categories']`` and looks the pair up in
    the module-level ``category_mappings`` (source -> {category -> main
    category}).

    Returns:
        The mapped main category.  When the source or the category is
        unknown, the miss is printed, the module-level ``others_count`` is
        increased by one and ``'Others'`` is returned.
    """
    global others_count

    source = row['source']
    category = row['categories']

    known_for_source = category_mappings.get(source, {})
    if category in known_for_source:
        return known_for_source[category]

    # Fallback: count and report the unmapped pair, then bucket it as 'Others'
    others_count += 1
    print(f'Category mapping not found for row: Source={source}, Category={category}')
    return 'Others'


# Reset the miss counter that map_categories() increments for unmapped rows
others_count = 0

# Derive the main category of every article, one row at a time
merged_df['main_cat'] = merged_df.apply(map_categories, axis='columns')

# Report how many rows fell back to "Others" because no mapping was found
print(f'Total number of rows with "Others": {others_count}')


In [None]:
# List the distinct main categories produced by the mapping step.
# The 'main_cat' column is written to merged_df, so it is read from merged_df.
main_categories = merged_df['main_cat'].unique()
print(main_categories)

# Text Preprocessing

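Select only Business, International (world) and National news by dropping every article whose main category is 'Others' (any other label, such as 'Editorial', is kept as well)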

In [None]:
# Discard every article that landed in the catch-all 'Others' bucket
merged_df = merged_df.loc[merged_df['main_cat'].ne('Others')]
print(merged_df.shape)

In [None]:
# Show which main categories are still present in merged_df
main_categories = pd.unique(merged_df['main_cat'])
print(main_categories)

In [None]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """Normalise raw article text into a lower-cased, lemmatised and stemmed string.

    Steps, in order: collapse whitespace runs, lower-case, strip known site
    boilerplate (the "Published in Dawn, <date>" footer, the Google AdSense
    loader snippet and the Business Recorder copyright line), drop HTML tags,
    tokenize with NLTK, then lemmatise and Porter-stem every token.

    Args:
        text: Raw article text.  Anything that is not a ``str`` (``None``, or
            the float NaN pandas uses for missing cells) is treated as empty.

    Returns:
        The processed tokens joined by single spaces; ``''`` for non-string
        input.
    """
    if not isinstance(text, str):
        return ''

    # Collapse every whitespace run (newlines, tabs, repeated spaces) to one space
    text = re.sub(r'\s+', ' ', text)

    # Lower-case before stripping boilerplate so that one lower-case pattern
    # per item matches every capitalisation of it
    text = text.lower()

    # Remove the "published in dawn, <month> <day>, <year>" footer
    text = re.sub(r'published in dawn, [a-z]+\s\d{1,2}(st|nd|rd|th)?,\s\d{4}', '', text)

    # Remove "(adsbygoogle = window.adsbygoogle || [ ] ) .push ( { } )"
    text = re.sub(r'\(adsbygoogle\s*=\s*window\.adsbygoogle\s*\|\|\s*\[\s*\]\s*\)\s*\.\s*push\s*\(\s*\{\s*\}\s*\)', '', text)

    # Remove the Business Recorder copyright notice
    text = re.sub(r'copyright business recorder, \d{4}', '', text)

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Tokenize, then lemmatise and stem each token independently
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    return ' '.join(stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens)

def text_for_display(text):
    """Return a human-readable version of *text* with site boilerplate removed.

    Whitespace runs are collapsed to one space, then the "Published in Dawn,
    <date>" footer, the Google AdSense loader snippet, the "Copyright Business
    Recorder, <year>" line and any HTML tags are deleted.  Letter case and
    punctuation are left untouched.

    Args:
        text: Raw article text.  Anything that is not a ``str`` (``None``, or
            the float NaN pandas uses for missing cells) is treated as empty.

    Returns:
        The tidied text; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'\s+', ' ', text)  # Collapse extra whitespace
    # Remove the "Published in Dawn, <month> <day>, <year>" footer
    text = re.sub(r'Published in Dawn, [A-Za-z]+\s\d{1,2}(st|nd|rd|th)?,\s\d{4}', '', text)
    # The dot in "window.adsbygoogle" is escaped so it only matches a literal '.'
    text = re.sub(r'\(adsbygoogle\s*=\s*window\.adsbygoogle\s*\|\|\s*\[\s*\]\s*\)\s*\.\s*push\s*\(\s*\{\s*\}\s*\)', '', text)
    text = re.sub(r'Copyright Business Recorder, \d{4}', '', text)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    return text

_ENGLISH_STOP_WORDS = None  # filled on first use by remove_stopwords()


def remove_stopwords(text):
    """Drop English stop words from *text*.

    The text is split with NLTK's ``word_tokenize``; a token is discarded when
    its lower-cased form is in NLTK's English stop-word list.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Load the list once instead of rebuilding the set for every row this
        # function is applied to
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word.lower() not in _ENGLISH_STOP_WORDS)

def remove_punc(text):
    """Delete every character that is neither a word character nor whitespace.

    All punctuation goes, periods included; underscores stay because the
    regex word-character class contains them.
    """
    return re.sub(r'[^\w\s]', '', text)

def extract_first_15_sentences(text, max_sentences=15):
    """Return the leading sentences of *text* joined by single spaces.

    Args:
        text: The text to truncate.  Anything that is not a non-empty ``str``
            (``None``, the float NaN pandas uses for missing cells, ``''``)
            yields an empty string.
        max_sentences: How many sentences to keep, counted from the start
            (15 by default, matching the function name).

    Returns:
        At most ``max_sentences`` sentences, as split by NLTK's
        ``sent_tokenize``, joined with a space; ``""`` when there is no text.
    """
    if not isinstance(text, str) or not text:
        return ""
    return ' '.join(sent_tokenize(text)[:max_sentences])

def clean_wo_stemming_tokenization(text):
    """Normalize article text without stemming or lemmatizing it.

    Steps, in order: collapse whitespace runs, drop the "Published in Dawn"
    footer (matched before lowercasing, so it is case-sensitive), lowercase,
    drop the adsbygoogle snippet, the Business Recorder copyright notice and
    HTML-like tags, then re-space the text by joining NLTK word tokens with
    single spaces.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    without_footer = re.sub(r'Published in Dawn, [A-Za-z]+\s\d{1,2}(st|nd|rd|th)?,\s\d{4}', '', collapsed)
    lowered = without_footer.lower()

    # These patterns run on the lowercased text, hence the lowercase copyright pattern.
    boilerplate_patterns = (
        r'\(adsbygoogle\s*=\s*window.adsbygoogle\s*\|\|\s*\[\s*\]\s*\)\s*\.\s*push\s*\(\s*\{\s*\}\s*\)',
        r'copyright business recorder, \d{4}',
        r'<[^>]+>',
    )
    for pattern in boilerplate_patterns:
        lowered = re.sub(pattern, '', lowered)

    return ' '.join(word_tokenize(lowered))

# Headline and body joined into one field; the derived text columns are built from it.
combined = merged_df['Headline'] + ' ' + merged_df['News']
merged_df['Combined'] = combined

# Output of clean_text, then with stopwords removed, then with punctuation removed as well.
merged_df['Combined_Cleaned'] = combined.apply(clean_text)
merged_df['CC_wo_Stopwords'] = merged_df['Combined_Cleaned'].apply(remove_stopwords)
merged_df['CC_wo_Stopwords_Punc'] = merged_df['CC_wo_Stopwords'].apply(remove_punc)

# Boilerplate-stripped text with punctuation removed (text_for_display, then remove_punc).
merged_df['CC_wo_Stem_Lema_Punc'] = combined.apply(text_for_display).apply(remove_punc)

# Display version of the article body alone.
merged_df['Cleaned_description'] = merged_df['News'].apply(text_for_display)

print(merged_df.info())
print(merged_df.shape)


In [None]:
# Lowercase 'CC_wo_Stem_Lema_Punc' in place. It was built with text_for_display,
# which leaves case untouched.
merged_df['CC_wo_Stem_Lema_Punc'] = merged_df['CC_wo_Stem_Lema_Punc'].str.lower()

In [None]:
# Character length of the longest 'Combined_Cleaned' entry (0 when the column is empty).
max_length = max((len(text) for text in filtered_df['Combined_Cleaned']), default=0)

print("Maximum Length of 'Combined_Cleaned':", max_length)

# Renaming Columns

In [None]:
# Rename the raw lowercase columns to the names used in the rest of the notebook,
# then write a snapshot to disk.
column_mapping = dict(
    headline='Headline',
    date='DateTime',
    link='Url',
    source='Source',
    categories='Category',
    description='News',
)

merged = merged.rename(columns=column_mapping)
print(merged.shape)
merged.to_csv('updated.csv', index=False)

In [None]:
# Rename 'Source' to lowercase 'source'.
column_mapping = dict(Source='source')

merged = merged.rename(columns=column_mapping)

In [None]:
# Rename 'Sentiment_Prediction' to 'sentiment'.
column_mapping = dict(Sentiment_Prediction='sentiment')
merged = merged.rename(columns=column_mapping)

# Deciding the sub-category of each news item from keyword lists

In [None]:
# Combine single-word and double-word keywords, symbols, and company names for the Oil & Gas sector
# Keywords, ticker symbols and company names for the Oil & Gas sector.
# Every distinct entry of the original list is kept once, in first-occurrence
# order; the original repeated many entries up to four times.
# NOTE(review): generic words such as "Public", "Platform", "Production",
# "Bone", "Boring" and "Vein" look likely to over-assign this sector — confirm.
oil_gas_keywords = [
    # General sector vocabulary
    "Crude Oil", "Natural Gas", "Exploration", "Drilling", "Reservoir",
    "Refining", "Petrochemical", "Upstream", "Downstream", "Midstream",
    "Offshore", "Onshore", "Oilfield Services",
    "Exploration and Production (E&P)", "Refinery",
    "Exploration and Production Companies", "Oil Price", "Gas Price",
    "Oilfield Technology", "Energy Sector", "Energy Stocks",
    "Commodity Prices", "Energy Demand",
    "OPEC (Organization of the Petroleum Exporting Countries)",
    "Shale Oil", "Shale Gas", "Reserves", "Crude Oil Inventories",
    "Natural Gas Storage", "Energy Infrastructure", "Energy Supply Chain",
    "Oil Rig", "Gas Pipeline", "Energy Exploration Contracts",
    "Energy Market Trends", "Energy Investments", "Energy Policy",
    "Energy Regulations", "Renewable Energy Competition",
    # "<ticker> <company name>" entries
    "MARI Mari Petroleum Company Limited",
    "OGDC Oil & Gas Development Company Limited",
    "POL Pakistan Oilfields Limited",
    "PPL Pakistan Petroleum Limited",
    "APL Attock Petroleum Limited",
    "BPL Burshane LPG (Pakistan) Limited",
    "HASCOLDEF Hascol Petroleum Limited",
    "HTL Hi-Tech Lubricants Limited",
    "OBOY Oilboy Energy Limited",
    "PSO Pakistan State Oil Company Limited",
    "SHEL Shell Pakistan Limited",
    "SNGP Sui Northern Gas Pipelines Limited",
    "SSGC Sui Southern Gas Company Limited",
    # Further terms and company names
    "Carbon Emissions", "Cubic Feet", "Exploration and Production",
    "Well Integrity", "Sui Southern Gas Company", "Public", "Reserve",
    "Burshane LPG", "Total Parco", "Sui Northern Gas Pipelines Limited",
    "Basic Power", "Oil & Gas Development Company", "Barrel",
    "LPG Association", "Offshore Drilling",
    "Khyber Pakhtunkhwa Oil & Gas Company (KPOGCL)", "Oil Reserves",
    "Pakistan Refinery Limited (PRL)", "High-Tech Lubricants Limited (HTL)",
    "Zeba", "Hydraulic Fracturing", "Drill Bit", "Energy Mix", "Pipelines",
    "LNG", "Carbon Storage", "Mari Petroleum", "Kannauj",
    "Pakistan Petroleum Limited (PPL)", "Commercial Cylinder",
    "Oil Production", "Shell Oil and Gas", "Liquefied Natural Gas",
    "Energy Security", "Attock Refinery Limited", "Engro Elengy Terminal",
    "Sui Southern", "Platform", "Carbon Capture and Storage",
    "Pakistan State Oil", "Pakistan Petroleum Limited", "KPOGCL",
    "Pakistan Refinery Limited", "PPL", "Oilboy Energy Limited",
    "Hascol Petroleum", "Pakistan Oilfields Limited", "Puma Energy Pakistan",
    "Attock Petroleum Limited", "OBOY", "Cenergyico", "Bone", "Coastal",
    "Petroleum Products", "Diesel", "United Energy Pakistan",
    "Pipeline Installation", "Pak-Arab Refinery", "NGLs",
    "Emissions Reduction", "SSGC", "Bone Crystallization", "Mud Logging",
    "Ministry of Energy", "Boring", "Pakistan Petroleum", "Cylinder",
    "Better Oil Recovery", "Fracking", "Vein", "Nishpa Drilling Lease",
    "Production", "Fuel Extraction in Pakistan",
]
# Combine single-word and double-word keywords, symbols, and company names
cement_keywords = [
    "Cement",
    "Cement Manufacturing",
    "Concrete",
    "Construction Materials",
    "Building Materials",
    "Cement Production",
    "Cement Companies",
    "Cement Industry",
    "Cement Stocks",
    "Cement Demand",
    "Cement Prices",
    "Cement Sales",
    "Cement Plants",
    "Cement Market",
    "Cement Production Capacity",
    "Cement Export",
    "Cement Import",
    "Cement Trade",
    "Cement Infrastructure",
    "Cement Projects",
    "Cement Supply Chain",
    "Cement Consumption",
    "Cement Manufacturing Companies",
    "Cement Sector Trends",
    "Cement Investments",
    "Cement Regulations",
    "Cement Production Growth",
    "Cement Stock Performance",
    "Cement Sector Analysis",
    "Cement Sector News",
    "DCL",
    "Dewan Cement Limited",
    "DGKC",
    "D.G. Khan Cement Company Limited",
    "DNCCDEF",
    "Dandot Cement Company Limited",
    "FCCL",
    "Fauji Cement Company Limited",
    "FECTC",
    "Fecto Cement Limited",
    "FLYNG",
    "Flying Cement Company Limited",
    "GWLC",
    "Gharibwal Cement Limited",
    "KOHC",
    "Kohat Cement Company Limited",
    "LUCK",
    "Lucky Cement Limited",
    "MLCF",
    "Maple Leaf Cement Factory Limited",
    "PIOC",
    "Pioneer Cement Limited",
    "POWER",
    "Power Cement Limited",
    "SMCPL",
    "Safe Mix Concrete Limited",
    "THCCL",
    "Thatta Cement Company Limited",
    "ZELPDEF",
    "Zeal Pak Cement Factory Limited"
]
# Macro-economic keywords. Every distinct entry of the original list is kept
# once, in first-occurrence order; the original repeated many of them.
economic_keywords = [
    "Trading", "Interest Rate", "Trade Balance", "Federal Board of Revenue", "Foreign Direct Investment",
    "Poverty", "Supply and Demand", "Economic Cycle", "Interest Rates", "Market Forces",
    "Pakistan International Airlines (PIA)", "Depression", "Productivity", "Yield", "Recession",
    "Consumer Price Index (CPI)", "Valuable Publications", "Central Bank", "Market Capitalization",
    "Trade", "Economic Policy", "International Monetary Fund (IMF)", "Electricity Bills", "Decrease in Demand",
    "Exchange Rate", "Innovation", "Public Debt", "Dollar", "European Central Bank", "Money",
    "Monetary Policy", "Inflation", "Market Performance", "Infrastructure",
    "Pakistan Stock Exchange (PSX)", "Business", "Prices", "Reforms", "China-Pakistan Economic Corridor (CPEC)",
    "Economic Forecast", "Taxes", "Unemployment", "Economic Development",
    "Pakistan Peoples Party (PPP)", "Competition", "State Bank", "Entrepreneurship",
    "SBP (State Bank of Pakistan)", "Economic Indicators", "Debt",
    "Bill", "Commercial Banks", "Fiscal Policy", "Investment", "Gross Domestic Product (GDP)", "Globalization",
    "Loan", "Billion", "Budget", "Income Distribution", "Capitalization", "Labor Force", "Surplus",
    "Indirect Foreign Investment", "Shares", "Federal Board of Revenue (FBR)", "Relief", "Producer Price Index (PPI)",
    "Balance of Relationships", "Market Shares", "Exchange", "Taxation",
    "Revenue", "Exchange Publication", "Electricity",
    "State Bank of Pakistan", "FD",
    "Budget Deficit", "Consumer Spending", "Business Investment",
    "Consumer Confidence", "Economic Growth",
    "Tariffs", "FBR",
    "National Debt", "Economic Stimulus", "Economic Recovery",
    "Labor Market", "Industrial Production", "Manufacturing Sector",
]

In [None]:
# Function to find sub-categories based on keywords
# Compiled keyword patterns, built on first use and keyed by case sensitivity.
# Re-run this cell after editing a keyword list so the patterns are rebuilt.
_SUBCATEGORY_PATTERNS = {}


def _keyword_pattern(keywords, case_sensitive=True):
    """Compile one regex that matches any keyword as a whole word or phrase."""
    # Longest first so a full phrase is tried before a keyword that is its prefix.
    unique = sorted({kw for kw in keywords if kw.strip()}, key=lambda kw: (-len(kw), kw))
    if not unique:
        return re.compile(r'(?!)')  # never matches
    alternation = '|'.join(
        # Any whitespace run may separate the words of a phrase.
        r'\s+'.join(re.escape(part) for part in kw.split())
        for kw in unique
    )
    flags = 0 if case_sensitive else re.IGNORECASE
    # \w look-arounds rather than \b: some keywords end in ")" where \b would fail.
    return re.compile(r'(?<!\w)(?:' + alternation + r')(?!\w)', flags)


def find_subcategories(row, case_sensitive=True):
    """Return the sector labels whose keyword lists match the text in *row*.

    The original compared each keyword with single whitespace-separated
    tokens, so multi-word keywords ("Crude Oil") could never match and
    neither could tokens with attached punctuation ("Cement,"). Keywords are
    now matched as whole words or phrases anywhere in the text.

    Args:
        row: Text of one article. Any non-string value (NaN, None) is
            treated as having no text.
        case_sensitive: When True (default, as before) keywords must match
            the text's case exactly; pass False for case-insensitive matching.

    Returns:
        A new list containing any of 'Oil & Gas', 'Cement' and 'Economic',
        in that order, or ['NotAssigned'] when nothing matches.
    """
    if not isinstance(row, str):
        return ['NotAssigned']

    cache_key = bool(case_sensitive)
    patterns = _SUBCATEGORY_PATTERNS.get(cache_key)
    if patterns is None:
        patterns = [
            ('Oil & Gas', _keyword_pattern(oil_gas_keywords, cache_key)),
            ('Cement', _keyword_pattern(cement_keywords, cache_key)),
            ('Economic', _keyword_pattern(economic_keywords, cache_key)),
        ]
        _SUBCATEGORY_PATTERNS[cache_key] = patterns

    sub_categories = [label for label, pattern in patterns if pattern.search(row)]
    return sub_categories or ['NotAssigned']

# Apply find_subcategories to every 'Cleaned_description' value and store the
# resulting list of labels in merged_df['sub-cat'].
merged_df['sub-cat'] = merged_df['Cleaned_description'].apply(find_subcategories)


In [None]:
# Rows whose 'sub-cat' list contains 'Economic', and how many there are.
has_economic = merged_df['sub-cat'].apply(lambda labels: 'Economic' in labels)
economic_rows = merged_df[has_economic]
print(economic_rows)
print(economic_rows.shape)

In [None]:
# Headlines of the rows tagged 'Economic'.
tagged_economic = merged_df['sub-cat'].apply(lambda labels: 'Economic' in labels)
economic_news = merged_df.loc[tagged_economic, 'Headline']
print(economic_news)

In [None]:
# Rows whose 'sub-cat' list contains 'Oil & Gas', and how many there are.
has_oil_gas = merged_df['sub-cat'].apply(lambda labels: 'Oil & Gas' in labels)
oil_rows = merged_df[has_oil_gas]
print(oil_rows)
print(oil_rows.shape)

In [None]:
# Headlines of the rows tagged 'Oil & Gas' (rebinds oil_rows to a Series of headlines).
tagged_oil_gas = merged_df['sub-cat'].apply(lambda labels: 'Oil & Gas' in labels)
oil_rows = merged_df.loc[tagged_oil_gas, 'Headline']
print(oil_rows)

In [None]:
# Rows whose 'sub-cat' list contains 'Cement', and how many there are.
has_cement = merged_df['sub-cat'].apply(lambda labels: 'Cement' in labels)
cement_rows = merged_df[has_cement]
print(cement_rows)
print(cement_rows.shape)

In [None]:
# Headlines of the rows tagged 'Cement' (rebinds cement_rows to a Series of headlines).
tagged_cement = merged_df['sub-cat'].apply(lambda labels: 'Cement' in labels)
cement_rows = merged_df.loc[tagged_cement, 'Headline']
print(cement_rows)

In [None]:
# Write the frame, including the 'sub-cat' column, to a local CSV without the index column.
merged_df.to_csv(r"D:\Sana\Extra Material\Updated_2022_Data.csv", encoding='utf-8',index=False)

# Sentiment Extraction

##### Finetuning Finbert For Sentiment Extraction

In [None]:
# Labelled sentiment data for fine-tuning. `error_bad_lines=False` was removed
# in pandas 2.0; `on_bad_lines="warn"` (pandas >= 1.3) keeps the old behaviour of
# warning about malformed rows and skipping them.
data = pd.read_csv("/content/all-data (1).csv", on_bad_lines="warn", encoding="unicode_escape")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Pretrained FinBERT checkpoint used as the starting point for fine-tuning.
finbert_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

In [None]:
# Encode every article in data['News'] as padded/truncated PyTorch tensors.
news_articles = data['News'].tolist()

encoding_options = dict(
    truncation=True,
    padding=True,
    return_attention_mask=True,
    return_tensors='pt',
)
encoded_data = tokenizer.batch_encode_plus(news_articles, **encoding_options)

input_ids, attention_masks = encoded_data['input_ids'], encoded_data['attention_mask']

# Turn the text labels in data['Sent'] into class ids; an unknown label raises KeyError.
sentiments = data['Sent'].tolist()
sentiment_mapping = {'positive': 2, 'negative': 0, 'neutral': 1}
labels = [sentiment_mapping[label_text] for label_text in sentiments]


In [None]:
# Split ids, masks and labels in ONE call so all three share the same row
# permutation by construction, instead of relying on two separate calls
# happening to use the same seed. With random_state=42 and test_size=0.2 the
# resulting split is the same as before.
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids, attention_masks, labels, random_state=42, test_size=0.2
)

# as_tensor converts the label lists and passes existing tensors through
# without the copy (and copy-construct warning) that torch.tensor(tensor) causes.
train_inputs = torch.as_tensor(train_inputs)
train_masks = torch.as_tensor(train_masks)
train_labels = torch.as_tensor(train_labels)

val_inputs = torch.as_tensor(val_inputs)
val_masks = torch.as_tensor(val_masks)
val_labels = torch.as_tensor(val_labels)


In [None]:
# Create Tensor datasets
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Set batch size and create data loaders
batch_size = 8
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Set optimization parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Fine-tune the model
epochs = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for epoch in range(epochs):
    model.train()
    train_loss = 0
    train_predictions = []
    train_true_labels = []

    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        outputs = model(**inputs)
        loss = outputs.loss
        logits = outputs.logits

        train_predictions.extend(logits.argmax(dim=1).tolist())
        train_true_labels.extend(batch[2].tolist())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    # Perform validation after each epoch
    model.eval()
    val_loss = 0
    val_predictions = []
    val_true_labels = []

    for batch in val_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)
            loss = outputs.loss
            logits = outputs.logits

        val_predictions.extend(logits.argmax(dim=1).tolist())
        val_true_labels.extend(batch[2].tolist())

        val_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)

    # Calculate evaluation metrics
    train_accuracy = accuracy_score(train_true_labels, train_predictions)
    train_precision = precision_score(train_true_labels, train_predictions, average='weighted')
    train_recall = recall_score(train_true_labels, train_predictions, average='weighted')
    train_f1 = f1_score(train_true_labels, train_predictions, average='weighted')

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    val_precision = precision_score(val_true_labels, val_predictions, average='weighted')
    val_recall = recall_score(val_true_labels, val_predictions, average='weighted')
    val_f1 = f1_score(val_true_labels, val_predictions, average='weighted')

    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Training loss: {avg_train_loss:.4f}')
    print(f'Training accuracy: {train_accuracy:.4f}')
    print(f'Training precision: {train_precision:.4f}')
    print(f'Training recall: {train_recall:.4f}')
    print(f'Training F1-score: {train_f1:.4f}')
    print(f'Validation loss: {avg_val_loss:.4f}')
    print(f'Validation accuracy: {val_accuracy:.4f}')
    print(f'Validation precision: {val_precision:.4f}')
    print(f'Validation recall: {val_recall:.4f}')
    print(f'Validation F1-score: {val_f1:.4f}\n')



In [None]:
# Directory on the mounted Drive that receives the fine-tuned weights and tokenizer files.
save_directory = '/content/drive/MyDrive/Datasets/finetuned_model/'

# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

##### Loading Saved Finbert For Sentiment Prediction

(Each news article is split into sentences, the sentiment of every sentence is predicted, and the article is given the majority sentiment.)

In [None]:
# NOTE(review): the save cell writes to '/content/drive/MyDrive/Datasets/finetuned_model/',
# but this path has no 'Datasets/' component — confirm the folder was moved, or align the two.
save_directory = '/content/drive/MyDrive/finetuned_model/'

# Restore the fine-tuned classifier and its tokenizer from that directory.
loaded_model, loaded_tokenizer = (
    AutoModelForSequenceClassification.from_pretrained(save_directory),
    AutoTokenizer.from_pretrained(save_directory),
)

In [None]:
# Cleaned headline+body texts as a plain Python list, one entry per row of merged_df.
news_articles =  merged_df['Combined_Cleaned'].tolist()

In [None]:
# Encode all articles in one padded/truncated batch of PyTorch tensors.
# NOTE(review): the per-sentence inference loop further down re-tokenizes every
# sentence and rebinds input_ids, so this batch looks unused — confirm before keeping it.
encoding_options = dict(
    truncation=True,
    padding=True,
    return_attention_mask=True,
    return_tensors='pt',
)
encoded_data = loaded_tokenizer.batch_encode_plus(news_articles, **encoding_options)

input_ids, attention_masks = encoded_data['input_ids'], encoded_data['attention_mask']


In [None]:
# Initialize NLTK's SentimentIntensityAnalyzer.
# NOTE(review): `sia` is not referenced in the cells that follow — confirm it is still needed.
sia = SentimentIntensityAnalyzer()

In [None]:
# Run inference on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to that device and switch it to evaluation mode.
loaded_model = loaded_model.to(device)
loaded_model.eval()

# Class id -> sentiment name; the inverse of the mapping used to encode the training labels.
numerical_to_sentiment = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Function to calculate the majority sentiment
def calculate_majority_sentiment(sentiments):
    """Return the sentiment that strictly outnumbers both others.

    'positive' or 'negative' is returned only when its count is strictly
    greater than each of the other two counts; every other case, including
    ties and an empty list, yields 'neutral'.
    """
    tally = {label: sentiments.count(label) for label in ('positive', 'negative', 'neutral')}

    contenders = (
        ('positive', ('negative', 'neutral')),
        ('negative', ('positive', 'neutral')),
    )
    for label, rivals in contenders:
        if all(tally[label] > tally[rival] for rival in rivals):
            return label
    return 'neutral'

# Predict one sentiment per article: classify every sentence, then take the
# majority with calculate_majority_sentiment (an article with no sentences
# therefore comes out 'neutral').
# The loop reads merged_df — the frame the predictions are later written back
# to — instead of the separate `merged` frame, so the list has one entry per
# merged_df row.
sentiment_predictions = []
for description in merged_df['Combined_Cleaned']:
    sentences = nltk.sent_tokenize(description)
    sentence_sentiments = []

    for sentence in sentences:
        encoded_input = loaded_tokenizer(sentence, truncation=True, padding=True, return_tensors='pt')
        input_ids = encoded_input['input_ids'].to(device)
        attention_mask = encoded_input['attention_mask'].to(device)

        with torch.no_grad():
            outputs = loaded_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

        predicted_class = torch.argmax(logits, dim=1).item()
        sentiment = numerical_to_sentiment[predicted_class]
        sentence_sentiments.append(sentiment)

    majority_sentiment = calculate_majority_sentiment(sentence_sentiments)
    sentiment_predictions.append(majority_sentiment)


In [None]:
# Attach the per-article predictions; the list must have one entry per row.
merged_df['sentiment'] = sentiment_predictions

# Remove every column whose name contains 'Unnamed'.
unnamed_columns = list(filter(lambda col: 'Unnamed' in col, merged_df.columns))
merged_df = merged_df.drop(columns=unnamed_columns)

# Write the result next to the notebook and hand it to the browser as a download (Colab only).
merged_df.to_csv('updated.csv', index=False)

from google.colab import files
files.download('updated.csv')

In [None]:
# Sentiment name -> class id. Series.map turns any value missing from the mapping into NaN.
sentiment_mapping = dict(negative=0, neutral=1, positive=2)

merged_df['sentiment'] = merged_df['sentiment'].map(sentiment_mapping)

# Keyword Extraction

In [None]:
# Function to extract keywords using TextRank
def extract_keywords_textrank(text):
    """Rank the words of *text* with PageRank over a word co-occurrence graph.

    Each sentence is reduced to lowercase alphabetic non-stopword tokens longer
    than two characters. Every remaining word becomes a node and directly
    adjacent words within a sentence are linked. Returns up to 20 words,
    highest PageRank score first.
    """
    sentences = nltk.sent_tokenize(text)
    stop_words = set(stopwords.words('english'))

    def _content_words(sentence):
        return [
            token.lower()
            for token in word_tokenize(sentence)
            if token.lower() not in stop_words and token.isalpha() and len(token) > 2
        ]

    # The filtered words are joined and tokenized again, exactly as the original did.
    sentence_tokens = [word_tokenize(' '.join(_content_words(sentence))) for sentence in sentences]

    graph = nx.Graph()
    for tokens in sentence_tokens:
        graph.add_nodes_from(tokens)

    window_size = 2  # a window of two links each word to its immediate successor only
    for tokens in sentence_tokens:
        for left in range(len(tokens)):
            for right in range(left + 1, min(left + window_size, len(tokens))):
                graph.add_edge(tokens[left], tokens[right])

    scores = nx.pagerank(graph)
    num_keywords = 20
    return sorted(scores, key=scores.get, reverse=True)[:num_keywords]

def extract_keywords_tfidf(text):
    """Return up to 20 of the highest-weighted terms of *text*.

    The vectorizer is fitted on this one document only, so the IDF factor is
    the same for every term and the ranking is driven by in-document term
    frequency. English stop words are excluded and terms of one or two
    characters are dropped from the result.

    Args:
        text: Document to extract keywords from.

    Returns:
        A list of at most 20 terms, highest weight first. Empty when the
        vectorizer raises ValueError for the input, e.g. when the text
        contains nothing but stop words.
    """
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')

    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises "empty vocabulary" when no term survives filtering;
        # an empty result lets a DataFrame.apply over many rows carry on.
        return []

    feature_names = tfidf_vectorizer.get_feature_names_out()

    # A single document gives a one-row matrix; that row holds the term weights.
    weights = tfidf_matrix.toarray()[0]

    ranked_terms = [
        feature_names[index]
        for index in weights.argsort()[::-1]
        if len(feature_names[index]) > 2
    ]

    num_keywords = 20
    return ranked_terms[:num_keywords]

# Function to extract keywords using KeyBERT
# KeyBERT instance shared by all calls; created on first use.
_KEYBERT_MODEL = None


def _get_keybert_model():
    """Return the shared KeyBERT model, building it the first time it is needed."""
    global _KEYBERT_MODEL
    if _KEYBERT_MODEL is None:
        _KEYBERT_MODEL = KeyBERT('distilbert-base-nli-mean-tokens')
    return _KEYBERT_MODEL


def extract_keywords_keybert(text):
    """Return the single-word KeyBERT keywords of *text* that are longer than two characters.

    KeyBERT is asked for its top 20 unigrams with English stop words removed.
    The model is built once and reused; the original rebuilt it on every call,
    i.e. once per DataFrame row.
    """
    keywords = _get_keybert_model().extract_keywords(
        text, keyphrase_ngram_range=(1, 1), stop_words='english', top_n=20
    )
    return [kw[0] for kw in keywords if len(kw[0]) > 2]

# Function to extract keywords using spaCy
# spaCy pipeline shared by all calls; loaded on first use.
_SPACY_NLP = None


def _get_spacy_nlp():
    """Return the shared 'en_core_web_sm' pipeline, loading it the first time it is needed."""
    global _SPACY_NLP
    if _SPACY_NLP is None:
        _SPACY_NLP = spacy.load('en_core_web_sm')
    return _SPACY_NLP


def extract_keywords_spacy(text):
    """Return the first 20 tokens of *text* that pass the keyword filter.

    A token is kept when it is not a spaCy stop word, is longer than two
    characters and is not tagged VERB or ADJ. Tokens come back in document
    order. The pipeline is loaded once and reused; the original reloaded it
    on every call.
    """
    doc = _get_spacy_nlp()(text)
    keywords = [token.text for token in doc if not token.is_stop and len(token.text) > 2 and token.pos_ not in ['VERB', 'ADJ']][:20]
    return keywords

# Run each keyword extractor over the boilerplate-stripped text, one output column per extractor.
keyword_extractors = {
    'Keywords_TextRank': extract_keywords_textrank,
    'Keywords_TFIDF': extract_keywords_tfidf,
    'Keywords_KeyBERT': extract_keywords_keybert,
    'Keywords_spaCy': extract_keywords_spacy,
}
for target_column, extractor in keyword_extractors.items():
    merged_df[target_column] = merged_df['CC_wo_Stem_Lema_Punc'].apply(extractor)

In [None]:
# Flatten each keyword list into one ', '-separated string; any non-list value becomes ''.
for keyword_column in ('Keywords_TextRank', 'Keywords_TFIDF', 'Keywords_KeyBERT', 'Keywords_spaCy'):
    merged_df[keyword_column] = merged_df[keyword_column].apply(
        lambda value: ', '.join(value) if isinstance(value, list) else ''
    )

In [None]:
# Merge the four per-method keyword strings into one list of unique lowercase keywords per row.
keyword_columns = ['Keywords_TextRank', 'Keywords_TFIDF', 'Keywords_KeyBERT', 'Keywords_spaCy']


def _combine_keywords(row):
    """Return the row's keywords, lowercased and de-duplicated in first-seen order."""
    lowered = (word.lower() for col in keyword_columns for word in row[col].split(', '))
    # ''.split(', ') yields [''], so an empty column would otherwise add an
    # empty-string keyword. dict.fromkeys de-duplicates while keeping a stable order.
    return list(dict.fromkeys(word for word in lowered if word))


merged_df['Keywords_CC'] = merged_df.apply(_combine_keywords, axis=1)

# Drop the intermediate columns
merged_df = merged_df.drop(columns=keyword_columns)


In [None]:
# Show the first row as a quick look at the columns built so far.
merged_df.head(1)

# Entity Extraction

##### Using Flair

In [None]:
import pandas as pd
from flair.models import SequenceTagger
from flair.data import Sentence

# Load the Flair English NER tagger. The identifier must be the bare model name;
# the original string contained literal double quotes ('"flair/ner-english"'),
# which is not a valid model id.
tagger = SequenceTagger.load('flair/ner-english')

# Define a function to extract entities from a text using Flair
def extract_entities(text):
    """Tag *text* with the module-level Flair tagger and group the entity strings.

    Returns:
        A ``(persons, organizations, others)`` tuple of lists: spans tagged
        'PER', spans tagged 'ORG', and spans with any other tag, each in the
        order Flair reports them.
    """
    sentence = Sentence(text)
    tagger.predict(sentence)

    by_tag = {'PER': [], 'ORG': []}
    others = []
    for span in sentence.get_spans('ner'):
        # Tags other than PER/ORG fall through to the catch-all list.
        by_tag.get(span.tag, others).append(span.text)

    return by_tag['PER'], by_tag['ORG'], others

# One list of entities per row, aligned with the row order of merged_df.
persons_list, organizations_list, others_list = [], [], []

# Run NER on the stopword-free text of every row.
for stopword_free_text in merged_df['CC_wo_Stopwords']:
    row_persons, row_organizations, row_others = extract_entities(stopword_free_text)
    persons_list.append(row_persons)
    organizations_list.append(row_organizations)
    others_list.append(row_others)

In [None]:
# De-duplicate the entities of each row. Going through a set does not keep their order.
def _unique_per_row(rows):
    return [list(set(entities)) for entities in rows]


persons_list = _unique_per_row(persons_list)
organizations_list = _unique_per_row(organizations_list)
others_list = _unique_per_row(others_list)

# Store the per-row entity lists as three new columns.
merged_df['NER_Persons'] = persons_list
merged_df['NER_Organizations'] = organizations_list
merged_df['NER_Others'] = others_list

# Show the three entity columns for every row.
print(merged_df[["NER_Persons", "NER_Organizations", "NER_Others"]])


In [None]:
# Write the frame, now including the NER columns, to 'updated.csv' without the index column.
merged_df.to_csv('updated.csv', index=False)

# Load Roberta For Embedding Calculation

In [None]:
from tqdm import tqdm
import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd

# Define your RoBERTa model name
roberta_model_name = "roberta-base"

# Load the RoBERTa model and tokenizer
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
roberta_model = AutoModel.from_pretrained(roberta_model_name)

# Maximum sequence length in TOKENS handed to the model
max_seq_length = 512

# Register tqdm's pandas integration (kept from the original cell)
tqdm.pandas()

# One embedding per row, in row order
roberta_embeddings = []

for text in tqdm(merged_df['CC_wo_Stem_Lema_Punc']):
    # Truncation is done by the tokenizer, in tokens. The original sliced the
    # string to 512 *characters* first, which discarded most of a long article
    # before tokenization.
    inputs = roberta_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_seq_length,
    )
    with torch.no_grad():
        output = roberta_model(**inputs)
    # Mean over the token axis of the last hidden layer gives one vector per text.
    roberta_embedding = output.last_hidden_state.mean(dim=1).numpy()
    roberta_embeddings.append(roberta_embedding)

# Add a new 'Embedding_Vector' column to the DataFrame
merged_df['Embedding_Vector'] = roberta_embeddings

# Event Extraction Functions

##### Common Functions

In [None]:
def cosine(x, y):
    """Return the cosine similarity of two arrays after flattening both to 1-D.

    Computed as ``1 - scipy.spatial.distance.cosine``.
    """
    flat_x, flat_y = x.flatten(), y.flatten()
    cosine_distance = distance.cosine(flat_x, flat_y)
    return 1 - cosine_distance


def intersection(lst1, lst2):
    """Return the items of *lst1* that also occur in *lst2*.

    Order and duplicates follow *lst1*; *lst2* is turned into a set once so
    each membership test is a hash lookup.
    """
    lookup = set(lst2)
    return list(filter(lookup.__contains__, lst1))

def intersection_rate(words_list_, words_list1_):
    """Return the share of the shorter list's items that also occur in the longer one.

    When both lists have the same length, *words_list_* plays the role of the
    shorter list. Duplicates in the shorter list are each counted.

    Args:
        words_list_: First sized collection of hashable items.
        words_list1_: Second sized collection of hashable items.

    Returns:
        A float in [0, 1]; ``0.0`` when either input is empty.
    """
    if not words_list_ or not words_list1_:
        return 0.0

    # Neither input is mutated, so the original's defensive .copy() calls are
    # dropped; that also lets tuples and other sized collections be passed.
    shorter, longer = words_list_, words_list1_
    if len(longer) < len(shorter):
        shorter, longer = longer, shorter

    longer_items = set(longer)
    shared = sum(1 for word in shorter if word in longer_items)
    return shared / len(shorter)


def similarity_score(sim_cos, citation, cos_weight=0.8, citation_weight=0.2):
    """Blend a cosine similarity and a citation (overlap) rate into one score.

    Args:
        sim_cos: Cosine similarity of the two items.
        citation: Overlap rate of the two items.
        cos_weight: Weight applied to *sim_cos*; defaults to the original 0.8.
        citation_weight: Weight applied to *citation*; defaults to the original 0.2.

    Returns:
        ``sim_cos * cos_weight + citation * citation_weight``.
    """
    return sim_cos * cos_weight + citation * citation_weight

##### Majority Voting and Purity Calculation

In [None]:
from collections import Counter

def majority_voting(elements):
    """Return the single most frequent element, or None when there is no clear winner.

    None is returned for empty input and when two or more elements share the
    highest count.
    """
    if not elements:
        return None

    top_two = Counter(elements).most_common(2)
    # A runner-up with the same count means the top spot is shared.
    if len(top_two) == 2 and top_two[0][1] == top_two[1][1]:
        return None
    return top_two[0][0]

# Demo: 'pak' and 'biz' both occur three times here, so there is no single majority.
elements = ['pak', 'biz', 'pak', 'ent', 'biz', 'world', 'biz', 'pak']
majority = majority_voting(elements)

print(f"The majority element is: {majority}" if majority is not None else "No majority element found.")


In [None]:
from collections import Counter

def calculate_majority_purity(elements):
    """Return the fraction of ``elements`` taken up by the most frequent value.

    Ties do not matter here: only the highest count is used.

    Returns:
        A float in ``(0, 1]``, or ``None`` when ``elements`` is empty.
    """
    if not elements:
        return None

    majority_count = max(Counter(elements).values())
    return majority_count / len(elements)

# Quick check: 'pak' fills 6 of the 8 slots.
elements = ['pak', 'pak', 'pak', 'pak', 'biz', 'world', 'pak', 'pak']
purity = calculate_majority_purity(elements)

print(
    "No majority element found."
    if purity is None
    else f"The purity of the majority element is: {purity}"
)


In [None]:
'''
if pos = neg --> neu
if pos = neg = neu --> neu
if pos = neu --> pos
if neg = neu --> neg
'''
from collections import Counter

def find_majority(votes):
    """Return the winning sentiment label, with neutral-aware tie-breaking.

    Labels: 0 --> neg, 1 --> neu, 2 --> pos.

    When several labels share the highest count:
      * neutral tied with exactly one other label --> that other label
      * any other tie (pos = neg, or pos = neg = neu) --> neutral (1)

    The result does not depend on the order of ``votes``.

    Returns:
        The winning label, or ``None`` when ``votes`` is empty.
    """
    vote_count = Counter(votes)
    if not vote_count:
        return None

    top_count = max(vote_count.values())
    leaders = [label for label, count in vote_count.items() if count == top_count]
    if len(leaders) == 1:
        return leaders[0]

    # Neutral never wins a two-way tie against a polar label.
    decisive = [label for label in leaders if label != 1]
    if len(decisive) == 1:
        return decisive[0]

    # Several non-neutral leaders cancel each other out.
    return 1

find_majority([0,0,0,2,2,1,1,1]) # neg (0) and neu (1) tie on 3 votes each --> neg (0)

##### Category Priority Wise

In [None]:
# List the distinct values of the 'main_cat' column.
# NOTE(review): this reads `merged`, while the neighbouring cells use `merged_df` -
# confirm `merged` is defined earlier in the notebook.
merged['main_cat'].unique()

In [None]:
def select_priority_category(categories):
    """Return the category with the best (lowest) priority rank.

    Categories missing from the ranking table are ignored. On equal rank the
    first occurrence wins. Returns ``None`` when no known category is present.
    """
    rank = {
        'Business': 1,
        'International': 2,
        'National': 3,
        'Print': 4,
        'Editorial': 5,
        'Opinion': 6,
    }

    known = [category for category in categories if category in rank]
    if not known:
        return None
    # min() keeps the first of several equally ranked candidates.
    return min(known, key=rank.__getitem__)

# Event Extraction

In [None]:
# Treat the placeholder string "['NotAssigned']" as "no sub-category":
# swap it for an empty list and leave every other value untouched.
merged_df['sub-cat'] = merged_df['sub-cat'].apply(
    lambda value: value if value != "['NotAssigned']" else []
)

In [None]:
# Count the rows whose 'sub-cat' value equals one specific label.
# The label is named once so the comparison and the printed message cannot drift apart.
target_sub_cat = "['Oil & Gas']"

# The variable name is kept as-is in case other cells read it.
count_not_assigned = (merged_df['sub-cat'] == target_sub_cat).sum()

print(f"Number of rows with {target_sub_cat} in the 'sub-cat' column: {count_not_assigned}")

In [None]:
# Convert 'DateTime' column to datetime
merged_df['DateTime'] = pd.to_datetime(merged_df['DateTime'])

# --- Optional date filter (currently disabled) ---
# NOTE(review): the bounds below span the whole of 2022 (Jan 1 - Dec 31).
# `test_df` is only created by the commented-out lines in this cell, yet the
# "One Day Events" cell reads `test_df` - confirm it is defined elsewhere
# before running that cell.
#start_date = pd.Timestamp('2022-01-01')
#end_date = pd.Timestamp('2022-12-31')

# Keep only rows whose 'DateTime' lies between start_date and end_date (inclusive)
#test_df = merged_df[(merged_df['DateTime'] >= start_date) & (merged_df['DateTime'] <= end_date)]

# To make a copy of the DataFrame, use .copy()
#test_df = test_df.copy()

# Display the shape of 'test_df'
#print(test_df.shape)

In [None]:
# Show the smallest and the largest index label of merged_df, in that order.
for index_bound in (merged_df.index.min(), merged_df.index.max()):
    print(index_bound)


In [None]:
# Define a function to convert string representation of lists back to actual lists
def convert_str_to_list(string):
    """Parse a stringified Python literal (e.g. ``"['a', 'b']"``) back into an object.

    Args:
        string: The cell value. Anything that is not a ``str`` is returned unchanged.

    Returns:
        The value produced by ``ast.literal_eval`` when parsing succeeds;
        otherwise the input itself.
    """
    # Imported locally so the helper does not depend on the `import ast`
    # statement of a later notebook cell having been executed first.
    import ast

    if not isinstance(string, str):
        return string

    try:
        # literal_eval only accepts Python literals, never arbitrary code
        return ast.literal_eval(string)
    except (ValueError, SyntaxError, TypeError):
        # Not a parsable literal: return it as is
        return string

In [None]:
def string_to_array(string_value, dim=768):
    """Parse the printed form of an embedding back into a flat float array.

    Args:
        string_value: Text such as ``"[[0.1 0.2 ...]]"`` (whitespace-separated
            numbers, optionally wrapped in square brackets), or an array that
            has already been parsed.
        dim: Expected number of components (default 768).

    Returns:
        A NumPy array of shape ``(dim,)``.

    Raises:
        ValueError: If the number of parsed values does not equal ``dim``.
    """
    # Already parsed (e.g. the cell was run twice): only enforce the shape.
    if isinstance(string_value, np.ndarray):
        return string_value.reshape((dim,))

    # Drop every square bracket, then split on any whitespace (incl. newlines)
    cleaned = string_value.replace('[', ' ').replace(']', ' ')
    array_value = np.array(cleaned.split(), dtype=float)

    # Reshape the array to its original shape
    return array_value.reshape((dim,))

# Replace every 'Embedding_Vector' entry with the result of string_to_array
merged_df['Embedding_Vector'] = merged_df['Embedding_Vector'].apply(string_to_array)

In [None]:
# Spot-check the embeddings on a reproducible random sample of rows.
random.seed(42)

num_rows = len(merged_df)
num_samples = 10
random_indices = random.sample(range(num_rows), num_samples)

for index in random_indices:
    embedding_vector = merged_df.iloc[index]['Embedding_Vector']
    # Anything that is not an ndarray is reported with shape None.
    embedding_shape = (
        embedding_vector.shape if isinstance(embedding_vector, np.ndarray) else None
    )
    print(f"Shape of embedding vector at index {index}: {embedding_shape}")

In [None]:
import ast
# Run convert_str_to_list over every column that is expected to hold list values.
cols = [
    'Keywords_CC',
    'NER_Persons',
    'NER_Organizations',
    'NER_Others',
    'sub-cat',
]
for col in cols:
    merged_df[col] = merged_df[col].apply(convert_str_to_list)

In [None]:
# Print the Python type stored in every column, sampled from the entry
# `merged_df[column][2891]` (2891 is a hard-coded sample row).
for column in merged_df.columns:
    print(column, type(merged_df[column][2891]))

In [None]:
# Peek at two stored embeddings (entries 1361 and 1362 of 'Embedding_Vector'):
# show their Python types and the raw contents of the first one.
e1 = merged_df['Embedding_Vector'][1361]
e2 = merged_df['Embedding_Vector'][1362]
print(type(e1))
print(type(e2))
print(e1)

In [None]:
# Compare two articles: embeddings and headlines are fetched with the same
# positional lookup so the printed headlines belong to the compared vectors.
first_pos, second_pos = 1361, 1500
e1 = merged_df['Embedding_Vector'].iloc[first_pos]
e2 = merged_df['Embedding_Vector'].iloc[second_pos]
print(merged_df['Headline'].iloc[first_pos])
print(merged_df['Headline'].iloc[second_pos])
sim_cos = cosine(e1, e2)
print(sim_cos)

In [None]:
# Numeric codes for the sentiment labels
sentiment_mapping = {
    "negative": 0,
    "neutral": 1,
    "positive": 2,
}

# Translate each label; values without an entry in the mapping pass through unchanged
merged_df['Sentiment_Prediction'] = merged_df['Sentiment_Prediction'].apply(
    lambda label: sentiment_mapping.get(label, label)
)


In [None]:
def _article_tokens(text):
    """Split article text into tokens, detaching digit runs from adjacent characters."""
    text = re.sub(r'(\d+)(\D)', r'\1 \2', text)
    text = re.sub(r'(\d+)', r' \1', text)
    return text.replace('nbsp', '').split()


def event_extraction(df):
    """Group the articles of ``df`` into events and append them to the global ``events``.

    Rows are visited in order. Every row whose URL has not yet been absorbed
    by an earlier event seeds a new event and is compared with all rows that
    follow it. A later row joins the event when the combined similarity score
    is above 0.6, the word-overlap ("citation") rate is above 0.5, and its
    URL is not already part of another event.

    One row per event is appended to the module-level ``events`` DataFrame.
    The globals ``start_date`` and ``end_date`` are copied into that row.

    Args:
        df: Articles to cluster. The columns read are 'Url', 'Headline',
            'source', 'News', 'main_cat', 'sub-cat', 'Sentiment_Prediction',
            'Keywords_CC', 'NER_Persons', 'CC_wo_Stopwords_Punc' and
            'Embedding_Vector'.

    Returns:
        None. The result is the updated global ``events``.

    Note:
        Relies on ``intersection_rate``, ``cosine``, ``similarity_score``,
        ``majority_voting``, ``select_priority_category``,
        ``calculate_majority_purity``, ``find_majority`` and
        ``calculate_purity`` being defined in the notebook
        (``calculate_purity`` is not defined in this section).
    """
    global events

    # Add a print statement to see if the function is called
    print("Event extraction function called")

    # Fetch every row and tokenise every article once instead of once per pair.
    # Rows are addressed by position, so the function no longer depends on the
    # index having been reset by the caller.
    rows = [df.iloc[pos] for pos in range(len(df))]
    tokens = [_article_tokens(article['CC_wo_Stopwords_Punc']) for article in rows]

    added_urls = set()
    for pos, row in enumerate(rows):
        url = row['Url']

        # Skip url if already part of a previous event
        if url in added_urls:
            continue

        headlines = [row['Headline']]
        sources = [row['source']]
        urls = [url]
        desc = [row['News']]
        cats = [row['main_cat']]
        sentiments = [row['Sentiment_Prediction']]
        sub_cat = [row['sub-cat']]
        sim_score = [-1]  # placeholder, replaced once a first match is found

        # Copy the lists: `+=` below extends in place and would otherwise
        # modify the objects stored in the DataFrame.
        keywords = list(row['Keywords_CC'])
        ner = list(row['NER_Persons'])

        vector1 = row['Embedding_Vector']
        words = tokens[pos]

        # Compare with all the articles that come after the current one
        for other_pos in range(pos + 1, len(rows)):
            other = rows[other_pos]

            # Find Similarity/Citation
            citation = intersection_rate(words, tokens[other_pos])
            sim_cos = cosine(vector1, other['Embedding_Vector'])
            score = similarity_score(sim_cos, citation)

            if not (score > 0.6 and citation > 0.5):
                continue

            new_url = other['Url']
            # Check if the url has already been identified as part of another event
            if new_url in added_urls:
                continue

            urls.append(new_url)
            added_urls.add(new_url)  # avoid repetition in later events
            sources.append(other['source'])
            desc.append(other['News'])
            cats.append(other['main_cat'])
            headlines.append(other['Headline'])
            sentiments.append(other['Sentiment_Prediction'])
            sub_cat.append(other['sub-cat'])
            keywords += other['Keywords_CC']
            ner += other['NER_Persons']
            if -1 in sim_score:
                sim_score.remove(-1)
            sim_score.append(score)

        # Keep the first category / sub-category / sentiment seen for each URL
        first_seen = {}
        for member_url, category, sub, sentiment in zip(urls, cats, sub_cat, sentiments):
            if member_url not in first_seen:
                first_seen[member_url] = (category, sub, sentiment)

        urls = list(first_seen)
        cats = [entry[0] for entry in first_seen.values()]
        # Flatten the list of lists and then convert it to a set to remove duplicates
        sub_cat = list({name for entry in first_seen.values() for name in entry[1]})
        sentiments = [entry[2] for entry in first_seen.values()]

        sources = list(set(sources))
        desc = list(set(desc))
        keywords = list(set(keywords))
        ner = list(set(ner))

        maj_cat = majority_voting(cats)
        priority = select_priority_category(cats)
        purity = calculate_purity(priority, cats)
        maj_purity = calculate_majority_purity(cats)
        maj_senti = find_majority(sentiments)

        new_row = {
            'Event': row['Headline'],
            'Sources': sources,
            'Headlines': headlines,
            'Urls': urls,
            'Description': desc,
            'Articles': len(urls),
            'Categories': cats,
            'Sub_Category': sub_cat,
            'Major_Cat': maj_cat,
            'Priority_cat': priority,
            'Priority_Purity': purity,
            'Maj_Purity': maj_purity,
            'Sentiment_Prediction': maj_senti,
            'Keywords': keywords,
            'Similarity_Scores': sim_score,
            'NER_Persons': ner,
            'Start Date': start_date,
            'End Date': end_date,
        }
        events = pd.concat([events, pd.DataFrame([new_row])], ignore_index=True)


##### One Day Events

In [None]:
%%time
import numpy as np
import re
import torch


# Parse the "DateTime" column and keep only the calendar date (no time of day)
test_df['DateTime'] = pd.to_datetime(test_df['DateTime']).dt.date

# Date range covered by the data
oldest_date = test_df['DateTime'].min()
latest_date = test_df['DateTime'].max()
print("Oldest Date:", oldest_date)
print("Latest Date:", latest_date)

# Every distinct day, in ascending order
dates = np.sort(test_df['DateTime'].unique())
print(len(dates), dates[0:5])

# Start from an empty events frame; event_extraction appends one row per event
cols = ['Event', 'Sources', 'Urls', 'Articles', 'Start Date', 'End Date']
global events
events = pd.DataFrame(columns=cols)

# One extraction run per day: the window starts and ends on the same date
for date in dates:
    global start_date
    global end_date
    start_date = end_date = date

    df = test_df[test_df['DateTime'] == date].reset_index(drop=True)

    print(f"START_DATE: {start_date}, END_DATE: {end_date}")
    event_extraction(df)

events.to_csv(r"D:\Sana\Extra Material\July_Dec_Events.csv", encoding='utf-8', index=False)
print(f"Columns: {events.columns}\nShape: {events.shape}")
events.head(3)

In [None]:
# Lists are not hashable; store each event's URLs as a tuple so that
# drop_duplicates can compare them.
events['Urls'] = events['Urls'].apply(tuple)

print(f"Shape Before Duplicate Removal: {events.shape}")
events.drop_duplicates(subset='Urls', inplace=True)
# reset_index returns a new frame unless inplace=True is given
events.reset_index(drop=True, inplace=True)
print(".......................................")
print(f"Shape After Duplicate Removal: {events.shape}")

# Build each event's URL set once instead of once per compared pair
url_sets = {label: set(urls) for label, urls in events['Urls'].items()}

# indsub1: events fully contained in a larger event; indsub2: the containing events
indsub1, indsub2 = [], []
articles = np.sort(events['Articles'].unique())
for art in articles:
    same_size = events.loc[events['Articles'] == art].index
    larger = events.loc[events['Articles'] > art].index
    for index in same_size:
        for ind in larger:
            if url_sets[index].issubset(url_sets[ind]):
                indsub1.append(index)
                indsub2.append(ind)

print(f"indeces dropped: {len(set(indsub1))}")

# Drop rows by indices
events.drop(indsub1, inplace=True)

# Reset index after dropping rows
events.reset_index(drop=True, inplace=True)

# Print the resulting DataFrame shape
print(events.shape)

In [None]:
# Keep the rows whose 'Sub_Category' mentions 'Economic'. Lists and arrays are
# tested by element, strings by substring; any other value counts as no match.
# NOTE(review): this reads `merge_df`, which is not defined in the visible cells -
# confirm which frame holds the 'Sub_Category' column.
economic_rows = merge_df[merge_df['Sub_Category'].apply(
    lambda x: 'Economic' in x if isinstance(x, (list, np.ndarray, str)) else False
)]
print(economic_rows.shape)

##### Five Day Events

In [None]:
import pandas as pd
import numpy as np

merged_df['DateTime'] = pd.to_datetime(merged_df['DateTime'])

# Find the oldest and latest timestamps
oldest_date5 = merged_df['DateTime'].min()
latest_date5 = merged_df['DateTime'].max()

print("Oldest Date:", oldest_date5)
print("Latest Date:", latest_date5)

# Windows are made of calendar days. Both the day grid and the article
# timestamps are normalised to midnight, so an article is matched by its date
# whatever its time of day (isin on raw timestamps only matches exact times).
article_days = merged_df['DateTime'].dt.normalize()
dates5 = pd.date_range(start=oldest_date5, end=latest_date5, freq='D', normalize=True)
print(len(dates5), dates5[0:5])

# List of column names
cols = ['Event', 'Sources', 'Urls', 'Articles', 'Start Date', 'End Date']

# Create an empty dataframe to store all the events
global events
events = pd.DataFrame(columns=cols)

for i in range(4, len(dates5)):
    window_dates = dates5[i-4:i+1]  # Get the current 5-day window

    # First and last day of the window (dates5 is ascending)
    global start_date
    global end_date
    start_date = window_dates[0]
    end_date = window_dates[-1]

    # Articles published on any day of the window
    df = merged_df[article_days.isin(window_dates)].reset_index(drop=True)

    print(f"START_DATE: {start_date}, END_DATE: {end_date}")
    event_extraction(df)

events_all5 = events
print(f"Columns: {events_all5.columns}\nShape: {events_all5.shape}")
events_all5.head(3)


In [None]:
# Lists are not hashable; store each event's URLs as a tuple so that
# drop_duplicates can compare them.
events_all5['Urls'] = events_all5['Urls'].apply(tuple)

print(f"Shape Before Duplicate Removal: {events_all5.shape}")
events_all5.drop_duplicates(subset='Urls', inplace=True)
# reset_index returns a new frame unless inplace=True is given
events_all5.reset_index(drop=True, inplace=True)
print(".......................................")
print(f"Shape After Duplicate Removal: {events_all5.shape}")

# Build each event's URL set once instead of once per compared pair
url_sets = {label: set(urls) for label, urls in events_all5['Urls'].items()}

# indsub1: events fully contained in a larger event; indsub2: the containing events
indsub1, indsub2 = [], []
articles = np.sort(events_all5['Articles'].unique())
for art in articles:
    same_size = events_all5.loc[events_all5['Articles'] == art].index
    larger = events_all5.loc[events_all5['Articles'] > art].index
    for index in same_size:
        for ind in larger:
            if url_sets[index].issubset(url_sets[ind]):
                indsub1.append(index)
                indsub2.append(ind)

print(f"indeces dropped: {len(set(indsub1))}")

# Drop rows by indices
events_all5.drop(indsub1, inplace=True)

# Reset index after dropping rows
events_all5.reset_index(drop=True, inplace=True)

events_all5.to_csv(r"D:\Sana\Extra Material\5Days_Events.csv", encoding='utf-8', index=False)

# Print the resulting DataFrame shape
print(events_all5.shape)