# Group Number 2 - Members

* Ali Annan 202475973
* Kinan Morad 202471895
* Sasha Nasser 202473486
* Romanos Rizk 202471561
* Rita Salloum 202371596

# Importing The Libraries and Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import ast
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [2]:
data = pd.read_csv("C:/Users/Usr/Desktop/newestnewnewbalance_dataset.csv")

In [3]:
data.shape

(50000, 20)

## Subsetting the Data

In [4]:
# Load your dataset into a DataFrame named 'data' (assuming you have already loaded it)

# Define features to balance
features_to_balance = ['section_name', 'type_of_material', 'news_desk']

# Calculate value counts for each feature
value_counts = {}
for feature in features_to_balance:
    value_counts[feature] = data[feature].value_counts()

# Determine minimum target counts for each feature
min_target_counts = {feature: value_counts[feature].min() for feature in features_to_balance}

# Set desired sample size
desired_sample_size = 20000

# Initialize balanced dataset
balanced_dataset = pd.DataFrame()

# Iterate through features to balance
for feature in tqdm(features_to_balance):
    unique_values = value_counts[feature].index
    # Iterate through unique values of the feature
    for unique_value in unique_values:
        if len(balanced_dataset) >= desired_sample_size:
            break  # Stop when the desired sample size is reached
        samples_to_select = min_target_counts[feature]
        selected_indices = data[data[feature] == unique_value].sample(min(samples_to_select, desired_sample_size - len(balanced_dataset)), random_state=42).index
        balanced_dataset = pd.concat([balanced_dataset, data.loc[selected_indices]])

# Check if desired sample size is not reached and oversample
remaining_samples = desired_sample_size - len(balanced_dataset)
if remaining_samples > 0:
    oversample_indices = data.sample(remaining_samples).index
    balanced_dataset = pd.concat([balanced_dataset, data.loc[oversample_indices]])

100%|██████████| 3/3 [00:02<00:00,  1.06it/s]


In [6]:
data = balanced_dataset
del balanced_dataset

In [7]:
data.shape

(20000, 20)

# Preprocessing and Feature Engineering

## Removing Non Relevant COlumns and Rows

In [8]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Assuming you have a DataFrame 'df' with columns 'word_count' (numerical) and 'section_name' (categorical)
# You may need to preprocess your data and encode categorical variables before performing correlation analysis

# Compute Cramér's V for 'word_count' and 'section_name'
def cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

# Create a cross-tabulation of 'word_count' and 'section_name'
confusion_matrix = pd.crosstab(data['word_count'], data['section_name'])

# Compute Cramér's V
correlation = cramers_v(confusion_matrix)

print("Correlation between 'word_count' and 'section_name' (Cramér's V):", correlation)

Correlation between 'word_count' and 'section_name' (Cramér's V): 0.0


In [9]:
# Assuming 'data' is your DataFrame
columns_to_drop = ['web_url', 'print_section', 'print_page', 'source', 'multimedia', 
                   'document_type', 'news_desk', 'byline', 'type_of_material', '_id', 'uri', 'word_count']
data = data.drop(columns=columns_to_drop)


In [10]:
# Defining the allowed sections
allowed_sections = ['Arts', 'Automobiles', 'Blogs', 'Books', 'Business Day', 'College', 'Climate', 
                    'Education', 'Fashion & Style', 'Food', 'Health', 'Home & Garden', 'Job Market', 
                    'Movies', 'Parenting', 'Podcasts', 'Real Estate', 'Science', 'Sports', 'Technology', 
                    'Theater', 'Travel', 'U.S.', 'World']

# Filtering the rows based on allowed sections
data = data[data['section_name'].isin(allowed_sections)]

## Dropping Null Values

In [15]:
data.drop(columns='headline', inplace= True)

## Extracting the Headline of each Article

In [13]:
# Print the first few raw entries of the 'headline' column
print(data['headline'].head())


24656    {'main': 'Sprint and SK Telecom Said to Discus...
42082    {'main': '‘It’s a Weird Feeling’: Seattle Hunk...
45004    {'main': 'With Bombing, Iraqis Escalate Guerri...
14331    {'main': 'U.S. Team Loses More Players for Qua...
14535    {'main': 'Displaying the Discreet Charm and Ca...
Name: headline, dtype: object


In [14]:
# Convert string representations of dictionaries to actual dictionaries
data['headline'] = data['headline'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Now, extract the 'main' key from each dictionary in the 'headline' column
data['main_headline'] = data['headline'].apply(lambda x: x['main'] if 'main' in x else None)

# Check the first few entries of the main_headline to confirm success
print(data['main_headline'].head())


24656    Sprint and SK Telecom Said to Discuss Partnership
42082    ‘It’s a Weird Feeling’: Seattle Hunkers Down A...
45004    With Bombing, Iraqis Escalate Guerrilla Tactic...
14331          U.S. Team Loses More Players for Qualifiers
14535    Displaying the Discreet Charm and Casual Grace...
Name: main_headline, dtype: object


In [16]:
# Combine the columns into one
data['romanos'] = data['main_headline'].astype(str) + " " + data['snippet'].astype(str) + "  " + data['abstract'].astype(str) + " " + data['lead_paragraph'].astype(str)

# Check the first few entries to ensure it's combined correctly
print(data['romanos'].head())

24656    Sprint and SK Telecom Said to Discuss Partners...
42082    ‘It’s a Weird Feeling’: Seattle Hunkers Down A...
45004    With Bombing, Iraqis Escalate Guerrilla Tactic...
14331    U.S. Team Loses More Players for Qualifiers Th...
14535    Displaying the Discreet Charm and Casual Grace...
Name: romanos, dtype: object


## Dropping Non Relevant Columns

In [17]:
# Drop the specified columns
data.drop(columns=['snippet', 'lead_paragraph', 'main_headline', 'abstract'], inplace=True)

# Check the DataFrame to confirm that the columns have been removed
print(data.head())


                                                keywords  \
24656                                                 []   
42082  [{'name': 'subject', 'value': 'Coronavirus (20...   
45004  [{'name': 'glocations', 'value': 'Iraq', 'rank...   
14331  [{'name': 'persons', 'value': 'Castillo, Edgar...   
14535  [{'name': 'persons', 'value': 'Louis, Murray',...   

                        pub_date  section_name subsection_name  \
24656  2008-07-16 11:43:29+00:00  Business Day             NaN   
42082  2020-03-06 02:33:26+00:00          U.S.             NaN   
45004  2003-03-30 05:00:00+00:00         World             NaN   
14331  2012-10-10 23:57:43+00:00        Sports          Soccer   
14535  2007-09-06 04:00:00+00:00          Arts           Dance   

                                                 romanos  
24656  Sprint and SK Telecom Said to Discuss Partners...  
42082  ‘It’s a Weird Feeling’: Seattle Hunkers Down A...  
45004  With Bombing, Iraqis Escalate Guerrilla Tactic...  
14331 

## Extracting the Keywords of each Article

In [18]:
# Define a function to safely convert string representations to actual lists
def convert_to_list(keyword_string):
    try:
        return ast.literal_eval(keyword_string) if isinstance(keyword_string, str) else keyword_string
    except (ValueError, SyntaxError):
        return None  # Return None if the string cannot be converted

# Apply this conversion function to the entire 'keywords' column
data['keywords'] = data['keywords'].apply(convert_to_list)



In [19]:
def format_keywords(keywords):
    # Ensure that keywords is a list and not None
    if isinstance(keywords, list) and keywords:
        # Extract 'name' and 'value' and format into a string, excluding 'rank' and 'major'
        keyword_strings = [f"{keyword['name']}: {keyword['value']}" for keyword in keywords if 'name' in keyword and 'value' in keyword]
        # Join all strings into a single sentence
        return ', '.join(keyword_strings)
    return None

# Apply the formatting function to the 'keywords' column again
data['keyword_sentences'] = data['keywords'].apply(format_keywords)

# Check the results
print(data['keyword_sentences'].head())


24656                                                 None
42082    subject: Coronavirus (2019-nCoV), subject: Tel...
45004    glocations: Iraq, glocations: Najaf (Iraq), gl...
14331    persons: Castillo, Edgar, persons: Donovan, La...
14535    persons: Louis, Murray, organizations: Common ...
Name: keyword_sentences, dtype: object


## Dropping Non Relevant Columns

In [20]:
data.drop(columns= 'keywords', inplace=True)

In [21]:
data.drop(columns = 'pub_date', inplace = True)

## Combining all the Text into a 'combined_text' Column

In [23]:
# Combine the text from 'romanos' and 'keyword_sentences' into a new column 'combined_text'
data['combined_text'] = data['romanos'] + " " + data['keyword_sentences'] + " " + data['subsection_name']

# Drop the original columns if needed
data.drop(['romanos', 'keyword_sentences', 'subsection_name'], axis=1, inplace=True)

# Print the DataFrame to verify the changes
print(data.head())


          section_name                                      combined_text
14331           Sports  U.S. Team Loses More Players for Qualifiers Th...
14535             Arts  Displaying the Discreet Charm and Casual Grace...
1694   Fashion & Style  Mitch McEwen, Dina Paulson The couple are to b...
23472             Food  Cocktail School Lets Novices Be the Bartender ...
15576         Podcasts  Why Are All Eyes on the Virginia Governor’s Ra...


## Converting Article Text to Lowercase

In [24]:
# Define a function for preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    return text

# Apply preprocessing to each text column
data['combined_text'] = data['combined_text'].apply(preprocess_text)

# Print the first few rows to verify the result
print(data.head())

          section_name                                      combined_text
14331           Sports  u.s. team loses more players for qualifiers th...
14535             Arts  displaying the discreet charm and casual grace...
1694   Fashion & Style  mitch mcewen, dina paulson the couple are to b...
23472             Food  cocktail school lets novices be the bartender ...
15576         Podcasts  why are all eyes on the virginia governor’s ra...


In [25]:
data= data.reset_index(drop=True)

# LLM Model - BERT

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm  # Import tqdm for progress bar

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Define the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text and convert to numerical format
def tokenize_and_encode(text, max_length):
    encoded_text = tokenizer.encode_plus(
        text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'  # Return PyTorch tensors
    )
    return encoded_text['input_ids'], encoded_text['attention_mask']

# Define BERT model
num_labels = len(data['section_name'].unique())
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False,
)

# Convert labels to numerical format
label2id = {label: i for i, label in enumerate(data['section_name'].unique())}
train_data['label'] = train_data['section_name'].map(label2id)
test_data['label'] = test_data['section_name'].map(label2id)

# Tokenize and encode train and test data
max_length = 128  # Maximum sequence length
train_input_ids, train_attention_mask = zip(*train_data['combined_text'].apply(lambda x: tokenize_and_encode(x, max_length)))
test_input_ids, test_attention_mask = zip(*test_data['combined_text'].apply(lambda x: tokenize_and_encode(x, max_length)))

# Convert to PyTorch tensors
train_input_ids = torch.cat(train_input_ids, dim=0)
train_attention_mask = torch.cat(train_attention_mask, dim=0)
train_labels = torch.tensor(train_data['label'].values)

test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_mask = torch.cat(test_attention_mask, dim=0)
test_labels = torch.tensor(test_data['label'].values)

# Create DataLoader for train and test sets
batch_size = 32
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Train BERT model
epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

for epoch in range(epochs):
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [t.to(device) for t in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix({'Loss': loss.item()})

# Evaluate BERT model
model.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    progress_bar = tqdm(test_dataloader, desc='Evaluation', leave=False)
    for batch in progress_bar:
        input_ids, attention_mask, labels = [t.to(device) for t in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                         

Accuracy: 0.8863361547762999


