In [None]:
import json

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pandas as pd
import re
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder


load training dataset from json file

In [None]:
# Define the path to the JSON file
json_file_path = '/content/drive/MyDrive/nytimes_train.json'

# Read the JSON file
with open(json_file_path, 'r') as file:
    data = json.load(file)
# Print a summary of the first few article samples
num_samples_to_print = 5
for i, article in enumerate(data[:num_samples_to_print]):
    print(f"Article {i+1}: {article}")



Article 1: {'section': 'Theater', 'headline': "Before 'Moonlight' and 'The Walking Dead,' a Friendship Born in the Classroom", 'article_url': 'https://www.nytimes.com/2017/02/21/theater/danai-gurira-andre-holland-walking-dead.html', 'article': 'Danai Gurira and Andre Holland in a theater at New York University, where they met in the Tisch Graduate Acting Program.\n\nBehind every successful person are relationships that helped forge a path. But the stories of these friendships, collaborations, alliances, romances or rivalries often are lost in the glow of achievement. In this new feature, we explore a personal connection that made a difference in the lives of two artists. Andre Holland never thought much about writing his own monologues when he attended the Tisch graduate acting program at New York University. But one day, early in his first semester in 2003, he watched another African-American student, Danai Gurira, forgo the usual speeches by white characters and perform her own mater

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Data Processing:**

In this step, the good practice is to capture all contextual information,from the articles, but to save some memory space for the further processings, i have chosen only the abstracts from all articles in training set

In [None]:
# Extract Abstracts
abstracts = [article["abstract"] for article in data]

# Print extracted headlines
for i, headline in enumerate(abstracts):
    print(f"Article {i+1} Abstract:", headline)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Article 41006 Abstract: As the country emerges from a crippling debt crisis, the island of Crete remains popular with second-home buyers.
Article 41007 Abstract: With the 2018 prize postponed by scandal, The Times's staff book critics discuss the award's history and influence — and whom they would give it to this year if they could.
Article 41008 Abstract: Huge double sunroofs, offering an expansive view of the sky, are growing more popular and are now available from more manufacturers.
Article 41009 Abstract: The chef Iliana Regan created a hit Chicago restaurant and wrote a tough, award-winning memoir. But her real dream lives in a cabin in northern Michigan.
Article 41010 Abstract: Newer versions of Google's browser include a setting to easily mute web pages that would otherwise open with audio blaring.
Article 41011 Abstract: Is it impeachment … gate? How will we remember this moment, without a "gate"?
Article 41012 A

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



**Keyword Extraction:**

In practice, collection of  words related to Gaza war (dictionary) is done carefully and it is validated by an expert (journalist for example) someone who is updated by all information of Gaza war. due to time constrain, I have chosen the most used words that i heard about, Additionally, sometime there are predefined dictionaries are available as open source,instead of starting from scratch,using those dectionaries and modify over them would be beneficil and offer too much time.

In [None]:
# collect all related words that has relation with Gaza War
keywords = ["Palestine", "Gaza","Jerusalem", "Occupied Territories", "Hamas", "Israeli-Palestinian conflict", "West Gaza", "north Gaza""abuse","damage","Palestinian Authority", "Intifada", "Settlements","war","death","Al quds","conflict","occupation","Israel","palestanian","strip","Gaza strip","Fatah","IDF","Iron Dome system","Day of Jihad","West Bank"]


In [None]:

# Function to check for presence of keywords
def check_keywords(article):
    article_tfidf = tfidf_vectorizer.transform([article])
    article_keywords = [keyword for keyword in keywords if keyword.lower() in article.lower()]
    return article_keywords

# Check keywords in each article
for i, headline in enumerate(abstracts):
    article_keywords = check_keywords(headline)
    if article_keywords:
        print(f"Article {i+1} contains the following Palestine-related keywords:", article_keywords)
    else:
        print(f"Article {i+1} does not contain any Palestine-related keywords.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Article 43989 does not contain any Palestine-related keywords.
Article 43990 does not contain any Palestine-related keywords.
Article 43991 does not contain any Palestine-related keywords.
Article 43992 does not contain any Palestine-related keywords.
Article 43993 does not contain any Palestine-related keywords.
Article 43994 does not contain any Palestine-related keywords.
Article 43995 does not contain any Palestine-related keywords.
Article 43996 does not contain any Palestine-related keywords.
Article 43997 does not contain any Palestine-related keywords.
Article 43998 does not contain any Palestine-related keywords.
Article 43999 does not contain any Palestine-related keywords.
Article 44000 does not contain any Palestine-related keywords.
Article 44001 does not contain any Palestine-related keywords.
Article 44002 contains the following Palestine-related keywords: ['war']
Article 44003 does not contain any Palestin

in this step which is word extraction, there are many approaches that can be applied to get more representative data, manual annotation is one of them ,TF-IDF vectorization,Bag of words,Rule-Based Approaches,or combine all appoaches.

---------------------------------------------------------------------------------------------------------

In my approach, I am planning to work with a supervised machine learning model which is Naive Bayes, so i needed a label to help in training stage.using above keywords, iteration is done over all training examples and labels were constructed.



In [None]:

# Function to check if an article is related to Palestine and assign label
def assign_label(article):
    for keyword in keywords:
        if keyword.lower() in article.lower():
            return "FreePalestine"
    return "Other"

# list to store articles with their corresponding labels
labeled_articles = []

# Assign labels to articles and store them in list
for article in data:
    label = assign_label(article['abstract'])
    labeled_articles.append(label)

labeled_articles
len(labeled_articles)

48988

for the features, which are the **abstracts** in the articles of training examples, we need some preprocessing techniques to convert them into numerical data to help the model understand those features . Below function is used to:



1.   Tokenizeand lowercaseing: convert all capital letters to small
2.   remove stop words: redundant words that will get higher probabilities because of their frequencies and they are not important in classification task
3.   Join  tokens
4.   Text Cleaning from punctuation, special characters, and numbers









In [None]:
# Tokenization, Text Cleaning, Stopword Removal
cleaned_articles = []
for article in abstracts:
    # Tokenization
    tokens = re.findall(r'\b\w+\b', article.lower())  # Split into lowercase words

    # Text Cleaning from punctuation, special characters, and numbers
    clean_tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]

    # Stopword Removaing
    stopwords = ['the', 'is', 'and', 'a', 'an', 'of', 'in', 'to', 'that', 'have', 'been']
    filtered_tokens = [token for token in clean_tokens if token not in stopwords]

    # Join the tokens  into one string
    cleaned_article = ' '.join(filtered_tokens)
    cleaned_articles.append(cleaned_article)

In [None]:

len(cleaned_articles)

48988

***Vectorization(Bag of Words)***

 used to convert text data into numerical vectors, which can be used as input for machine learning models.

In [None]:

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cleaned_articles)


In [None]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder on y_dev
label_encoder.fit(y)
y = label_encoder.transform(y)

**Model Training**

In [None]:

nb_classifier = MultinomialNB()
nb_classifier.fit(X, y)


In [None]:
#preprocessing function to reuse it
def preprocess_text(articles, stopwords=None):

    cleaned_articles = []

    # Default stopwords list if not provided
    if stopwords is None:
        stopwords = ['the', 'is', 'and', 'a', 'an', 'of', 'in', 'to', 'that', 'have', 'been']

    # Iterate through each article
    for article in articles:
        # Tokenization
        tokens = re.findall(r'\b\w+\b', article.lower())  # Split into lowercase words

        # Text Cleaning from punctuation, special characters, and numbers
        clean_tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]

        # Stopword Removal
        filtered_tokens = [token for token in clean_tokens if token not in stopwords]

        # Join the tokens into one string
        cleaned_article = ' '.join(filtered_tokens)
        cleaned_articles.append(cleaned_article)

    # Initialize CountVectorizer
    vectorizer = CountVectorizer()

    # Fit and transform the cleaned articles
    X = vectorizer.fit_transform(cleaned_articles)

    return X



**cross-validation**

This step is crucial in model improvement, enable to moniter the performance during training stage to prevent overfitting later and support in generalization

load validation dataset

In [None]:
# Define the path to the JSON file
json_file_path = '/content/drive/MyDrive/nytimes_dev.json'

# Read the JSON file
with open(json_file_path, 'r') as file:
    data_dev = json.load(file)
# Print a summary of the first few article samples
num_samples_to_print = 5
for i, article in enumerate(data_dev[:num_samples_to_print]):
    print(f"Article {i+1}: {article}")



Article 1: {'section': 'Well', 'headline': 'After a Cancer Diagnosis, Playing the Odds', 'article_url': 'https://www.nytimes.com/2018/01/31/well/live/after-a-cancer-diagnosis-playing-the-odds.html', 'article': 'My grandfather was a statistician, and from a young age he taught me to always consider the odds. A pragmatic man, he couldn\'t help teaching me when to hedge my bets, especially when it came to playing Uno. Unfortunately, he never could have prepared me for the odds I would face in the years to come. Shortly after celebrating my 25th birthday, I was given a diagnosis of Hodgkin\'s lymphoma. After some research and a healthy dose of naivete, I felt I could kick it pretty swiftly. After all, everyone reassured me that I had the "good kind" of cancer, with an over 90 percent survival rate. Those were odds my grandfather taught me I could get behind. But two months into treatment, the odds changed; my cancer didn\'t respond to standard chemotherapy and had begun growing out of cont

In [None]:
# Extract Abstracts from validation
abstracts_dev = [article["abstract"] for article in data_dev]

# Print extracted headlines
for i, headline in enumerate(abstracts_dev):
    print(f"Article {i+1} Abstract:", headline)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Article 1124 Abstract: Previews, openings and some last-chance picks.
Article 1125 Abstract: The bacteria in a bird's microbiome seems to play an important role in the odoriferous messages it sends to other birds.
Article 1126 Abstract: The suit accuses Apple Pay of using technology that was pitched to Apple and Visa officials, but the companies never signed a license agreement.
Article 1127 Abstract: A documentary shines a spotlight on the grim relationship between race and cancer.
Article 1128 Abstract: Every month, Netflix Australia adds a new batch of movies and TV shows to its library. Here are the titles we think are most interesting for May.
Article 1129 Abstract: The sister of Britain's future queen makes a match that the often-critical British press seems to approve of.
Article 1130 Abstract: The bar gives a modern spin to New York's piano karaoke scene and, for the co-owner and pianist Joe McGinty, fulfills a lo

In [None]:
# list to store articles with their corresponding labels
labeled_articles_dev = []

# Assign labels to articles and store them in list
for article in data_dev:
    label = assign_label(article['abstract'])
    labeled_articles_dev.append(label)

labeled_articles_dev
len(labeled_articles_dev)
labeled_articles_dev

['Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Ot

In [None]:
X_dev=preprocess_text(abstracts_dev, stopwords=None)
y_dev=labeled_articles_dev
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder on y_dev
label_encoder.fit(y_dev)
y_dev = label_encoder.transform(y_dev)

In [None]:
print("Shape of X_dev:", X_dev.shape)
print("Shape of y_dev:", y_dev.shape)


Shape of X_dev: (6123, 17771)
Shape of y_dev: (6123,)


In [None]:
# Perform cross-validation with F1-score as evaluation metric
cv_scores = cross_val_score(nb_classifier, X_dev, y_dev.ravel(), cv=5, scoring='f1')

# Print the cross-validation scores
print("Cross-Validation F1-Scores:", cv_scores)
print("Mean Cross-Validation F1-Score:", cv_scores.mean())



Cross-Validation F1-Scores: [0.96126011 0.96327925 0.96236099 0.96153846 0.95972579]
Mean Cross-Validation F1-Score: 0.96163292113224


the model consodered the true positive is the "other" category that why it give this high f1 score

----------------------------------------------------------------------------------------------

**Prediction**


load test data

In [None]:
# Define the path to the JSON file
json_file_path = '/content/drive/MyDrive/nytimes_test.json'

# Read the JSON file
with open(json_file_path, 'r') as file:
    test_data = json.load(file)
# Print a summary of the first few article samples
num_samples_to_print = 5
for i, article in enumerate(test_data[:num_samples_to_print]):
    print(f"Article {i+1}: {article}")



Article 1: {'section': 'Travel', 'headline': 'Afropunk, Huichica and More: 8 Music Festivals to Hit This Year', 'article_url': 'https://www.nytimes.com/2019/03/15/travel/afropunk-huichica-and-more-8-music-festivals-to-hit-this-year.html', 'article': 'An act at the FORM festival, held at Arcosanti in Arizona.\n\n\n\n\n\nAfropunk, Huichica and More: 8 Music Festivals to Hit This Year\n\nThe Woodstock Music & Arts Festival turns 50 this year and helped define music festivals for the counterculture generation. Its resulting legacy has been mixed -- from slickly produced multiday affairs to unmitigated disasters, like the Fyre Festival -- but the wildfire-spread of festivals since has led to an increasing number of exciting, smaller-scale events organized each year. These eight U.S.-based music festivals are notable for their stunning settings, extracurricular programming and highly curated, alternative lineups. Marfa Myths is a collaboration between Ballroom Marfa, a nonprofit cultural art

In [None]:
y_test=[]
# Assign labels to articles and store them in list
for article in test_data:
    label = assign_label(article['abstract'])
    y_test.append(label)

y_test

['Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'FreePalestine',
 'Other',
 'FreePalestine',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Ot

In [None]:
# Extract Abstracts from tests
abstracts_test = [article["abstract"] for article in test_data]

# Print extracted headlines
for i, headline in enumerate(abstracts_test):
    print(f"Article {i+1} Abstract:", headline)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Article 1125 Abstract: The eight teams that did not qualify to compete in the Walt Disney World bubble can create similar campuses and host voluntary workouts.
Article 1126 Abstract: A new system meant no line judges on all but two courts, including the one where Novak Djokovic was playing when he hit a judge with a ball.
Article 1127 Abstract: The National Novel Writing Month event challenges people to crank out 50,000 words in 30 days. Here are the digital tools to help you make a go of it.
Article 1128 Abstract: "I wouldn't be shocked if the guys at Home Depot showed the smugglers how to do it," Noah said of the holes. "Those guys will help you with any project."
Article 1129 Abstract: What we can learn from Trump's deficitpalooza.
Article 1130 Abstract: "What's Left of Me Is Yours," a debut novel by Stephanie Scott, is inspired by the events surrounding an unlikely murder that occurred in Japan.
Article 1131 Abstract:

In [None]:
X_test=preprocess_text(abstracts_test, stopwords=None)
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder on y_dev
label_encoder.fit(y_test)
y_test = label_encoder.transform(y_test)

In [None]:
print("Shape of X_dev:", X_test.shape)
print("Shape of y_dev:", y_test.shape)


Shape of X_dev: (6124, 17530)
Shape of y_dev: (6124,)


In [None]:
X_test

<6124x17530 sparse matrix of type '<class 'numpy.int64'>'
	with 98798 stored elements in Compressed Sparse Row format>

In [None]:
y_test

array([1, 1, 1, ..., 1, 1, 1])

**Model Evaluaion**