<a href="https://colab.research.google.com/github/SanieRojas/subject_screener/blob/v1/Module_sub_process_build.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# All imports required


In [22]:
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [23]:
nltk.download(['stopwords', 'vader_lexicon', 'punkt'])
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
file_name = "news_data_Israel_1697595469.csv"
df = pd.read_csv(file_name)

In [25]:
#Pandas dataframe
df.dtypes

title                          object
desc                          float64
date                           object
datetime                       object
link                           object
img                            object
media                          object
site                          float64
2023-10-17 23:17:49.446507     object
dtype: object

In [26]:
#with amount of records
df.shape

(90, 9)

In [31]:
headlines = df.drop(columns=['desc', 'date','site','link','img','media','2023-10-17 23:17:49.446507'], axis=1)
print(headlines.head(2))
print(headlines.dtypes)

                                               title  \
0  Israel-Hamas war live updates: Israel denies t...   
1  Israel-Hamas war live: deadly Gaza hospital bl...   

                     datetime  
0  2023-10-17 22:17:51.398034  
1  2023-10-17 22:17:51.397035  
title       object
datetime    object
dtype: object


In [None]:
# Specify the file path where you want to save the text data
output_file = 'news.txt'

# Extract the specified column as a Pandas Series & Save the column data as text in a .txt file
column_data = headlines["title"]
column_data.to_csv(output_file, header=False, index=False, sep='\t')

print(f'Column "{column_name}" has been saved as text in {output_file}')

In [32]:
#define function for cleanse of the text
def clean_text(text):
    # Use regular expressions to keep only letters and whitespace
    my_stopwords = nltk.corpus.stopwords.words("english")
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    words = nltk.word_tokenize(cleaned_text)
    words_nstw = [word for word in words if word not in my_stopwords]
    #ideal to remove stopwords
    return words_nstw

In [36]:
#cleanse and tokenize words
headlines['tokens'] = headlines['title'].apply(clean_text)
headlines.head(1)

Unnamed: 0,title,datetime,tokens
0,Israel-Hamas war live updates: Israel denies t...,2023-10-17 22:17:51.398034,"[israelhamas, war, live, updates, israel, deni..."


In [44]:
headlines["tokens"][0][2]

'live'

In [49]:
micro_test = headlines["tokens"][89]

In [46]:
analyzer = SentimentIntensityAnalyzer()

In [50]:
sentiment_score = analyzer.polarity_scores(' '.join(micro_test))['compound']

In [51]:
sentiment_score

-0.6908

In [55]:
scores = []

for i in range(len(headlines)):
  tokens = headlines["tokens"][i]
  sentiment_score = analyzer.polarity_scores(' '.join(tokens))['compound']
  scores.append(sentiment_score)
  i + 1

In [57]:
len(scores)

90

In [64]:
headlines["score"] = scores
print(headlines)

                                                title  \
0   Israel-Hamas war live updates: Israel denies t...   
1   Israel-Hamas war live: deadly Gaza hospital bl...   
2   UN Security Council to vote Wednesday on Israe...   
3   DOD Increases Deterrence Posture in Middle Eas...   
4   Biden considering $100 bln funding request tha...   
..                                                ...   
85  President Joe Biden will visit Israel in high-...   
86  Gaza tunnels give Hamas an advantage in fight ...   
87  Teenage sisters among 10 Britons still missing...   
88  'The people of Israel live': Hundreds rally in...   
89  Even if Israel Crushes Hamas, New Threats Will...   

                      datetime  \
0   2023-10-17 22:17:51.398034   
1   2023-10-17 22:17:51.397035   
2   2023-10-17 20:17:51.403037   
3   2023-10-17 20:17:51.400037   
4   2023-10-17 19:17:51.406036   
..                         ...   
85  2023-10-17 01:17:51.394654   
86  2023-10-17 00:17:51.419452   
87  2023-

In [74]:
# Function to perform NER on a text
def extract_named_entities(text):
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    words = nltk.word_tokenize(cleaned_text)
    pos_tags = nltk.pos_tag(words)
    named_entities = nltk.ne_chunk(pos_tags)
    return named_entities

In [75]:
headlines["named_entities"] = headlines["title"].apply(extract_named_entities)
print(headlines)



                                                title  \
0   Israel-Hamas war live updates: Israel denies t...   
1   Israel-Hamas war live: deadly Gaza hospital bl...   
2   UN Security Council to vote Wednesday on Israe...   
3   DOD Increases Deterrence Posture in Middle Eas...   
4   Biden considering $100 bln funding request tha...   
..                                                ...   
85  President Joe Biden will visit Israel in high-...   
86  Gaza tunnels give Hamas an advantage in fight ...   
87  Teenage sisters among 10 Britons still missing...   
88  'The people of Israel live': Hundreds rally in...   
89  Even if Israel Crushes Hamas, New Threats Will...   

                      datetime  \
0   2023-10-17 22:17:51.398034   
1   2023-10-17 22:17:51.397035   
2   2023-10-17 20:17:51.403037   
3   2023-10-17 20:17:51.400037   
4   2023-10-17 19:17:51.406036   
..                         ...   
85  2023-10-17 01:17:51.394654   
86  2023-10-17 00:17:51.419452   
87  2023-

In [85]:
from nltk import word_tokenize, pos_tag, ne_chunk

def extract_entities(text_file):
    with open(text_file, 'r') as f:
        text = f.read()
    entities = {}
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                entity = ' '.join(c[0] for c in chunk)
                entities[entity] = entities.get(entity, 0) + 1
    return entities

text_file = "news.txt"
entities = extract_entities(text_file)
print(entities)

{'Israel': 45, 'Gaza': 9, 'Palestinian': 2, 'Abbas': 1, 'Biden': 6, 'Middle East': 4, 'Jordan': 2, 'UN': 2, 'Security Council': 1, 'DOD': 1, 'Deterrence Posture': 1, 'Continues Aid': 1, 'Nonprofit': 1, 'Florida': 1, 'DeSantis': 1, 'Canadian': 1, 'Israeli': 4, 'Gigi': 1, 'China': 2, 'Diplomatic': 1, 'Elon Musk': 1, 'Netanyahu': 4, 'Tlaib': 1, 'Humanitarian': 1, 'Gaza Free': 1, 'McDonald': 1, 'White House': 2, 'U.S.': 4, 'Opinion': 1, 'Joe Biden Did Right': 1, 'Hamas First': 1, 'Cyprus': 1, 'EU': 1, 'Hamas': 13, 'Putin': 1, 'US': 5, 'How Does Israel': 1, 'Gaza Compare': 1, 'Iran': 5, 'Khamenei': 2, 'Gaza Rishi': 1, 'Sky News': 1, 'Fed': 1, 'Ukraine': 1, 'Malaysia': 1, 'Frankfurt': 1, 'WATCH': 1, 'Scholz': 1, 'Germany': 2, 'Lebanon-Israel': 1, 'Sderot Israelis': 1, 'Hamas Biden Will Visit Israel': 1, 'Russia': 1, 'West': 1, 'Will Israel': 1, 'Joly': 1, 'Ukraine Eyes Israel': 1, 'New': 2, 'Missile Defense': 1, 'Spain': 1, 'Negev Bedouins': 1, 'Scholz Visits Israel': 1, 'Special Commitment 

In [88]:
sorted_entities = dict(sorted(entities.items(), key=lambda item: item[1], reverse=True))
print(sorted_entities)

{'Israel': 45, 'Hamas': 13, 'Gaza': 9, 'Biden': 6, 'US': 5, 'Iran': 5, 'Middle East': 4, 'Israeli': 4, 'Netanyahu': 4, 'U.S.': 4, 'Palestinian': 2, 'Jordan': 2, 'UN': 2, 'China': 2, 'White House': 2, 'Khamenei': 2, 'Germany': 2, 'New': 2, 'House': 2, 'Abbas': 1, 'Security Council': 1, 'DOD': 1, 'Deterrence Posture': 1, 'Continues Aid': 1, 'Nonprofit': 1, 'Florida': 1, 'DeSantis': 1, 'Canadian': 1, 'Gigi': 1, 'Diplomatic': 1, 'Elon Musk': 1, 'Tlaib': 1, 'Humanitarian': 1, 'Gaza Free': 1, 'McDonald': 1, 'Opinion': 1, 'Joe Biden Did Right': 1, 'Hamas First': 1, 'Cyprus': 1, 'EU': 1, 'Putin': 1, 'How Does Israel': 1, 'Gaza Compare': 1, 'Gaza Rishi': 1, 'Sky News': 1, 'Fed': 1, 'Ukraine': 1, 'Malaysia': 1, 'Frankfurt': 1, 'WATCH': 1, 'Scholz': 1, 'Lebanon-Israel': 1, 'Sderot Israelis': 1, 'Hamas Biden Will Visit Israel': 1, 'Russia': 1, 'West': 1, 'Will Israel': 1, 'Joly': 1, 'Ukraine Eyes Israel': 1, 'Missile Defense': 1, 'Spain': 1, 'Negev Bedouins': 1, 'Scholz Visits Israel': 1, 'Special

Column "title" has been saved as text in news.txt
