In [37]:
import gensim
from gensim.models.phrases import Phrases, Phraser
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
df = pd.read_csv("summaries.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/norika_machome/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/norika_machome/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/norika_machome/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
df['Year'] = pd.to_datetime(df['Year']).dt.year

lemmatizer = WordNetLemmatizer()

# Define a function to clean, tokenize, and lemmatize the text
def clean_and_tokenize(text):
    # Remove newline characters
    text = text.replace('\n', ' ')

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove punctuation and stopwords, and lemmatize
    stop_words = set(stopwords.words('english'))
    tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.isalpha() and word.lower() not in stop_words]
    tokens = ['artificial_intelligence' if word == 'ai' else word for word in tokens]
    tokens = ['iot' if word == 'internet_of_things' else word for word in tokens]

    # Join the tokens back into a string for Cleaned Text
    cleaned_text = ' '.join(tokens)
    return cleaned_text, tokens

# Apply the cleaning and tokenization function to the 'Text' column
df['Cleaned Text'], df['Tokenized Text'] = zip(*df['Text'].apply(clean_and_tokenize))

# Train a bigram model
phrases = Phrases(df['Tokenized Text'], min_count=5, threshold=10)
bigram = Phraser(phrases)

# Apply the bigram model to transform the tokenized text
df['Bigram Text'] = df['Tokenized Text'].apply(lambda x: bigram[x])

df.head()

Unnamed: 0,Year,Text,Cleaned Text,Tokenized Text,Bigram Text
0,1970,\n\nScience and Technology in Society forum\n\...,science technology society forum science techn...,"[science, technology, society, forum, science,...","[science_technology, society_forum, science_te..."
1,1970,\nScience and Technology in Society (STS) foru...,science technology society sts forum light sha...,"[science, technology, society, sts, forum, lig...","[science_technology, society, sts_forum, light..."
2,1970,\nScience and Technology in Society (STS) foru...,science technology society sts forum light sha...,"[science, technology, society, sts, forum, lig...","[science_technology, society, sts_forum, light..."
3,1970,Science and Technology in Society (STS) forum...,science technology society sts forum light sha...,"[science, technology, society, sts, forum, lig...","[science_technology, society, sts_forum, light..."
4,1970,STS forum 2008 I\nScience and Technology...,sts forum science technology society sts forum...,"[sts, forum, science, technology, society, sts...","[sts_forum, science_technology, society, sts_f..."


In [28]:
# Define a function to count frequencies
def count_frequencies(tokens_list):
    all_tokens = [token for tokens in tokens_list for token in tokens]
    return Counter(all_tokens)

# Count frequencies for tokenized text
tokenized_freq = count_frequencies(df['Tokenized Text'])
bigram_freq = count_frequencies(df['Bigram Text'])

# Get the top 100 words/bigrams
top_1000_tokenized = tokenized_freq.most_common(1000)
top_1000_bigrams = bigram_freq.most_common(1000)

# Convert to DataFrame for better visualization
top_1000_tokenized_df = pd.DataFrame(top_1000_tokenized, columns=['Word/Bigram', 'Frequency'])
top_1000_bigrams_df = pd.DataFrame(top_1000_bigrams, columns=['Word/Bigram', 'Frequency'])

# Save to CSV files
top_1000_tokenized_df.to_csv('top_1000_tokenized.csv', index=False)
top_1000_bigrams_df.to_csv('top_1000_bigrams.csv', index=False)

Bigram Yearly Frequencies:
      also  technology  science  need  research  science_technology  society  \
2004     0           0        0     0         0                   0        0   
2005     0           0        0     0         0                   0        0   
2006     0           0        0     0         0                   0        0   
2007     0           0        0     0         0                   0        0   
2008     0           0        0     0         0                   0        0   

      new  innovation  must  ...  african  language  unique  report  \
2004    0           0     0  ...        0         0       0       0   
2005    0           0     0  ...        0         0       0       0   
2006    0           0     0  ...        0         0       0       0   
2007    0           0     0  ...        0         0       0       0   
2008    0           0     0  ...        0         0       0       0   

      nutrition  trying  public_private  mankind  accessible  rec

In [31]:
# Identify bigrams with significant changes
drastic_change_bigrams = []

# Define a threshold for significant change
change_threshold = 20  # This can be adjusted based on the data

for bigram in bigram_yearly_freq_df.columns:
    yearly_freq = bigram_yearly_freq_df[bigram]
    max_freq = yearly_freq.max()
    min_freq = yearly_freq.min()
    
    # Check for emergence (start with low frequency, then increase significantly)
    if min_freq == 0 and max_freq >= change_threshold:
        drastic_change_bigrams.append((bigram, 'emergence'))
    
    # Check for disappearance (start with high frequency, then drop significantly)
    if max_freq > change_threshold and min_freq == 0:
        drastic_change_bigrams.append((bigram, 'disappearance'))
    
    # Check for gaps in the middle of the timeline
    if yearly_freq[yearly_freq == 0].any() and (yearly_freq.max() >= change_threshold):
        drastic_change_bigrams.append((bigram, 'gap'))

# Convert the list to a DataFrame
drastic_change_bigrams_df = pd.DataFrame(drastic_change_bigrams, columns=['Bigram', 'Change Type'])

print("Bigrams with Drastic Changes:")
print(drastic_change_bigrams_df)


Bigrams with Drastic Changes:
Empty DataFrame
Columns: [Bigram, Change Type]
Index: []


In [None]:
# Exclude unwanted words and bigrams
exclude_words = {
    'said', 'could', 'many', 'also', 'must', 'may', 'would', 'us', 'one', 'new', 
    'however', 'important', 'use', 'well', 'including', 'explained', 'noted', 
    'example', 'order', 'stated', 'often', 'session', 'jp', 'session_chair', 'plenary_session',
    'around_world', 'chief_executive', 'executive_officer', 'vice_president', 'also_discussed',
    'participants', 'president', 'percent', 'shadows', 'sts_forum', 'director', 'society_forum',
    'concurrent_sessions', 'former_president',
    'minister', 'issue', 'issues', 'chair', 'please_contact', 'speakers', 'speaker', 'chairman', 'much',
    'building_nagatacho', 'opening_remarks'
}

# Define a function to filter out excluded words and bigrams
def filter_excluded_words(bigram_text):
    return [word for word in bigram_text if word not in exclude_words]

# Apply the exclusion function
df['Filtered Bigram Text'] = df['Bigram Text'].apply(filter_excluded_words)

