In [1]:
import spacy
import warnings
import itertools
import collections
import pandas as pd

# Optional display tweak, currently disabled: un-comment to change pandas'
# 'max_colwidth' display option.
# pd.set_option('max_colwidth', None)
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load spaCy's "en_core_web_sm" pipeline once; `nlp` is used further down to
# lemmatize n-gram phrases.
nlp = spacy.load("en_core_web_sm")

## Years

In [None]:
# Count how often each "publication year" value occurs across all documents
# and write the tally, sorted by year string, to years.csv.
#
# NOTE(review): `df` is not created in this cell or in any cell above it; it
# must already exist in the kernel with a 'fullText' column -- confirm which
# frame is intended before relying on Restart & Run All.
year_counts = collections.Counter()
for text in df['fullText']:
    if not isinstance(text, str):
        # Missing text (e.g. NaN) has no lines to scan.
        continue
    for sentence in text.split("\n"):
        if "publication year" not in sentence.lower():
            continue
        fields = sentence.split(":")
        if len(fields) < 2:
            # The phrase appears without a "label: value" colon; nothing to extract.
            continue
        # The value is whatever sits between the first and second colon,
        # with spaces removed.
        year_counts[fields[1].replace(" ", "")] += 1

# Keys are strings, so this is a lexicographic sort.
years = collections.OrderedDict(sorted(year_counts.items()))
pd.DataFrame(list(years.items()), columns=['year', 'count']).to_csv("years.csv", index=False)

## Dates

In [None]:
# Count how often each "publication date" value occurs across all documents,
# split every date into year / month / day columns and write the result,
# ordered chronologically, to dates.csv.
#
# NOTE(review): `df` is not created in this cell or in any cell above it; it
# must already exist in the kernel with a 'fullText' column -- confirm which
# frame is intended before relying on Restart & Run All.
date_counts = collections.Counter()
for text in df['fullText']:
    if not isinstance(text, str):
        # Missing text (e.g. NaN) has no lines to scan.
        continue
    for sentence in text.split("\n"):
        if "publication date" not in sentence.lower():
            continue
        fields = sentence.split(":")
        if len(fields) < 2:
            # The phrase appears without a "label: value" colon; nothing to extract.
            continue
        # The value is whatever sits between the first and second colon,
        # with spaces removed (so it looks like "<Mon><day>,<year>").
        date_counts[fields[1].replace(" ", "")] += 1
dates = collections.OrderedDict(sorted(date_counts.items()))

dates_df = pd.DataFrame(list(dates.items()), columns=['full_date', 'count'])

# Split on the first comma only and pick the pieces by position, so a date
# with no comma (year becomes NaN) or an empty frame does not raise.
date_parts = dates_df['full_date'].str.split(',', n=1)
month_day = date_parts.str[0]
dates_df['year'] = date_parts.str[1]
# Re-insert a space after the three-letter month prefix for readability.
dates_df['full_date'] = dates_df['full_date'].str[:3] + " " + dates_df['full_date'].str[3:]
dates_df['month'] = month_day.str[:3]
dates_df['day'] = month_day.str[3:]

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# An ordered categorical makes months sort in calendar order, not alphabetically.
dates_df['month'] = pd.Categorical(dates_df['month'], categories=months, ordered=True)

# 'day' holds strings, so sorting it directly would put "10" before "2".
# Sort on a temporary numeric copy and drop it again so the CSV columns and
# their text stay as they were.
(
    dates_df
    .assign(_day_number=pd.to_numeric(dates_df['day'], errors='coerce'))
    .sort_values(['year', 'month', '_day_number'], ascending=[True, True, True])
    .drop(columns='_day_number')
    .to_csv("dates.csv", index=False)
)

## Count based on ID

In [12]:
# Start from an empty output directory: recursively delete any existing
# count_id folder, then create it again. The CSVs written by the next cell go here.
!rm -r count_id
!mkdir count_id

In [14]:
def _collect_automation_ngrams(frame, kind, n):
    """Collect every n-gram containing 'automation' from ``frame``.

    Parameters
    ----------
    frame : DataFrame with an 'id' column and a ``<kind>Count`` column whose
        cells are dicts mapping an n-gram string to its count.
    kind : 'bigram' or 'trigram'; selects the source column and names the
        output count column ``automation_<kind>_count``.
    n : number of words per n-gram; produces columns word_1 .. word_n.

    Returns
    -------
    DataFrame with columns id, word_1..word_n, automation_<kind>_count. The
    columns are present even when nothing matched.
    """
    count_field = 'automation_{}_count'.format(kind)
    word_fields = ['word_{}'.format(position) for position in range(1, n + 1)]
    records = []
    for doc_id, ngram_counts in zip(frame['id'], frame[kind + 'Count']):
        if not isinstance(ngram_counts, dict):
            # Missing n-gram data for this document (e.g. NaN).
            continue
        for key, value in ngram_counts.items():
            if 'automation' not in key.lower():
                continue
            words = key.split()
            if len(words) < n:
                # Malformed key with too few words; cannot fill word_1..word_n.
                continue
            record = {'id': doc_id}
            # zip stops after n fields, so only the first n words are kept.
            record.update(zip(word_fields, words))
            record[count_field] = value
            records.append(record)
    # Build the frame once from a list of records: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    return pd.DataFrame(records, columns=['id'] + word_fields + [count_field])


def _sum_counts_per_id(ngram_df, count_field):
    """Return one row per id with the summed ``count_field``, renamed to ``<count_field>_grouped``."""
    grouped_field = count_field + '_grouped'
    if ngram_df.empty:
        # Nothing matched: return an empty frame that still merges cleanly.
        return pd.DataFrame(columns=['id', grouped_field])
    return (
        ngram_df
        .groupby(['id'], as_index=False)[count_field]
        .sum()
        .rename(columns={count_field: grouped_field})
    )


# For every decade file, count occurrences of n-grams containing "automation"
# per document id and write one CSV per n-gram size into count_id/.
folders = ['1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009','2010-2019', '2020-2029']
for prefix in folders:
    df = pd.read_json(path_or_buf="jsonl/"+prefix+'.jsonl', lines=True)
    display(df.head(1))

    # unigram: total count of all unigrams containing "automation", per document
    for index, row in df.iterrows():
        count = sum(value for key, value in row['unigramCount'].items() if 'automation' in key.lower())
        df.loc[index, 'automation_unigram_count'] = count
    df[['id', 'automation_unigram_count']].to_csv("count_id/"+prefix+"_unigram.csv", index=False)
    print("Unigram CSV for {} has been created".format(prefix))

    # bigram: one row per (document, matching bigram) plus the per-document total
    bigram_df = _collect_automation_ngrams(df, 'bigram', 2)
    bigram_df_grouped = _sum_counts_per_id(bigram_df, 'automation_bigram_count')
    pd.merge(bigram_df, bigram_df_grouped, on='id').to_csv("count_id/"+prefix+"_bigram.csv", index=False)
    print("Bigram CSV for {} has been created".format(prefix))

    # trigram: one row per (document, matching trigram) plus the per-document total
    trigram_df = _collect_automation_ngrams(df, 'trigram', 3)
    trigram_df_grouped = _sum_counts_per_id(trigram_df, 'automation_trigram_count')
    pd.merge(trigram_df, trigram_df_grouped, on='id').to_csv("count_id/"+prefix+"_trigram.csv", index=False)
    print("Trigram CSV for {} has been created.\n".format(prefix))

Unnamed: 0,id,outputFormat,wordCount,fullText,unigramCount,bigramCount,trigramCount
0,111665565,"[unigram, bigram, trigram, fullText]",285,metals industry speedsconversion average incre...,"{'metals': 3, 'industry': 1, 'speedsconversion...","{'metals industry': 1, 'industry speedsconvers...","{'metals industry speedsconversion': 1, 'indus..."


Unigram CSV for count_id/1950-1959 has been created
Bigram CSV for count_id/1950-1959 has been created
Trigram CSV for count_id/1950-1959 has been created.



Unnamed: 0,id,outputFormat,wordCount,fullText,unigramCount,bigramCount,trigramCount
0,114925995,"[unigram, bigram, trigram, fullText]",92,unions demanding programs washington upia grou...,"{'unions': 3, 'demanding': 1, 'programs': 2, '...","{'unions demanding': 1, 'demanding programs': ...","{'unions demanding programs': 1, 'demanding pr..."


Unigram CSV for count_id/1960-1969 has been created
Bigram CSV for count_id/1960-1969 has been created
Trigram CSV for count_id/1960-1969 has been created.



Unnamed: 0,id,outputFormat,wordCount,fullText,unigramCount,bigramCount,trigramCount
0,117783758,"[unigram, bigram, trigram, fullText]",450,costs alexander hammer specialize automate ord...,"{'costs': 9, 'alexander': 1, 'hammer': 1, 'spe...","{'costs alexander': 1, 'alexander hammer': 1, ...","{'costs alexander hammer': 1, 'alexander hamme..."


Unigram CSV for count_id/1970-1979 has been created
Bigram CSV for count_id/1970-1979 has been created
Trigram CSV for count_id/1970-1979 has been created.



Unnamed: 0,id,outputFormat,wordCount,fullText,unigramCount,bigramCount,trigramCount
0,427454541,"[unigram, bigram, trigram, fullText]",352,number basic reasons think outcry system tradi...,"{'number': 3, 'basic': 2, 'reasons': 2, 'think...","{'number basic': 2, 'basic reasons': 2, 'reaso...","{'number basic reasons': 2, 'basic reasons thi..."


Unigram CSV for count_id/1980-1989 has been created
Bigram CSV for count_id/1980-1989 has been created
Trigram CSV for count_id/1980-1989 has been created.



Unnamed: 0,id,outputFormat,wordCount,fullText,unigramCount,bigramCount,trigramCount
0,2234013923,"[unigram, bigram, trigram, fullText]",236,governments coordinator 2000 conversion today ...,"{'governments': 1, 'coordinator': 1, '2000': 3...","{'governments coordinator': 1, 'coordinator 20...","{'governments coordinator 2000': 1, 'coordinat..."


Unigram CSV for count_id/1990-1999 has been created
Bigram CSV for count_id/1990-1999 has been created
Trigram CSV for count_id/1990-1999 has been created.



Unnamed: 0,id,outputFormat,wordCount,fullText,unigramCount,bigramCount,trigramCount
0,434257357,"[unigram, bigram, trigram, fullText]",377,replication enabled digital coming tactile wor...,"{'replication': 1, 'enabled': 2, 'digital': 5,...","{'replication enabled': 1, 'enabled digital': ...","{'replication enabled digital': 1, 'enabled di..."


Unigram CSV for count_id/2000-2009 has been created
Bigram CSV for count_id/2000-2009 has been created
Trigram CSV for count_id/2000-2009 has been created.



Unnamed: 0,id,outputFormat,wordCount,fullText,unigramCount,bigramCount,trigramCount
0,1857295624,"[unigram, bigram, trigram, fullText]",1813,president obama delivered farewell address chi...,"{'president': 7, 'obama': 1, 'delivered': 1, '...","{'president obama': 1, 'obama delivered': 1, '...","{'president obama delivered': 1, 'obama delive..."


Unigram CSV for count_id/2010-2019 has been created
Bigram CSV for count_id/2010-2019 has been created
Trigram CSV for count_id/2010-2019 has been created.



Unnamed: 0,id,outputFormat,wordCount,fullText,unigramCount,bigramCount,trigramCount
0,2472937624,"[unigram, bigram, trigram, fullText]",425,sleep procrastinate better money 2021what terr...,"{'sleep': 7, 'procrastinate': 4, 'better': 4, ...","{'sleep procrastinate': 1, 'procrastinate bett...","{'sleep procrastinate better': 1, 'procrastina..."


Unigram CSV for count_id/2020-2029 has been created
Bigram CSV for count_id/2020-2029 has been created
Trigram CSV for count_id/2020-2029 has been created.



# Count based on words

In [56]:
# Start from an empty output directory: recursively delete any existing
# count_word folder, then create it again. The CSVs written by the next cell go here.
!rm -r count_word
!mkdir count_word

In [57]:
def _aggregate_ngram_counts(in_path, out_path, word_columns, count_column):
    """Sum ``count_column`` per order-insensitive n-gram and write it to ``out_path``.

    Reads the per-id CSV at ``in_path``, joins ``word_columns`` into a single
    space-separated phrase, sorts the words inside each phrase so that
    "a b" and "b a" fall into the same group, sums ``count_column`` per
    phrase and writes the result (with pandas' default index column) to
    ``out_path``. Returns the aggregated DataFrame.
    """
    frame = pd.read_csv(
        in_path,
        # Words must stay text: an all-numeric column (e.g. "2000") would
        # otherwise be parsed as int and break string concatenation.
        dtype={column: str for column in word_columns},
        # Words such as "null", "nan" or "n/a" are ordinary tokens here;
        # pandas' default NA parsing would turn them into NaN.
        keep_default_na=False,
    )
    phrases = frame[word_columns[0]]
    for column in word_columns[1:]:
        phrases = phrases + " " + frame[column]
    # sort for aggregation
    frame['word'] = phrases.apply(lambda phrase: " ".join(sorted(phrase.split(" "))))
    totals = frame.groupby('word')[count_column].agg('sum').reset_index()
    totals.to_csv(out_path)
    return totals


# Collapse the per-document n-gram counts in count_id/ into per-phrase totals
# in count_word/, one bigram and one trigram CSV per decade.
files = ['1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009','2010-2019', '2020-2029']
for file in files:
    df = _aggregate_ngram_counts(
        "count_id/"+file+"_bigram.csv",
        "count_word/"+file+"_bigram.csv",
        ["word_1", "word_2"],
        'automation_bigram_count',
    )

    df = _aggregate_ngram_counts(
        "count_id/"+file+"_trigram.csv",
        "count_word/"+file+"_trigram.csv",
        ["word_1", "word_2", "word_3"],
        'automation_trigram_count',
    )
print("*"*20+"FINISHED"+"*"*20)

********************FINISHED********************


# Lemmatization

In [3]:
# Start from an empty output directory: recursively delete any existing
# count_word_lemmatized folder, then create it again. The CSVs written by the
# lemmatization cell below go here.
!rm -r count_word_lemmatized
!mkdir count_word_lemmatized

In [4]:
def lemmatize_text(text):
    """Return the lemma of every token in ``text`` as a list of strings.

    Uses the notebook-level spaCy pipeline ``nlp`` for both tokenization and
    lemmatization, the same way the aggregation cells in this notebook do.
    The previous body relied on ``lemmatizer`` and ``w_tokenizer``, which are
    not defined or imported in the visible notebook cells, so calling it
    raised NameError.
    """
    return [token.lemma_ for token in nlp(text)]

In [5]:
def _aggregate_lemmatized_ngram_counts(in_path, out_path, word_columns, count_column):
    """Sum ``count_column`` per lemmatized, order-insensitive n-gram.

    Reads the per-id CSV at ``in_path``, joins ``word_columns`` into a single
    space-separated phrase, replaces every token by its spaCy lemma (using
    the notebook-level ``nlp`` pipeline), sorts the words inside each phrase
    so that word order does not split groups, sums ``count_column`` per
    phrase and writes the result (with pandas' default index column) to
    ``out_path``. Returns the aggregated DataFrame.
    """
    frame = pd.read_csv(
        in_path,
        # Words must stay text: an all-numeric column (e.g. "2000") would
        # otherwise be parsed as int and break string concatenation.
        dtype={column: str for column in word_columns},
        # Words such as "null", "nan" or "n/a" are ordinary tokens here;
        # pandas' default NA parsing would turn them into NaN.
        keep_default_na=False,
    )
    phrases = frame[word_columns[0]]
    for column in word_columns[1:]:
        phrases = phrases + " " + frame[column]

    # lemmatization -- the same phrase occurs in many rows, so run the
    # pipeline once per distinct phrase and map the result back.
    lemma_by_phrase = {
        phrase: " ".join([token.lemma_ for token in nlp(phrase)])
        for phrase in phrases.unique()
    }
    lemmatized = phrases.map(lemma_by_phrase)

    # sort for aggregation
    frame['word'] = lemmatized.apply(lambda phrase: " ".join(sorted(phrase.split(" "))))
    totals = frame.groupby('word')[count_column].agg('sum').reset_index()
    totals.to_csv(out_path)
    return totals


# Collapse the per-document n-gram counts in count_id/ into per-lemma-phrase
# totals in count_word_lemmatized/, one bigram and one trigram CSV per decade.
files = ['1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2009','2010-2019', '2020-2029']
for file in files:
    df = _aggregate_lemmatized_ngram_counts(
        "count_id/"+file+"_bigram.csv",
        "count_word_lemmatized/"+file+"_bigram.csv",
        ["word_1", "word_2"],
        'automation_bigram_count',
    )

    df = _aggregate_lemmatized_ngram_counts(
        "count_id/"+file+"_trigram.csv",
        "count_word_lemmatized/"+file+"_trigram.csv",
        ["word_1", "word_2", "word_3"],
        'automation_trigram_count',
    )
print("*"*20+"FINISHED"+"*"*20)

********************FINISHED********************


In [5]:
# Quick sanity check: print the lemma spaCy assigns to each token of a
# sample word, one lemma per line.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("dangerous")
lemmas = [token.lemma_ for token in doc]
for lemma in lemmas:
    print(lemma)

dangerous
