In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
from text_cleaner import TextCleaner

In [2]:
def get_idf_weight(texts, output):
    """Compute the IDF weight of every 1- to 3-gram in ``texts`` and save the table as CSV.

    Parameters
    ----------
    texts : iterable of str
        Documents handed to ``CountVectorizer.fit_transform`` to build the
        n-gram vocabulary and the document/term count matrix.
    output : str, path-like or file-like
        Destination passed to ``DataFrame.to_csv``. When it is a filesystem
        path, a missing parent directory is created first.

    Returns
    -------
    pandas.DataFrame
        Indexed by n-gram, with the columns ``idf_weights`` and ``word``,
        sorted ascending by ``idf_weights``. The CSV contains the same two
        columns; the index is not written.

    Side effects
    ------------
    Prints the shape of the count matrix (documents x n-grams).
    """
    import os  # local import so the function does not depend on the import cell

    # Create the target folder up front, so a missing directory is discovered
    # before the (potentially very large) vocabulary is fitted.
    if isinstance(output, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Count unigrams, bigrams and trigrams.
    cv = CountVectorizer(ngram_range=(1, 3))

    # Sparse matrix of n-gram counts, one row per document.
    word_count_vector = cv.fit_transform(texts)

    print(word_count_vector.shape)

    # Only the fitted idf_ vector is used; no TF-IDF transform is applied.
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old name on older installations.
    if hasattr(cv, "get_feature_names_out"):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names, columns=["idf_weights"])
    # Duplicate the index as a regular column so it survives to_csv(index=False).
    df_idf['word'] = df_idf.index

    # Sort ascending: the n-grams occurring in the most documents come first.
    df_idf = df_idf.sort_values(by=['idf_weights'])

    # Save the idf weights.
    df_idf.to_csv(output, index=False)

    return df_idf
    

In [3]:
def clean_text(data_fp):
    """Read the CSV at ``data_fp`` and preprocess its ``text_clean`` column.

    Each value of the column is passed through ``TextCleaner.preprocess``
    (per the original note, this covers stop-word and extra-space removal;
    the implementation lives in ``text_cleaner`` and is not shown here).

    Returns the pandas Series of preprocessed texts.
    """
    frame = pd.read_csv(data_fp)
    preprocess = TextCleaner().preprocess
    return frame['text_clean'].apply(preprocess)
    

In [4]:
"""1960-coal"""
data_fp = '../data/selected-data/1960/1960s_coal_labeled_full_0.95.csv'
output = '../output/idf/coal_1960_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(4626, 633921)
           idf_weights       word
kolen         1.708172      kolen
jaar          1.807263       jaar
miljoen       2.106641    miljoen
ton           2.145287        ton
moeten        2.196151     moeten
...                ...        ...
enige         3.382880      enige
bedrijven     3.382880  bedrijven
miljard       3.389931    miljard
bovendien     3.394659  bovendien
bedrijf       3.397031    bedrijf

[100 rows x 2 columns]


In [6]:
"""1960-oil"""
data_fp = '../data/selected-data/1960/1960s_oil_labeled_full_0.95.csv'
output = '../output/idf/oil_1960_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(57196, 5492612)
          idf_weights      word
olie         1.837110      olie
jaar         2.375046      jaar
grote        2.427064     grote
wel          2.471526       wel
echter       2.528321    echter
...               ...       ...
jaren        3.460742     jaren
waarden      3.471862   waarden
beter        3.472691     beter
minister     3.473105  minister
gedaan       3.477886    gedaan

[100 rows x 2 columns]


In [7]:
"""1960-gas"""
data_fp = '../data/selected-data/1960/1960s_gas_labeled_full_0.95.csv'
output = '../output/idf/gas_1960_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(40816, 3612065)
          idf_weights      word
aardgas      2.161217   aardgas
gas          2.182171       gas
jaar         2.421323      jaar
wel          2.466901       wel
grote        2.669828     grote
...               ...       ...
zeggen       3.819974    zeggen
verwacht     3.821619  verwacht
werk         3.829472      werk
hadden       3.833630    hadden
weinig       3.842839    weinig

[100 rows x 2 columns]


In [8]:
"""1970-coal"""
data_fp = '../data/selected-data/1970/1970s_coal_labeled_full_0.95.csv'
output = '../output/idf/coal_1970_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(5388, 594354)
           idf_weights       word
jaar          1.816419       jaar
kolen         1.886402      kolen
olie          1.953435       olie
moeten        2.149037     moeten
steenkool     2.205025  steenkool
...                ...        ...
gevolg        3.554244     gevolg
nemen         3.561430      nemen
grootste      3.580848   grootste
problemen     3.588228  problemen
binnen        3.593179     binnen

[100 rows x 2 columns]


In [9]:
"""1970-oil"""
data_fp = '../data/selected-data/1970/1970s_oil_labeled_full_0.95.csv'
output = '../output/idf/oil_1970_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(96189, 7342717)
          idf_weights      word
olie         1.845996      olie
jaar         2.292204      jaar
wel          2.526146       wel
grote        2.634172     grote
moeten       2.717297    moeten
...               ...       ...
ter          3.688847       ter
stemming     3.692679  stemming
waarbij      3.699149   waarbij
miljard      3.701780   miljard
tijdens      3.703797   tijdens

[100 rows x 2 columns]


In [10]:
"""1970-coal"""
# NOTE(review): same input and output paths as the earlier 1970-coal cell;
# running this cell recomputes the table and overwrites the same CSV.
data_fp = '../data/selected-data/1970/1970s_coal_labeled_full_0.95.csv'
output = '../output/idf/coal_1970_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(5388, 594354)
           idf_weights       word
jaar          1.816419       jaar
kolen         1.886402      kolen
olie          1.953435       olie
moeten        2.149037     moeten
steenkool     2.205025  steenkool
...                ...        ...
gevolg        3.554244     gevolg
nemen         3.561430      nemen
grootste      3.580848   grootste
problemen     3.588228  problemen
binnen        3.593179     binnen

[100 rows x 2 columns]


In [11]:
"""1970-gas"""
data_fp = '../data/selected-data/1970/1970s_gas_labeled_full_0.95.csv'
output = '../output/idf/gas_1970_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(51678, 3817821)
          idf_weights      word
gas          2.346370       gas
jaar         2.355109      jaar
wel          2.404947       wel
aardgas      2.447558   aardgas
moeten       2.623547    moeten
...               ...       ...
kosten       3.991080    kosten
gisteren     4.014463  gisteren
weten        4.017228     weten
tien         4.020396      tien
bekend       4.022381    bekend

[100 rows x 2 columns]


In [12]:
"""1980-coal"""
data_fp = '../data/selected-data/1980/1980s_coal_labeled_full_0.95.csv'
output = '../output/idf/coal_1980_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(29289, 541157)
              idf_weights          word
kolen            1.773372         kolen
jaar             1.816691          jaar
olie             2.182246          olie
moeten           2.265822        moeten
steenkool        2.310763     steenkool
...                   ...           ...
brandstoffen     3.561881  brandstoffen
gebied           3.575245        gebied
nemen            3.579289         nemen
zeker            3.588789         zeker
ter              3.597004           ter

[100 rows x 2 columns]


In [13]:
"""1980-oil"""
data_fp = '../data/selected-data/1980/1980s_oil_labeled_full_0.95.csv'
output = '../output/idf/oil_1980_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(1474, 366538)
           idf_weights       word
jaar          1.316337       jaar
olie          1.358129       olie
procent       1.613052    procent
per           1.711622        per
miljoen       1.722733    miljoen
...                ...        ...
gemaakt       2.845375    gemaakt
daarbij       2.849676    daarbij
steenkool     2.853996  steenkool
rond          2.858334       rond
problemen     2.862691  problemen

[100 rows x 2 columns]


In [14]:
"""1980-gas"""
data_fp = '../data/selected-data/1980/1980s_gas_labeled_full_0.95.csv'
output = '../output/idf/gas_1980_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(174378, 4376038)
           idf_weights       word
jaar          2.207970       jaar
gas           2.285706        gas
wel           2.412221        wel
moeten        2.658019     moeten
volgens       2.796387    volgens
...                ...        ...
geval         3.937148      geval
geleden       3.939965    geleden
geven         3.941050      geven
bedrijven     3.950652  bedrijven
via           3.960457        via

[100 rows x 2 columns]


In [15]:
"""1990-coal"""
data_fp = '../data/selected-data/1990/1990s_coal_labeled_full_0.95.csv'
output = '../output/idf/coal_1990_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(2259, 145380)
         idf_weights     word
it          1.763722       it
jaar        1.799491     jaar
kolen       2.010164    kolen
olie        2.176335     olie
volgens     2.177770  volgens
...              ...      ...
zeer        3.385582     zeer
zoden       3.400110    zoden
den         3.409914      den
bijna       3.414852    bijna
oost        3.414852     oost

[100 rows x 2 columns]


In [16]:
"""1990-oil"""
data_fp = '../data/selected-data/1990/1990s_oil_labeled_full_0.95.csv'
output = '../output/idf/oil_1990_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(87740, 3588526)
          idf_weights      word
olie         1.804361      olie
it           2.026976        it
jaar         2.154149      jaar
volgens      2.533060   volgens
miljoen      2.570717   miljoen
...               ...       ...
index        3.520936     index
handel       3.530338    handel
vraag        3.531484     vraag
steeg        3.532200     steeg
minister     3.534497  minister

[100 rows x 2 columns]


In [17]:
"""1990-gas"""
data_fp = '../data/selected-data/1990/1990s_gas_labeled_full_0.95.csv'
output = '../output/idf/gas_1990_idf.csv'

# Preprocess the texts, compute and save their IDF table,
# then print the 100 n-grams with the lowest IDF weight.
cleaned_texts = clean_text(data_fp)
df_idf = get_idf_weight(cleaned_texts, output)
print(df_idf.head(100))

(16127, 2041858)
          idf_weights      word
it           2.124426        it
gas          2.134787       gas
jaar         2.175127      jaar
wel          2.429372       wel
knnen        2.563865     knnen
...               ...       ...
milie        3.799740     milie
gemaakt      3.808956   gemaakt
ongeveer     3.814114  ongeveer
staan        3.817221     staan
zeker        3.823464     zeker

[100 rows x 2 columns]
