In [3]:
import os
import pandas as pd
import glob
import re
import spacy
from codecarbon import EmissionsTracker

In [14]:
def emissions_tracker(tracker_outpath):
    tracker = EmissionsTracker(project_name = "assignment 1",
                                output_dir = tracker_outpath,
                                output_file = "emissions_assignment1.csv")
    return tracker


def load_spacy():
    """ Load SpaCy model """
    nlp = spacy.load("en_core_web_md")
    return nlp


def remove_metadata(text):
    """ Remove metadata from text input """
    return re.sub(r"<*?>", "", text)


def count_pos(doc):
    """
    Count the number of each part-of-speech (POS) tag in the document. The function takes a spaCy doc
    object as input and returns a tuple containing counts of nouns, verbs, adjectives, and adverbs.
    """
    noun_count, verb_count, adv_count, adj_count = 0, 0, 0, 0

    for token in doc:
        if token.pos_ == "NOUN":
            noun_count += 1
        elif token.pos_ == "VERB":
            verb_count += 1
        elif token.pos_ == "ADV":
            adv_count += 1
        elif token.pos_ == "ADJ":
            adj_count += 1

    return noun_count, verb_count, adv_count, adj_count


def rel_freq(count, len_doc): 
    """
    Calculate the relative frequency per 10,000 words and round the decimals. The function takes the number of
    POS and the total number of tokens in the given text, while returns the relative frequency.
    """
    return round((count/len_doc * 10000), 2)


def no_unique_ents(doc):
    """
    The function counts the total number of unique PER, LOC, and ORG entities. The function takes a spaCy doc
    object as input and returns a list containing counts of unique entities.

    """
    enteties = []

    for ent in doc.ents: 
        enteties.append((ent.text, ent.label_))

    enteties_df = pd.DataFrame(enteties, columns=["enteties", "label"])
    enteties_df = enteties_df.drop_duplicates()
    unique_counts = enteties_df.value_counts(subset = "label")
    
    unique_labels = ['PERSON', 'LOC', 'ORG']
    unique_row = []

    for label in unique_labels:
        if label in (unique_counts.index):
            unique_row.append(unique_counts[label])
        else:
            unique_row.append(0)

    return unique_row


def process_text(filepath):
    """ 
    The function iterates over .txt files in the subfolders and extracts linguistic features. Also, it creates a
    Pandas DataFrame to store and append the extracted features for each file.
    """

    for subfolder in sorted(os.listdir(filepath)):
        subfolder_path = os.path.join(filepath, subfolder)

        print(subfolder_path)

        out_df = pd.DataFrame(columns = ("Filename", "RelFreq NOUN", "RelFreq VERB", "RelFreq ADV",
                                        "RelFreq ADJ", "No. Unique PER", "No. Unique LOC", "No. Unique ORG"))

        csv_outpath = os.path.join("out", f"{subfolder}.csv")

        for file in sorted(glob.glob(os.path.join(subfolder_path, "*.txt"))):
            with open(file, "r", encoding = "latin-1") as f:
                text = f.read()
                doc = nlp(remove_metadata(text))

            noun_count, verb_count, adv_count, adj_count = count_pos(doc)
            len_doc = len(doc)
            noun_rel_freq, verb_rel_freq, adv_rel_freq, adj_rel_freq = rel_freq(noun_count, len_doc), rel_freq(verb_count, len_doc), rel_freq(adv_count, len_doc), rel_freq(adj_count, len_doc)
            No_unique_per, No_unique_loc, No_unique_org = no_unique_ents(doc)
            
            text_name = file.split("/")[-1]

            text_row = [text_name, noun_rel_freq, verb_rel_freq, adv_rel_freq, adj_rel_freq,
                        No_unique_per, No_unique_loc, No_unique_org]
            out_df.loc[len(out_df)] = text_row

        csv_outpath = os.path.join("out", f"{subfolder}_data_test.csv")
        out_df.to_csv(csv_outpath)



In [15]:
tracker_outpath = "../assignment-5/out"
tracker = emissions_tracker(tracker_outpath)
tracker.start()

nlp = load_spacy()

filepath = os.path.join("..", "..", "..", "..", "cds-lang-data", "USEcorpus", "USEcorpus") # "in", "USEcorpus"

results = process_text(filepath)

tracker.stop()


[codecarbon INFO @ 10:02:30] [setup] RAM Tracking...
[codecarbon INFO @ 10:02:30] [setup] GPU Tracking...
[codecarbon INFO @ 10:02:30] No GPU found.
[codecarbon INFO @ 10:02:30] [setup] CPU Tracking...
[codecarbon INFO @ 10:02:30] Energy consumed for RAM : 0.015286 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:02:30] Energy consumed for all CPUs : 0.004605 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:02:30] 0.019891 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:02:31] CPU Model on constant consumption mode: Intel(R) Xeon(R) Gold 6130 CPU @ 2.10GHz
[codecarbon INFO @ 10:02:31] >>> Tracker's metadata:
[codecarbon INFO @ 10:02:31]   Platform system: Linux-5.4.256.el8-x86_64-with-glibc2.35
[codecarbon INFO @ 10:02:31]   Python version: 3.10.12
[codecarbon INFO @ 10:02:31]   CodeCarbon version: 2.3.5
[codecarbon INFO @ 10:02:31]   Available RAM : 376.535 GB
[codecarbon INFO @ 10:02:31]   CPU count: 64
[codecarbon INFO @ 10:02:31]   CPU model: Intel(

../../../../cds-lang-data/USEcorpus/USEcorpus/a1


[codecarbon INFO @ 10:02:43] Energy consumed for RAM : 0.002352 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:02:43] Energy consumed for all CPUs : 0.000708 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:02:43] 0.003060 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:02:45] Energy consumed for RAM : 0.015874 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:02:45] Energy consumed for all CPUs : 0.004782 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:02:45] 0.020655 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:02:48] Energy consumed for RAM : 0.007058 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:02:48] Energy consumed for all CPUs : 0.002125 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:02:48] 0.009183 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:02:50] Energy consumed for RAM : 0.000588 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:02:50] Energy consumed f

../../../../cds-lang-data/USEcorpus/USEcorpus/a2


[codecarbon INFO @ 10:03:43] Energy consumed for RAM : 0.004704 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:03:43] Energy consumed for all CPUs : 0.001416 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:03:43] 0.006120 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:03:45] Energy consumed for RAM : 0.018226 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:03:45] Energy consumed for all CPUs : 0.005490 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:03:45] 0.023716 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:03:48] Energy consumed for RAM : 0.009411 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:03:48] Energy consumed for all CPUs : 0.002833 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:03:48] 0.012244 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:03:50] Energy consumed for RAM : 0.002941 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:03:50] Energy consumed f

../../../../cds-lang-data/USEcorpus/USEcorpus/a3


[codecarbon INFO @ 10:04:43] Energy consumed for RAM : 0.007055 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:04:43] Energy consumed for all CPUs : 0.002125 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:04:43] 0.009180 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:04:45] Energy consumed for RAM : 0.020578 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:04:45] Energy consumed for all CPUs : 0.006198 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:04:45] 0.026777 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:04:48] Energy consumed for RAM : 0.011764 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:04:48] Energy consumed for all CPUs : 0.003542 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:04:48] 0.015306 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:04:50] Energy consumed for RAM : 0.005294 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:04:50] Energy consumed f

../../../../cds-lang-data/USEcorpus/USEcorpus/a4


[codecarbon INFO @ 10:05:43] Energy consumed for RAM : 0.009407 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:05:43] Energy consumed for all CPUs : 0.002833 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:05:43] 0.012240 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:05:46] Energy consumed for RAM : 0.022932 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:05:46] Energy consumed for all CPUs : 0.006907 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:05:46] 0.029839 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:05:48] Energy consumed for RAM : 0.014132 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:05:48] Energy consumed for all CPUs : 0.004256 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:05:48] 0.018387 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:05:50] Energy consumed for RAM : 0.007646 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:05:50] Energy consumed f

../../../../cds-lang-data/USEcorpus/USEcorpus/a5


[codecarbon INFO @ 10:06:28] Energy consumed for RAM : 0.011172 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:06:28] Energy consumed for all CPUs : 0.003364 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:06:28] 0.014536 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:06:31] Energy consumed for RAM : 0.024696 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:06:31] Energy consumed for all CPUs : 0.007439 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:06:31] 0.032135 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:06:33] Energy consumed for RAM : 0.015896 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:06:33] Energy consumed for all CPUs : 0.004787 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:06:33] 0.020682 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:06:35] Energy consumed for RAM : 0.009411 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:06:35] Energy consumed f

../../../../cds-lang-data/USEcorpus/USEcorpus/b1


[codecarbon INFO @ 10:07:01] Energy consumed for RAM : 0.025872 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:01] Energy consumed for all CPUs : 0.007793 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:01] 0.033665 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:03] Energy consumed for RAM : 0.017072 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:03] Energy consumed for all CPUs : 0.005141 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:03] 0.022213 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:05] Energy consumed for RAM : 0.010587 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:05] Energy consumed for all CPUs : 0.003188 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:05] 0.013775 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:07] Energy consumed for RAM : 0.015908 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:07] Energy consumed f

../../../../cds-lang-data/USEcorpus/USEcorpus/b2


[codecarbon INFO @ 10:07:18] Energy consumed for RAM : 0.017660 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:18] Energy consumed for all CPUs : 0.005318 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:18] 0.022978 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:20] Energy consumed for RAM : 0.011176 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:20] Energy consumed for all CPUs : 0.003365 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:20] 0.014540 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:22] Energy consumed for RAM : 0.016496 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:22] Energy consumed for all CPUs : 0.004967 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:22] 0.021463 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:28] Energy consumed for RAM : 0.013525 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:28] Energy consumed f

../../../../cds-lang-data/USEcorpus/USEcorpus/b3


[codecarbon INFO @ 10:07:33] Energy consumed for RAM : 0.018248 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:33] Energy consumed for all CPUs : 0.005496 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:33] 0.023743 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:35] Energy consumed for RAM : 0.011764 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:35] Energy consumed for all CPUs : 0.003542 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:35] 0.015305 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:37] Energy consumed for RAM : 0.017084 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:37] Energy consumed for all CPUs : 0.005144 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:37] 0.022228 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:43] Energy consumed for RAM : 0.014113 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:43] Energy consumed f

../../../../cds-lang-data/USEcorpus/USEcorpus/b4


[codecarbon INFO @ 10:07:48] Energy consumed for RAM : 0.018835 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:48] Energy consumed for all CPUs : 0.005672 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:48] 0.024507 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:50] Energy consumed for RAM : 0.012352 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:50] Energy consumed for all CPUs : 0.003719 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:50] 0.016071 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:52] Energy consumed for RAM : 0.017672 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:52] Energy consumed for all CPUs : 0.005321 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:52] 0.022993 kWh of electricity used since the beginning.


../../../../cds-lang-data/USEcorpus/USEcorpus/b5


[codecarbon INFO @ 10:07:58] Energy consumed for RAM : 0.014701 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:07:58] Energy consumed for all CPUs : 0.004427 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:58] 0.019128 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:08:01] Energy consumed for RAM : 0.028224 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:01] Energy consumed for all CPUs : 0.008501 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:08:01] 0.036726 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:08:03] Energy consumed for RAM : 0.019423 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:03] Energy consumed for all CPUs : 0.005849 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:08:03] 0.025272 kWh of electricity used since the beginning.


../../../../cds-lang-data/USEcorpus/USEcorpus/b6


[codecarbon INFO @ 10:08:05] Energy consumed for RAM : 0.012940 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:05] Energy consumed for all CPUs : 0.003896 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:08:05] 0.016836 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:08:07] Energy consumed for RAM : 0.018260 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:07] Energy consumed for all CPUs : 0.005498 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:08:07] 0.023758 kWh of electricity used since the beginning.


../../../../cds-lang-data/USEcorpus/USEcorpus/b7
../../../../cds-lang-data/USEcorpus/USEcorpus/b8
../../../../cds-lang-data/USEcorpus/USEcorpus/c1


[codecarbon INFO @ 10:08:13] Energy consumed for RAM : 0.015289 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:13] Energy consumed for all CPUs : 0.004604 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:08:13] 0.019893 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:08:14] Energy consumed for RAM : 0.013296 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:14] Energy consumed for all CPUs : 0.004003 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:08:14] 0.017299 kWh of electricity used since the beginning.


0.0031210202861831327

[codecarbon INFO @ 10:08:16] Energy consumed for RAM : 0.028813 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:16] Energy consumed for all CPUs : 0.008678 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:08:16] 0.037491 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:08:18] Energy consumed for RAM : 0.020011 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:18] Energy consumed for all CPUs : 0.006026 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:08:18] 0.026037 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:08:22] Energy consumed for RAM : 0.018848 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:22] Energy consumed for all CPUs : 0.005675 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:08:22] 0.024523 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:08:28] Energy consumed for RAM : 0.015878 kWh. RAM Power : 141.20075225830078 W
[codecarbon INFO @ 10:08:28] Energy consumed f

In [6]:
table_tester = pd.read_csv("out/a1_data_test.csv")
table_tester

Unnamed: 0.1,Unnamed: 0,Filename,RelFreq NOUN,RelFreq VERB,RelFreq ADV,RelFreq ADJ,No. Unique PER,No. Unique LOC,No. Unique ORG
0,0,doc>,1524.48,1216.78,531.47,797.20,0,0,0
1,1,doc>,1161.05,1235.96,836.45,599.25,1,0,0
2,2,doc>,1477.95,1191.90,476.76,679.38,1,0,0
3,3,doc>,1092.72,1357.62,573.95,596.03,1,0,1
4,4,doc>,1314.50,1191.65,675.68,577.40,0,1,2
...,...,...,...,...,...,...,...,...,...
298,298,doc>,1576.92,1057.69,384.62,750.00,0,0,0
299,299,doc>,1333.33,1452.99,666.67,700.85,0,0,0
300,300,doc>,1206.50,1229.70,440.84,603.25,0,0,0
301,301,doc>,1448.28,1264.37,735.63,505.75,2,0,0
