In [3]:
import os
import re
import nltk
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## This code file mainly does two things:

1. It preprocess the raw 10Q .txt files, and returns cleaned, purified texts stored under the 'cleaned_data' folder

2. It performs text vectorization and assigns numerical values for each negative word using two negative word lists and two methods:
    
    The two negative word dictionaries are:
    
    * The traditional Harvard Psychosociological Dictionary, specifically, the Harvard-IV-4 TagNeg Dictionary, denoted as 'hiv4' in the code. 
    * The Loughran and Mcdonald Dictionay, denoted as 'lm' in the code

    The two methods are: 

    * Proportional Weighting: the proportional weights is defined as the word list counts relative to the total number of words appearing in a firm's 10-Q. 

    * Term weighting: A new method developed by Loughran and Mcdonald based on the common tf-idf approach. 
    
        The formula is : 
    
    
$$
w_{i, j}= \begin{cases}\frac{\left(1+\log \left(t f_{i, j}\right)\right)}{\left(1+\log \left(a_j\right)\right)} \log \frac{N}{d f_i} & \text { if } t f_{i, j} \geq 1 \\ 0 & \text { otherwise }\end{cases}
$$

After obtaining the weights for each file, we can calculate a negativity score for each file by summing up horizontally the weights. The final output is a dataframe called 'negativity_scores', the row indexes are the cleaned 10-Q file names, and the columns are the four combination of methods we use to gauge the negativity sentiment: 

hiv4_pw, lm_pw, hiv4_tw, lm_tw

## Preprocess raw text

This part consists of two functions:
* extract_txt(): extract and parse the valid content from a raw 10-Q .txt file
* clean_txt(): further cleans the valid content and returns the cleaned text

We loop through each folder and each raw txt file under the '10Q' folder, and store the cleaned txt files under a new folder called 'cleaned_data'. 

If you click in and check any of the cleaned_data files, you can see that they are all cleaned text and there are a total of 7320 cleaned text files

In [7]:
# helper functions 
def extract_txt(file_path):
    '''Input (str): drive path where a raw txt file is stored
        Output (str): returns valid txt content in the raw file
        Here I just read the file, and only keep the content between the first <TEXT></TEXT>'''
    
    with open(file_path,'r',encoding='utf-8') as file:
        data = file.read()
        document = re.search('<TEXT>(.*?)</TEXT>', data, re.DOTALL)

        if document:
            # Only keep the first "<TEXT></TEXT>" content 
            htm_data = document.group(1)
            # parse the htm content
            text = BeautifulSoup(htm_data, 'html.parser').get_text()
            return text 
        else:
            return "No document found"
        
def clean_txt(text):
   
     # remove all html tags but keep its content
    text = re.sub(r"<.*?>", " ", text)
    # Remove CSS
    text = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', text)
    # Remove XML and XBRL elements (assuming they are enclosed in angle brackets)
    text = re.sub(r'<[^>]*>', '', text)
    # Remove tables (assuming they are enclosed in <table>...</table> tags)
    text = re.sub(r'<table[^>]*>[\s\S]*?</table>', '', text)
    # convert to lowercase
    text = text.lower()
    # removing non-alphabetical characters, like numbers and special characters
    text = re.sub('\d','',text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # remove hyperlinks
    text = re.sub(r"https?://\S+", "", text)
    # remove repeated characters
    text = re.sub(r'(.)\1{3,}',r'\1', text)
    # remove XBRL word
    text = re.sub('xbrl', '', text)

    # Tokenize the text
    words = nltk.word_tokenize(text)
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    # Define a minimum word length (adjust as needed)
    # Example: Remove words with fewer than 3 characters
    min_word_length = 3  

    # Filter out words that meet the criteria
    filtered_words = [word for word in filtered_words if len(word) > min_word_length]
    
    # Join the filtered words back into a single string
    cleaned_text = ' '.join(filtered_words)

    return cleaned_text  

In [8]:
# tetsing 1
# raw_text = extract_txt(r'D:\\NLP-HW1\\10Q\\\\master2018QTR1_10Q\\16732-CPB-2018-03-05.txt')
# cleaned_text = clean_txt(raw_text)

# # Open the file in write mode ('w') and write the content
# with open('output_raw.txt', 'w') as file:
#     file.write(raw_text)

# with open('output_cleaned.txt', 'w') as file:
#     file.write(cleaned_text)

In [10]:
# pipeline: preprocess for all 
base_path = r"D:\NLP-HW1\10Q\\"
folders = os.listdir(base_path)

# obatin the paths for all the txt files and store them into a path_lsit 
path_list = []
file_list = []

for folder in folders:
    for filename in os.listdir(base_path+folder):
        file_list.append(filename)
        file_path = base_path + folder + '\\' + filename
        path_list.append(file_path)

# loop through path_list, preprocess text and store them in an output directory 
output_dir = r"D:\NLP-HW1\cleaned_data\\"

for i,path in enumerate(path_list):
    filename = file_list[i]
    raw_text = extract_txt(path)
    
    cleaned_text = clean_txt(raw_text)
    # store 
    with open(output_dir + filename, "w",newline='',encoding='utf-8') as output:
        output.write(str(cleaned_text))
        
    print("finished iterating at: ", i )

finished iterating at:  0
finished iterating at:  1
finished iterating at:  2
finished iterating at:  3
finished iterating at:  4
finished iterating at:  5
finished iterating at:  6
finished iterating at:  7
finished iterating at:  8
finished iterating at:  9
finished iterating at:  10
finished iterating at:  11
finished iterating at:  12
finished iterating at:  13
finished iterating at:  14
finished iterating at:  15
finished iterating at:  16
finished iterating at:  17
finished iterating at:  18
finished iterating at:  19
finished iterating at:  20
finished iterating at:  21
finished iterating at:  22
finished iterating at:  23
finished iterating at:  24
finished iterating at:  25
finished iterating at:  26
finished iterating at:  27
finished iterating at:  28
finished iterating at:  29
finished iterating at:  30
finished iterating at:  31
finished iterating at:  32
finished iterating at:  33
finished iterating at:  34
finished iterating at:  35
finished iterating at:  36
finished it

## Text Vectoriaztion

In [1]:
# import out two word lists 
import harvard_wordlist 
import lm_neg_wordlist

In [2]:
hiv4_list = harvard_wordlist.harvard_neg
lm_neg_list = lm_neg_wordlist.lm_negative

In [38]:
# Important: convert the words in the word_list to lower case 
hiv4_list = [word.lower() for word in hiv4_list]
lm_neg_list = [word.lower() for word in lm_neg_list]

Terminologies:

* Term Frequency: In document d, the frequency represents the number of instances of a given word t. 
    
    tf(t,d) = count of t in d / number of words in d

* Document Frequency: number of documents containing at least one term t

    df(t) = N(t)

* Inverse Document Frequency: the number of documents in the corpus separated by the frequency of the text

    idf(t) = log(N / df(t))

    The more common word is supposed to be considered less significant


The article uses two methods: 

1. Proportional weighting: tf(t,d) / total word count in a doc

2. Term weighting: reassign a weight to each negative word using the following function:

$$
w_{i, j}= \begin{cases}\frac{\left(1+\log \left(t f_{i, j}\right)\right)}{\left(1+\log \left(a_j\right)\right)} \log \frac{N}{d f_i} & \text { if } t f_{i, j} \geq 1 \\ 0 & \text { otherwise }\end{cases}
$$

Among which $a_j$ is the average word count in the document

In [5]:
# helper functions 

# count the number of valid documents in the 'cleaned_data' folder
def count_valid_docs(folder_path, file_extension=".txt"):
    ''' Count the number of valid documents with the specific file extension .txt
        Valid documents should have a file size greater than 0KB
        
        Args: 
            folder path (str)
            file_extension(str)
        Returns:
            int: the number of valid documensts in the folder '''
    # initialize a count for valid documents 
    N = 0
    # List files in the folder 
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path) and filename.endswith(file_extension):
            # check if the file has a size greater than 0KB
            if os.path.getsize(file_path) > 0:
                # update valid doc dount 
                N += 1
    return N 

# helper function to read file_content from file_path
def read_doc(file_path):
    '''read file content
        input: file_path
        output: str type, file_content
    '''
    with open(file_path,'r') as file:
        file_content = file.read()
    return file_content 

# Count the number of words in a doc 
def count_doc_len(file_content):
    '''Input: str file_content
        return: a number of the words in file'''
    words = file_content.split()
    return len(words)

# Count the occurence of one specific negative word in a specific doc 
def tf(file_content, word_list):
    '''count the occurence of each word in the word_lsit
        return (dict): word_counts
    '''
    word_counts = {word: 0 for word in word_list}
    # convert file_content into separate chars of words
    words = re.findall(r'\b\w+\b', file_content.lower()) 
    for word in words:
        if word in word_list:
            word_counts[word] += 1
    return word_counts

### Proportional Weighting 

In [6]:
# convert the words in the word_list to lower case 
hiv4_list = [word.lower() for word in hiv4_list]
lm_neg_list = [word.lower() for word in lm_neg_list]

In [25]:
# wrap up
folder_path = r"D:\NLP-HW1\cleaned_data"
file_extension = ".txt"
N = count_valid_docs(folder_path, file_extension)
print(f"Number of valid {file_extension} docs in the folder is: {N}")

# Initialize dataframes for word_count and proportional weighting respectively 
# term frequenct/ word count
hiv4_wc = pd.DataFrame(columns=hiv4_list)
lm_wc = pd.DataFrame(columns=lm_neg_list)
# proportional weighting 
hiv4_pw = pd.DataFrame(columns=hiv4_list)
lm_pw = pd.DataFrame(columns=lm_neg_list)

# initialize hiv4_df dict
hiv4_df_dict = {word: 0 for word in hiv4_list}
lm_df_dict = {word: 0 for word in lm_neg_list}

# loop through each file in the "cleaned_data" folder
base_path = r"D:\NLP-HW1\cleaned_data\\"
file_names = os.listdir("cleaned_data")
i = 0 
total_length = 0

for file_name in file_names:
    file_content = read_doc(base_path+file_name)
    # added 
    words = file_content.split()
    file_length = len(words)
    # update total_length
    total_length += file_length

    # update the wc (word_count) & pw (proportional weighting) dict 
    hiv4_wc_dict = tf(file_content, hiv4_list)
    hiv4_pw_dict = {key: value / file_length if file_length !=0  else 0 for key, value in hiv4_wc_dict.items()}
    hiv4_wc.loc[file_name] = hiv4_wc_dict.values()
    hiv4_pw.loc[file_name] = hiv4_pw_dict.values()

    lm_wc_dict = tf(file_content, lm_neg_list)
    lm_pw_dict = {key: value / file_length if file_length !=0  else 0 for key, value in lm_wc_dict.items()}
    lm_wc.loc[file_name] = lm_wc_dict.values()
    lm_pw.loc[file_name] = lm_pw_dict.values()

    # update the df(document frequency) dicts
    for word in words:
        if word in hiv4_list:
            hiv4_df_dict[word] += 1
        if word in lm_neg_list:
            lm_df_dict[word] += 1

    print("Done for: ", i)
    i += 1

# Average number of words in document 
a = total_length / N
print(f"Avergae number of words in document is: {a}")

Number of valid .txt docs in the folder is: 7320
Done for:  0
Done for:  1
Done for:  2
Done for:  3
Done for:  4
Done for:  5
Done for:  6
Done for:  7
Done for:  8
Done for:  9
Done for:  10
Done for:  11
Done for:  12
Done for:  13
Done for:  14
Done for:  15
Done for:  16
Done for:  17
Done for:  18
Done for:  19
Done for:  20
Done for:  21
Done for:  22
Done for:  23
Done for:  24
Done for:  25
Done for:  26
Done for:  27
Done for:  28
Done for:  29
Done for:  30
Done for:  31
Done for:  32
Done for:  33
Done for:  34
Done for:  35
Done for:  36
Done for:  37
Done for:  38
Done for:  39
Done for:  40
Done for:  41
Done for:  42
Done for:  43
Done for:  44
Done for:  45
Done for:  46
Done for:  47
Done for:  48
Done for:  49
Done for:  50
Done for:  51
Done for:  52
Done for:  53
Done for:  54
Done for:  55
Done for:  56
Done for:  57
Done for:  58
Done for:  59
Done for:  60
Done for:  61
Done for:  62
Done for:  63
Done for:  64
Done for:  65
Done for:  66
Done for:  67
Done for:

In [26]:
print(N)
print(a)

7320
15730.488524590164


In [27]:
# store the proportional weights 
hiv4_pw.to_csv("hiv4_pw.csv")
lm_pw.to_csv("lm_pw.csv")

### Term weighting


$$
w_{i, j}= \begin{cases}\frac{\left(1+\log \left(t f_{i, j}\right)\right)}{\left(1+\log \left(a_j\right)\right)} \log \frac{N}{d f_i} & \text { if } t f_{i, j} \geq 1 \\ 0 & \text { otherwise }\end{cases}
$$


In [29]:
# log(N/d_j): transform from df -> idf 
hiv4_idf_dict = {key: (1/value) * N  if value !=0 else 0 for key, value in hiv4_df_dict.items()}
lm_idf_dict = {key: (1/value) * N  if value !=0 else 0 for key, value in lm_df_dict.items()}

In [31]:
# term weighting 
# (1+log(tf_i,j)) / (1+log(a_j)) 
def transform_value(x):
    '''Compute (1+log(tf_i,j)) / (1+log(a_j)) for every element in the wc dataframes'''
    return (1 + np.log(x)) / (1 + np.log(a)) if x != 0 else 0

# Transform the hiv4_wc and lm_wc term frequency dataframe 
hiv4_tw = hiv4_wc.applymap(transform_value) * pd.Series(hiv4_idf_dict)
lm_tw = lm_wc.applymap(transform_value) * pd.Series(lm_idf_dict)

In [35]:
# store term weighting 
hiv4_tw.to_csv('hiv4_tw.csv')
lm_tw.to_csv('lm_tw.csv')

## Final output

In [40]:
# initialize a dataframe 
negativity_scores = pd.DataFrame()
negativity_scores['hiv4_pw'] = hiv4_pw.sum(axis=1)
negativity_scores['lm_pw'] = lm_pw.sum(axis=1)
negativity_scores['hiv4_tw'] = hiv4_tw.sum(axis=1)
negativity_scores['lm_tw'] = lm_tw.sum(axis=1)
negativity_scores.head()

Unnamed: 0,hiv4_pw,lm_pw,hiv4_tw,lm_tw
1000228-HSIC-2018-05-08.txt,0.022767,0.034463,153.587786,254.455412
1000228-HSIC-2018-08-06.txt,0.02522,0.037101,159.97149,267.874377
1000228-HSIC-2018-11-06.txt,0.026597,0.040956,168.452773,488.237343
1000228-HSIC-2019-05-07.txt,0.028794,0.049216,130.977812,519.392587
1000228-HSIC-2019-08-06.txt,0.030097,0.047986,129.820982,535.075698


In [37]:
negativity_scores.to_csv('negativity_scores.csv')