In [None]:
import pandas as pd
# import sklearn.feature_extraction.text as txt
import bs4
# from sklearn.feature_extraction.text import CountVectorizer
import datetime

In [None]:
# Load the Businessballs dataset from CSV into a DataFrame and preview its first rows.
# NOTE(review): '/content/...' looks like a Colab-style absolute path — confirm it exists before running elsewhere.
file_path = '/content/businessballs_data.csv'
businessballs_data = pd.read_csv(file_path)
businessballs_data.head()

In [None]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
businessballs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     177 non-null    int64  
 1   skill          177 non-null    object 
 2   title          177 non-null    object 
 3   reading_time   177 non-null    object 
 4   rating         177 non-null    float64
 5   views          177 non-null    object 
 6   link           177 non-null    object 
 7   body           177 non-null    object 
 8   raw_body_data  177 non-null    object 
 9   metadata       177 non-null    object 
 10  tracking_log   177 non-null    object 
 11  validator_log  177 non-null    object 
dtypes: float64(1), int64(1), object(10)
memory usage: 16.7+ KB


In [None]:
# This function disaggregates the 'raw_body_data' column (HTML) of the df dataframe into individual paragraphs.
# For each paragraph, it searches for the nearest heading that precedes it.

#**************************************************************************************
# To save memory, the raw HTML content is dropped from the returned dataframe;
# only the skill, URL, title, heading and paragraph text are kept.
#**************************************************************************************

def disaggregate_content(df, tag = 'p', a_class = None):
    """Split each row's HTML body into one output row per matching element.

    For every row of ``df`` the HTML stored in ``raw_body_data`` is parsed with
    BeautifulSoup and every ``tag`` element (optionally filtered by ``a_class``)
    is extracted. Each element that contains text becomes one output row that
    carries the source row's ``skill``, ``link`` and ``title`` together with the
    text of the nearest preceding heading, as returned by ``find_heading``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``raw_body_data``, ``skill``, ``link`` and
        ``title``.
    tag : str, default 'p'
        Name of the HTML element to extract.
    a_class : optional
        When not None, it is passed as the second positional argument of
        ``soup.find_all`` to restrict the match (e.g. a CSS class name).

    Returns
    -------
    pandas.DataFrame
        Columns ``'Soft Skill Name'``, ``'URL'``, ``'Title of URL'``,
        ``'header'`` and ``'paragraph'``; one row per extracted element. The
        raw HTML is not carried over. ``'header'`` is the string ``'None'``
        when no heading precedes the element.
    """
    output_columns = ['Soft Skill Name', 'URL', 'Title of URL', 'header', 'paragraph']
    records = []

    for _, row in df.iterrows():
        html = row['raw_body_data']
        # A missing cell is read by pandas as NaN (a float), which the HTML
        # parser cannot handle; such rows have nothing to extract.
        if not isinstance(html, str):
            continue

        soup = bs4.BeautifulSoup(html, "html.parser")

        if a_class is not None:
            # Kept positional so any filter value accepted before still works.
            elements = soup.find_all(tag, a_class)
        else:
            elements = soup.find_all(tag)

        for element in elements:
            text = element.get_text()
            # Skip empty and whitespace-only elements (e.g. "<p>&nbsp;</p>"),
            # which would otherwise produce blank rows.
            if not text.strip():
                continue

            # find_heading always returns a (text, tag_name) pair; only the
            # heading text is part of the output.
            heading_text, _heading_tag = find_heading(element)

            records.append({'Soft Skill Name': row['skill'],
                            'URL': row['link'],
                            'Title of URL': row['title'],
                            'header': heading_text,
                            'paragraph': text})

    # Passing the columns explicitly keeps the schema stable even when no
    # element was extracted.
    disaggregated_df = pd.DataFrame(records, columns=output_columns)
    return disaggregated_df



# This function looks for the nearest heading that precedes the element elt passed as a parameter.
# Any heading level from h6 up to h1 counts; the closest one before elt is returned.
def find_heading(elt):
    """Return the nearest heading that comes before ``elt``.

    Walks backwards through the ``previous_element`` chain starting just
    before ``elt`` and stops at the first node whose ``name`` is one of
    ``h1`` to ``h6``.

    Returns
    -------
    tuple
        ``(heading_text, heading_tag_name)`` for the heading found, or the
        pair of strings ``('None', 'None')`` when the chain ends without one.
    """
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    node = elt.previous_element
    while node is not None:
        if node.name in heading_tags:
            return node.get_text(), node.name
        node = node.previous_element

    # Reached the start of the chain without meeting any heading.
    return 'None', 'None'

In [None]:
# Split every row's HTML into one row per <p> element, each paired with its nearest preceding heading.
businessballs_data_disagg = disaggregate_content(businessballs_data, tag='p')

In [None]:
# Save the paragraph-level table as CSV and as Excel; index=False leaves the row index out of both files.
# NOTE(review): to_excel presumably needs an Excel writer engine (e.g. openpyxl) installed — confirm in this environment.
businessballs_data_disagg.to_csv('businessballs_data_disagg.csv', index = False)
businessballs_data_disagg.to_excel('businessballs_data_disagg.xlsx', index = False)