# FIT5196 Assessment 3
# Task 1 Reconstruct the Original Meeting Transcripts
#### Student Name: Syed Ali Alim Rizvi
#### Student ID: 28984773

Date: 3/06/2018

Version: 3.0

Environment: Python 3.6.0 and Anaconda 4.3.0 (64-bit)

Libraries used:
* from bs4 BeautifulSoup
* re 
* os
* pandas


## 1. Introduction
This assignment comprises the recreation of meeting transcripts from XML files.

Tasks:
1. Importing libraries
2. Reading data in
3. Recreation of files

More details for each task will be given in the following sections.

## 2. Imports

In [1]:
from bs4 import BeautifulSoup as bsoup
import re
import os
import pandas as pd

## 3. Functions

#### Parsing Topic and creating tuples of Information

In [2]:
def parse_topic(file):
    
    #create soup
    Topic = bsoup(file, 'lxml',from_encoding="ISO-8859-1")
            
    root_topics= [] #list of root toppics
    parent_dict = {} #identify parent root topic of child 
    #find all root topics; recursive is false because we dont want children
    for i in Topic.find('nite:root').find_all('topic', recursive=False):
        root_topic = re.search(r'(?<=\.)\d+', i['nite:id']).group(0)
        root_topics.append(root_topic)
        #check children of root; recursive is true because we want all children no matter how deep
        for j in i.find_all('topic'):
            child_topic = re.search(r'(?<=\.)\d+', j['nite:id']).group(0)
            parent_dict[child_topic] = root_topic
        
    #convert topics into integers
    root_topics = [int(x) for x in root_topics]

    #creating tuples for topics
    tops = []

    #create tuples
    for topic in Topic.find_all('topic'):
        
        for child in topic.find_all(re.compile('child')):

            #required fields
            topic = re.search(r'(?<=\.)\d+', child.parent['nite:id']).group(0)
            group = re.search(r'\.(\w)\.',child['href']).group(1)
            word_lims = re.findall(r'(?<=words)\d+',child['href'])

            #required tuple
            if len(word_lims) == 2:
                t = tuple([int(topic), group, int(word_lims[0]), int(word_lims[1])])
            elif len(word_lims) == 1:
                t = tuple([int(topic), group, int(word_lims[0]), int(word_lims[0])])
            else:
                print('issue')

            tops.append(t)
        
    #return list of tuples, the list of root topics, dictionary of subtopics and their root parents
    return (tops, root_topics, parent_dict)

#### Parsing Segments and creating tuples of Information

In [3]:
#fucntion to parse a segment
def parse_seg(file):
    
    #create soup
    Segment = bsoup(file, 'lxml')
    
    #list to store the segments tuples
    segs=[]

    #create tuples
    for seg in Segment.find_all(re.compile(r'child')):

        #taking out group and word lims
        group = re.search(r'\.(\w)\.',seg['href']).group(1)
        word_lims = re.findall(r'(?<=words)\d+',seg['href'])

        #required tuple
        if len(word_lims) == 2:
            t = tuple([group, int(word_lims[0]), int(word_lims[1])])
        elif len(word_lims) == 1:
            t = tuple([group, int(word_lims[0]), int(word_lims[0])])
        else:
            print('issue')
        
        #append tuple
        segs.append(t)
    
    #reutrn list of tuples
    return segs

#### Parsing Words and creating tuples of Information

In [4]:
#fucntion to parse a words
def parse_words(file):
    
    #create soup
    Word = bsoup(file, 'lxml')
    
    #list to store the segments tuples
    words=[]

    #create tuples
    for wor in Word.find(r'nite:root').find_all():
#     for wor in Word.find_all(re.compile(r'w')):
#     for wor in Word.find_all(re.compile(r'w|vocal.*')):
        #taking out group and word Ind and word    
        group = re.search(r'\.(\w)\.',wor['nite:id']).group(1)
        try:
            word_ind = re.findall(r'.*[^\d]+(\d+)', wor['nite:id'])[0]
        except IndexError:
            print(wor)
            print(re.findall(r'(?<=words)\d+',wor['nite:id']))
            raise
        word_val = wor.string if wor.string!=None else ''

        #required tuple
        t = tuple([group, int(word_ind), word_val])

        words.append(t)
    
    #reutrn list of tuples
    return words

#### Function to create text output of the Topic

In [5]:
#function to create topic
def topic_to_text(root_topics, parent_dict, tdf, sdf, wdf):
    
    ##################################################################################################################
    # for the segments data frame add a new column with the segment text
    ##################################################################################################################
    
    # Loop over the segment data base and create text for each segment and add it to the datafrmae
    for i in range(len(sdf)):

        #identify variables
        seg = sdf.iloc[i,0] #which segment it is
        start=sdf.iloc[i,1] #what is the start word of the segment
        end = sdf.iloc[i,2] #what is the end word of the segment

        #create segment
        #create a list of words from the words df that are of the same segment and are between start and end of segment
        wl = wdf[(wdf.segment == seg) & (wdf.ind >= start) & (wdf.ind <= end)].iloc[:,2].tolist()
        #join the words with spaces to create the text for the segment
        wstr = ' '.join(wl)

        #add the text of the segment to the dataframe column msg
        sdf.loc[i,'msg'] = wstr
        
    
    ##################################################################################################################
    # for the Topics data frame add a new column with the text for each subtopic
    ##################################################################################################################
    
    #topics
    for i in range(len(tdf)):

        #identify variables
        seg = tdf.iloc[i,1] # segment of the subtopic selected
        start = tdf.iloc[i,2] #start word of the subtopic  selected
        end = tdf.iloc[i,3] #end word of the subtop selected

        #create list of segment texts
        not_needed = (sdf.stop < start) | (sdf.start > end) #identify the segments that would NOT be needed for this subtopic
        sl = sdf[(sdf.segment == seg) & ~not_needed].iloc[:,3].tolist() #create a list of segments in the topic
        
        #varible for segment and topic
        seg_start = sdf[(sdf.segment == seg) & ~not_needed].iloc[0,1] # what is the starting word of the group of segments
        seg_end = sdf[(sdf.segment == seg) & ~not_needed].iloc[-1,2] #what is the ending word of the group of segments
        start_diff = int(start-seg_start) #calculate the difference between the segment and topic start words
        end_diff = int(seg_end - end) # calculate the difference between the segment and topic end words
        
        #join the segment texts accordingly
        #if topic starts after a segment and ends before a segment aswell (they could be different or same segments)
        if (end_diff>0) & (start_diff>0):
#             print(end_diff, 'top index', i, '--both')
            sl = ' \n'.join(sl) #sl is a list of segments so we need to join them before indexing (with \n and a space)
            sl = sl.split(' ') #then split them into words by white space (since \n was with a space so its okay)
            sl = sl[start_diff:-end_diff] #select the required words
            sl = ' '.join(sl) #join them with spaces
            sstr = sl
        #if the topic ends before a segment
        elif end_diff>0:
            sl = ' \n'.join(sl) #sl is a list of segments so we need to join them before indexing (with \n and a space)
            sl = sl.split(' ') #then split them into words by white space (since \n was with a space so its okay)
            sl = sl[:-end_diff] #select the required words
            sl = ' '.join(sl) #join them with spaces
            sstr = sl
        #if a topic starts after a segment 
        elif start_diff>0:
            sl = ' \n'.join(sl) #sl is a list of segments so we need to join them before indexing (with \n and a space)
            sl = sl.split(' ') #then split them into words by white space (since \n was with a space so its okay)
            sl = sl[start_diff:] #select the required words
            sl = ' '.join(sl) #join them with spaces
            sstr = sl
        elif start_diff<0:
            print('seg more than topic error')
            break
        else:
            sstr = '\n'.join(sl)

        #enter words for that sub_topic in dataframe
        tdf.loc[i,'msg'] = sstr
        
    ##################################################################################################################
    # combine the text for each sub_topic to create a complete text file
    ##################################################################################################################
    
    #joining topics
    tdf = tdf[~(tdf.msg == '')]
    #converting subtopics to root topics
    tdf.loc[~tdf.topic.isin(root_topics), 'topic'] = tdf.loc[~tdf.topic.isin(root_topics), 'topic'].apply(lambda x: int(parent_dict[str(x)]))
    
    #creating text variable
    text = ''
    #grouping all subtopics and segments of each root topic
    sub_topic_text = tdf.groupby(by='topic', sort=False)['msg'].agg(lambda x: '\n'.join(x))
    #concatinating all the root topic texts with 10 asterics
    for msg in sub_topic_text:
        text = text + msg + '\n' + '**********' + '\n'
    #remove any multiple adjacent line changes due to segment being empty
    text = re.subn(r'[^\S\n]{2,}', ' ', text)[0] #remove double or more spaces except for \n
    text = re.subn('\n\s+', '\n', text)[0] #remove any white space after line change
    text = re.subn('\s+\n', '\n', text)[0] #remove any white space before line change
    #test = re.subn(' {2,}', ' ', text)[0] #remove double or more spaces
    text = re.subn('\n{2,}', '\n', text)[0] #remove double or more line changes
    text = re.subn(r'[^\S\n]{2,}', ' ', text)[0] #remove double or more spaces AGAIN
    text = re.subn(r'\n(?=[^\*])', '\n ', text)[0] #add spaces for the start of the segments
    text = ' ' + text #add space for the first segment
    text = text.rstrip()
    
    
    #return the text of the topic
    return text

## 4. Files Creation

In [6]:
#list to store tuples

#folder to check
xml_file_path = './topics'

#all topic files
files = os.listdir(xml_file_path)

#for each topic file 
for xfile in files:
    
#     if xfile != 'ES2002d.topic.xml':
#         continue
    
    ###################################################################################################################
    # For that topic now create sub_Topics
    ###################################################################################################################
    
    #identify topic id (this will be used to get the required segments and words)
    topic_id = re.search(r'[^.]*(?=\.)',xfile).group(0)
    
    #folder to check
    xml_file_path = './topics'
    
    #create the path to file
    xfile = os.path.join(xml_file_path, xfile)
    
    #if the file is a correct file and is .xml
    if os.path.isfile(xfile) and xfile.endswith('.xml'): 
        #create the tuples
        tops, root_topics, parent_dict = parse_topic(open(xfile))
        
        
    ###################################################################################################################
    # For that topic now create sugments
    ###################################################################################################################
        
    #list to store tuples
    segs = []
    #folder to check
    xml_file_path = './segments'

    #all segment files
    files = os.listdir(xml_file_path)

    #filter segments files by topic
    regex = re.compile(topic_id)
    selected_files = filter(regex.search, files)


    #for each segment file of that topic
    for xfile in selected_files:

        #create the path to file
        xfile = os.path.join(xml_file_path, xfile)


        #if the file is a correct file and is .xml
        if os.path.isfile(xfile) and xfile.endswith('.xml'): 

            #create the tuples
            segment_part = parse_seg(open(xfile))
            segs = segs + segment_part
            
    ###################################################################################################################
    # For that topic now create words
    ###################################################################################################################
    
    #list to store tuples
    wors = []
    #folder to check
    xml_file_path = './words'

    #all segment files
    files = os.listdir(xml_file_path)


    #filter word files by topic
    regex = re.compile(topic_id)
    selected_files = filter(regex.search, files)

    #for each word file of that topic
    for xfile in selected_files:

        #create the path to file
        xfile = os.path.join(xml_file_path, xfile)

        #if the file is a correct file and is .xml
        if os.path.isfile(xfile) and xfile.endswith('.xml'): 

            #create the tuples
            word_part = parse_words(open(xfile))
            wors = wors + word_part
       
    ###################################################################################################################
    # once the tuples for all information from the xml is now taken, create a dataframe from tuples
    ###################################################################################################################     
    
    #create dataframes
    #create dataframe for words
    wdf = pd.DataFrame(wors, columns=['segment', 'ind', 'value']) 
    
    #create dataframe for topics
    tdf = pd.DataFrame(tops, columns=['topic', 'segment', 'start', 'stop'])
    #since we used regex some subtopics repeat hence delete the duplicates
    tdf.drop_duplicates(keep='first', inplace=True)
    #reset the index after duplicate deletion
    tdf.reset_index(drop=True, inplace=True)
    
    #create data frame for segments
    sdf = pd.DataFrame(segs, columns=['segment', 'start', 'stop'])
    
    #code checking purposes
#     print(topic_id)
#     print(len(tdf),':tdf')
#     print(len(sdf),':sdf')
#     print(len(wdf),':wdf')
#     print(tops[0:5])
#     print(tdf.head())
#     print(sdf[sdf.segment=='D'].head())
#     print('---end---')
        
    ###################################################################################################################
    # Create the text representation of topic and save in file
    ################################################################################################################### 
    
    Text = topic_to_text(root_topics, parent_dict, tdf, sdf, wdf)   
    
    # Open a file to store output in for the topic
    fi = open('./txt_files/'+ topic_id + ".txt", "w")
    fi.write(Text)

    # Close opend file
    fi.close()
    print(topic_id + ' --created')
    
    #empty the topic id
    topic_id = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


ES2002a --created
ES2002b --created
ES2002c --created
ES2002d --created
ES2003a --created
ES2003b --created
ES2003c --created
ES2003d --created
ES2004a --created
ES2004b --created
ES2004c --created
ES2004d --created
ES2005a --created
ES2005b --created
ES2005c --created
ES2005d --created
ES2006a --created
ES2006b --created
ES2006d --created
ES2007a --created
ES2007b --created
ES2007c --created
ES2007d --created
ES2008a --created
ES2008b --created
ES2008c --created
ES2008d --created
ES2009a --created
ES2009b --created
ES2009c --created
ES2009d --created
ES2010a --created
ES2010b --created
ES2010c --created
ES2010d --created
ES2011a --created
ES2011b --created
ES2011c --created
ES2011d --created
ES2012a --created
ES2012b --created
ES2012c --created
ES2012d --created
ES2013a --created
ES2013b --created
ES2013c --created
ES2013d --created
ES2014a --created
ES2014b --created
ES2014c --created
ES2014d --created
ES2015a --created
ES2015d --created
ES2016a --created
ES2016b --created
ES2016c --

## 5. Summary

- Since there are alot of input and output operations needed as well as alot of information that needed to be stored, the code made use of pandas data frames. 
- However, the optimality of the code could have been increase if for example dictionaries were used. In this case, due to the shortage of time, pandas was used. 