In [13]:
import re
from enum import Enum

'''
Used as parameters by the ExtractStationLine class
'''
class StationLine(Enum):
    """Colour-named identifiers for the train lines handled by ExtractStationLine.

    The six real lines are numbered 1 to 6 in declaration order; ``none``
    (99) is a sentinel for "no line".
    """
    # Unpacking inside an Enum body creates one member per name, in order.
    red, green, circle, purple, blue, brown = range(1, 7)
    none = 99

'''
Extracts the station line referenced in a sentence
'''
class ExtractStationLine:
    """Work out which train line a sentence or a group of stations refers to.

    Each ``*LineList`` class attribute holds the keywords (matched
    case-insensitively, as whole words) that identify one line.
    """
    redLineList = ["[nsl]","[nsl]:", "ns", "nsl", "north south", "north south line", "nsline", "ns line"]
    greenLineList = ["[ewl]","[ewl]:","ew", "ewl", "east west", "east west line", "ewline", "ew line"]
    circleLineList = ["[ccl]","[ccl]:","cc", "ccl", "circle line", "circleline", "cc line"]
    purpleLineList = ["[nel]","[nel]:","ne", "nel", "north east", "north east line", "neline", "ne line"]
    blueLineList = ["[dtl]","[dtl]:","dt", "dtl", "down town", "down town line", "dtline", "dt line"]
    brownLineList = ["[tel]","[tel]:","te", "tel", "thomson-east Coast", "thomson-east Coast line", "tecline", "tec line"]

    # check group of stations belongs to which line
    def checkStationJson(self, jsonData):
        """Return the line code that most of the given stations belong to.

        Args:
            jsonData: iterable of station dicts; each must have an ``"id"``
                string such as ``"NS17;CC15"``.

        Returns:
            One of ``"NS"``, ``"EW"``, ``"CC"``, ``"NE"``, ``"DT"``, ``"TE"``.
            Ties go to the code listed first. ``"NONE"`` is returned when no
            station id mentions any of these codes (including empty input).
        """
        lineArr = ["NS", "EW", "CC", "NE", "DT", "TE"]
        scoreArr = [0] * len(lineArr)
        for data in jsonData:
            stationId = data["id"]
            for idx, code in enumerate(lineArr):
                # A station scores at most one point per line, even if the
                # code appears several times in its id.
                if re.search(code, stationId, re.IGNORECASE):
                    scoreArr[idx] += 1
        highest = max(scoreArr)
        if highest == 0:
            # Nothing matched: do not pretend the first line won.
            return "NONE"
        # list.index returns the first maximum, i.e. ties favour earlier codes.
        return lineArr[scoreArr.index(highest)]

    def checkStationLine(self, jsonData):
        """Return the code of the first line whose keywords occur in the text.

        Args:
            jsonData: the sentence (a string) to scan.

        Returns:
            ``"NS"``, ``"EW"``, ``"CC"``, ``"NE"``, ``"DT"`` or ``"TE"``;
            ``"NONE"`` when no keyword of any line is found.
        """
        # Order is significant: the first matching line wins.
        lineOrder = (
            (StationLine.red, "NS"),
            (StationLine.green, "EW"),
            (StationLine.circle, "CC"),
            (StationLine.purple, "NE"),
            (StationLine.blue, "DT"),
            (StationLine.brown, "TE"),
        )
        for stationType, code in lineOrder:
            if self.containKey(stationType, jsonData):
                return code
        return "NONE"

    def containKey(self, stationType, jsonData):
        """Tell whether the text contains any keyword of the given line.

        Args:
            stationType: a ``StationLine`` member.
            jsonData: the sentence (a string) to scan.

        Returns:
            True if at least one keyword matches as a whole word, else False.
            A member without a keyword list (e.g. ``StationLine.none``)
            yields False.
        """
        keywordsByLine = {
            StationLine.red: self.redLineList,
            StationLine.green: self.greenLineList,
            StationLine.circle: self.circleLineList,
            StationLine.purple: self.purpleLineList,
            StationLine.blue: self.blueLineList,
            StationLine.brown: self.brownLineList,
        }
        lineList = keywordsByLine.get(stationType)
        if lineList is None:
            return False
        return any(self.findWholeWord(key)(jsonData) is not None for key in lineList)

    def findWholeWord(self, w):
        """Build a case-insensitive whole-word search function for ``w``.

        ``w`` is matched literally: it is escaped so that keywords such as
        ``"[nsl]"`` are not read as a regex character class. Lookarounds are
        used instead of ``\\b`` because ``\\b`` cannot anchor next to
        non-word characters like ``[`` or ``]``.

        Returns:
            The bound ``search`` method of the compiled pattern; group 1 of a
            resulting match is the matched keyword.
        """
        pattern = r'(?<!\w)({0})(?!\w)'.format(re.escape(w))
        return re.compile(pattern, flags=re.IGNORECASE).search

In [14]:
import import_notebook
from readWrite import ReadWrite
import re

class ExtractStation:
    """Find the stations mentioned in a sentence.

    The station list and the abbreviation table are loaded from the JSON
    files named by ``stationFile`` and ``abbFile`` when an instance is
    created.
    """
    stationFile = "station_translation.json"
    abbFile = "abb_replacement.json"
    stationJson = ""
    abb_replace = ""

    def __init__(self):
        readWrite = ReadWrite()
        self.getStationsJson()
        self.abb_replace = readWrite.readJsonFile(self.abbFile)

    # Return list of station in json format
    def getStationsJson(self):
        """Load the station list from ``stationFile``, store and return it.

        Also prints the number of stations and the keys each entry carries.
        """
        readWrite = ReadWrite()
        self.stationJson = readWrite.readJsonFile(self.stationFile)
        print("Total stations: %d" % len(self.stationJson))
        print("Keys: ch_name, id, name")
        return self.stationJson

    # Extract sentence is reference to which station and which line
    def extract(self, sentence):
        """Return the station entries whose name occurs in ``sentence``.

        Abbreviations are expanded first. The result is ordered by where each
        station name appears in the sentence, so the direction of travel
        expressed in the text is preserved.

        Args:
            sentence: the text to scan.

        Returns:
            A list of station dicts taken from ``self.stationJson``.
        """
        sentence = self.replaceAbbsWords(sentence)
        stationObjArr = []
        matchObjArr = []
        for station in self.stationJson:
            isFound, matchObj = self.containWord(station['name'], sentence)
            if isFound:
                stationObjArr.append(station)
                matchObjArr.append(matchObj)
        # preserve stations direction in sentence
        return self.sortByMatchObj(matchObjArr, stationObjArr)

    # Check if key word is found in sentence
    def containWord(self, key, sentence):
        """Search ``sentence`` for ``key`` as a whole word.

        Returns:
            A ``(found, match)`` tuple: ``found`` is a bool and ``match`` is
            the regex match object, or None when nothing was found.
        """
        result = self.findWholeWord(key)(sentence)
        return result is not None, result

    def findWholeWord(self, w):
        """Build a case-insensitive whole-word search function for ``w``.

        ``w`` is escaped so it is matched literally even if it contains regex
        metacharacters. Group 1 of a resulting match is the matched text.
        """
        pattern = r'(?<!\w)({0})(?!\w)'.format(re.escape(w))
        return re.compile(pattern, flags=re.IGNORECASE).search

    # sorting of matchObj using span, return sorted stationObj
    def sortByMatchObj(self, matchObjArr, stationObjArr):
        """Order stations by the start offset of their match.

        Args:
            matchObjArr: match objects, parallel to ``stationObjArr``.
            stationObjArr: the station entries to reorder.

        Returns:
            A new list with every station exactly once. Stations whose
            matches start at the same offset keep their input order (the
            sort is stable).
        """
        order = sorted(
            range(len(matchObjArr)),
            key=lambda idx: matchObjArr[idx].span(1)[0],
        )
        return [stationObjArr[idx] for idx in order]

    def replaceAbbsWords(self, sentence):
        """Expand abbreviations in ``sentence`` and turn ``&amp;`` into ``and``.

        Each key of ``self.abb_replace`` is substituted, case-insensitively
        and as a whole word, by its value.

        Returns:
            The rewritten sentence. If a substitution raises, the error is
            printed and the sentence as processed so far is returned, so
            callers always get the text back.
        """
        try:
            # Done once up front so it also happens with an empty table.
            sentence = sentence.replace('&amp;', 'and')
            for key in self.abb_replace:
                # NOTE(review): keys are used as regex fragments unescaped;
                # confirm whether abb_replacement.json relies on that.
                sentence = re.sub(r"\b{}\b".format(key), self.abb_replace[key], sentence, flags=re.IGNORECASE)
        except Exception as e:
            print("replaceAbbsWords Error: {}".format(e))
        return sentence

In [37]:
# Test 1 - identify the line from line keywords in a sentence.
# "ewl" is one of the green-line keywords, so this prints "EW".
extStLine = ExtractStationLine()
result = extStLine.checkStationLine("This sentence contain key words for ewl some rubbish words train station line")
print(result)

# Test 2 - identify stations from the station names in a sentence.
# "dg" is presumably expanded through abb_replacement.json (the recorded
# output shows Dhoby Ghaut) - verify against that file.
extStation = ExtractStation()
stationJson = extStation.extract("This sentence contain bishan, dg in it and it should return the correct json object")
for item in stationJson:
    print(item)

# Test 3 - work out which line the extracted group of stations belongs to.
# Left as a bare expression so the notebook displays it as the cell output.
extStLine.checkStationJson(stationJson)

EW
Total stations: 140
Keys: ch_name, id, name
{'ch_name': '碧山', 'id': 'NS17;CC15', 'name': 'Bishan'}
{'ch_name': '多美歌', 'id': 'NE6;NS24;CC1', 'name': 'Dhoby Ghaut'}


'NS'