In [147]:
import re
from enum import Enum
from operator import attrgetter

'''
Used as parameters by the ExtractStationLine class
'''
class StationLine(Enum):
    """Colour-named identifiers for the rail lines handled by ExtractStationLine.

    ExtractStationLine.containKey() takes one of these members to choose the
    keyword list to search for, and ExtractStationLine.checkStationLine()
    translates a matching member into the line code noted beside it below.
    """
    red = 1      # redLineList keywords    -> "NS"
    green = 2    # greenLineList keywords  -> "EW"
    circle = 3   # circleLineList keywords -> "CC"
    purple = 4   # purpleLineList keywords -> "NE"
    blue = 5     # blueLineList keywords   -> "DT"
    brown = 6    # brownLineList keywords  -> "TE"
    none = 99    # no keyword list is associated with this member

# model object to hold name and score
class StationModel:
    """Small mutable record pairing a name with a numeric score."""

    # Class-level defaults; __init__ always overrides both on the instance.
    stationName = ""
    score = 0

    def __init__(self, mStationName, mScore=0):
        """Store the given name and the starting score (0 unless specified)."""
        self.stationName, self.score = mStationName, mScore

    def getName(self):
        """Return the stored name."""
        return self.stationName

    def getScore(self):
        """Return the current score."""
        return self.score

    def setScore(self, mScore):
        """Overwrite the current score with ``mScore``."""
        self.score = mScore

    def addScore(self, mScore):
        """Increase the current score by ``mScore``."""
        self.score += mScore

    def getString(self):
        """Return ``"<name>: <score>"`` for quick inspection."""
        name = self.getName()
        score = self.getScore()
        return "{}: {}".format(name, score)

class SSPairArr:
    """Ordered collection of StationModel objects, one per given line name.

    The models keep the order of the names passed to the constructor; none of
    the query methods reorder the collection.
    """

    # Class-level default kept for backward compatibility; every instance
    # replaces it with its own list in __init__, so it is never shared.
    ssPairArr = []

    def __init__(self, lineArr):
        """Create one StationModel with score 0 for every name in ``lineArr``."""
        self.ssPairArr = [StationModel(line, 0) for line in lineArr]

    # find station by name
    def get(self, stationName):
        """Return the first model whose name equals ``stationName``.

        The comparison ignores case. Returns None when nothing matches.
        """
        wanted = stationName.lower()
        for pair in self.ssPairArr:
            if pair.getName().lower() == wanted:
                return pair
        return None

    # find likely station line sentence is referring to
    def getTopScore(self):
        """Return every model that holds the highest score.

        The result is a list in collection order: one element when the
        maximum is unique, several when models tie, and an empty list when
        the collection itself is empty. The collection is not modified.
        """
        if not self.ssPairArr:
            return []
        highest = self.getMax()
        return [model for model in self.ssPairArr if model.getScore() == highest]

    # return the max value in models, can contain multiple same max value
    def getMax(self):
        """Return the highest score present in the collection.

        Raises:
            ValueError: if the collection is empty.
        """
        return max(self.ssPairArr, key=attrgetter('score')).getScore()

    # sanity check on the models
    def mPrint(self):
        """Print the ``getString()`` form of every model, one per line."""
        for model in self.ssPairArr:
            print(model.getString())

    
'''
Extracts the station line referred to in a sentence
'''
class ExtractStationLine:
    """Work out which rail line a sentence or a group of stations refers to.

    Two independent strategies are offered:

    * checkStationLine() searches a sentence for line keywords
      ("ewl", "north south line", "[nel]", ...).
    * checkStationJson() inspects the ``"id"`` codes of station records
      (e.g. ``"EW2;DT32"``) and picks the line most of them belong to.
    """

    redLineList = ["[nsl]","[nsl]:", "ns", "nsl", "north south", "north south line", "nsline", "ns line"]
    greenLineList = ["[ewl]","[ewl]:","ew", "ewl", "east west", "east west line", "ewline", "ew line"]
    circleLineList = ["[ccl]","[ccl]:","cc", "ccl", "circle line", "circleline", "cc line"]
    purpleLineList = ["[nel]","[nel]:","ne", "nel", "north east", "north east line", "neline", "ne line"]
    blueLineList = ["[dtl]","[dtl]:","dt", "dtl", "down town", "down town line", "dtline", "dt line"]
    brownLineList = ["[tel]","[tel]:","te", "tel", "thomson-east Coast", "thomson-east Coast line", "tecline", "tec line"]

    # Line codes recognised in station ids. The order doubles as the final
    # tie-break priority in checkStationJson().
    _lineCodes = ("NS", "EW", "CC", "NE", "DT", "TE", "BP", "SW", "SE", "PE", "STC", "PTC")

    # One station code: alphabetic line prefix, optionally followed by the
    # station number ("DT32" -> "DT", "32"; "STC" -> "STC", no number).
    _stationCodePattern = re.compile(r"([A-Za-z]+)\s*(\d+)?")

    # check group of stations belongs to which line
    def checkStationJson(self, jsonData):
        """Return the code of the line that most of the given stations are on.

        Args:
            jsonData: iterable of station records, each a mapping with an
                ``"id"`` string holding one or more ``;``-separated station
                codes (e.g. ``"EW2;DT32"``).

        Returns:
            The line code (one of ``_lineCodes``) that occurs in the most
            records. When several lines tie, the line whose station numbers
            are closest together (smallest max - min) wins; lines without any
            numbered code cannot be measured and are skipped in that
            tie-break. Returns None when the input is empty or malformed, or
            when no known line occurs at all.
        """
        if not jsonData:
            return None

        counts = dict.fromkeys(self._lineCodes, 0)
        numbers = dict((line, []) for line in self._lineCodes)
        try:
            for data in jsonData:
                codes = self._parseStationCodes(data["id"])
                # A line is counted once per record, however many of its
                # codes the record lists.
                for line in {code for code, _ in codes}:
                    if line in counts:
                        counts[line] += 1
                for line, number in codes:
                    if number is not None and line in numbers:
                        numbers[line].append(number)
        except (KeyError, TypeError, AttributeError):
            # Records without a usable "id" string: treated as "cannot tell".
            return None

        highest = max(counts.values())
        if highest == 0:
            return None

        topLines = [line for line in self._lineCodes if counts[line] == highest]
        if len(topLines) == 1:
            return topLines[0]

        # Tie: prefer the line whose mentioned stations are closest together.
        candidates = [line for line in topLines if numbers[line]]
        if not candidates:
            return None
        # min() keeps the first of equal spans, i.e. _lineCodes order.
        return min(candidates, key=lambda line: max(numbers[line]) - min(numbers[line]))

    def _parseStationCodes(self, stationId):
        """Split an id such as "EW2;DT32" into [("EW", 2), ("DT", 32)]."""
        codes = []
        for code in stationId.split(";"):
            match = self._stationCodePattern.match(code.strip())
            if match is None:
                continue
            number = match.group(2)
            codes.append((match.group(1).upper(), int(number) if number else None))
        return codes

    def checkStationLine(self, jsonData):
        """Return the code of the first line whose keywords occur in the text.

        Args:
            jsonData: the sentence to search (a plain string, despite the
                parameter name).

        Returns:
            "NS", "EW", "CC", "NE", "DT" or "TE" (checked in that order), or
            "NONE" when no keyword is found.
        """
        lineOrder = (
            (StationLine.red, "NS"),
            (StationLine.green, "EW"),
            (StationLine.circle, "CC"),
            (StationLine.purple, "NE"),
            (StationLine.blue, "DT"),
            (StationLine.brown, "TE"),
        )
        for stationType, code in lineOrder:
            if self.containKey(stationType, jsonData):
                return code
        return "NONE"

    def containKey(self, stationType, jsonData):
        """Tell whether the text mentions any keyword of the given line.

        Args:
            stationType: a StationLine member selecting the keyword list.
            jsonData: the sentence to search (a plain string).

        Returns:
            True if at least one keyword occurs as a whole word, otherwise
            False. Members without a keyword list (e.g. StationLine.none)
            always give False.
        """
        keywordsByLine = {
            StationLine.red: self.redLineList,
            StationLine.green: self.greenLineList,
            StationLine.circle: self.circleLineList,
            StationLine.purple: self.purpleLineList,
            StationLine.blue: self.blueLineList,
            StationLine.brown: self.brownLineList,
        }
        lineList = keywordsByLine.get(stationType, ())
        return any(self.findWholeWord(key)(jsonData) is not None for key in lineList)

    def findWholeWord(self, w):
        """Return a case-insensitive whole-word search function for ``w``.

        ``w`` is matched literally: it is escaped so that keywords such as
        "[nsl]" are not read as regex character classes. Lookarounds are used
        instead of ``\\b`` so that keywords starting or ending with
        punctuation can still be anchored. The returned callable takes a
        string and returns a match object (group 1 is the matched text) or
        None.
        """
        pattern = r'(?<!\w)({0})(?!\w)'.format(re.escape(w))
        return re.compile(pattern, flags=re.IGNORECASE).search

In [148]:
import import_notebook
from readWrite import ReadWrite
import re

class ExtractStation:
    """Find the stations mentioned in a sentence.

    Station records are loaded from ``stationFile`` and an abbreviation
    table from ``abbFile`` via ReadWrite.readJsonFile(). Each station record
    is used as a mapping with at least a ``'name'`` key.
    """

    stationFile = "station_translation.json"
    abbFile = "abb_replacement.json"
    # Placeholders until __init__ loads the real data.
    stationJson = ""
    abb_replace = ""

    def __init__(self):
        """Load the station list and the abbreviation table from disk."""
        readWrite = ReadWrite()
        self.getStationsJson()
        self.abb_replace = readWrite.readJsonFile(self.abbFile)

    # Return list of station in json format
    def getStationsJson(self):
        """(Re)load ``stationFile``, print a short summary and return the data."""
        readWrite = ReadWrite()
        self.stationJson = readWrite.readJsonFile(self.stationFile)
        print("Total stations: %d" % len(self.stationJson))
        print("Keys: ch_name, id, name")
        return self.stationJson

    # Extract sentence is reference to which station and which line
    def extract(self, sentence):
        """Return the station records whose name occurs in ``sentence``.

        Abbreviations are expanded first (see replaceAbbsWords). The records
        are returned in the order their names appear in the sentence, so the
        direction of travel expressed by the sentence is preserved.
        """
        sentence = self.replaceAbbsWords(sentence)
        stationObjArr = []
        matchObjArr = []
        for station in self.stationJson:
            isFound, matchObj = self.containWord(station['name'], sentence)
            if isFound:
                stationObjArr.append(station)
                matchObjArr.append(matchObj)
        # preserve stations direction in sentence
        return self.sortByMatchObj(matchObjArr, stationObjArr)

    # Check if key word is found in sentence
    def containWord(self, key, sentence):
        """Search ``sentence`` for ``key`` as a whole word, ignoring case.

        Returns:
            ``(found, match)`` where ``match`` is the match object, or None
            when ``found`` is False.
        """
        result = self.findWholeWord(key)(sentence)
        return result is not None, result

    def findWholeWord(self, w):
        """Return a case-insensitive whole-word search function for ``w``.

        ``w`` is matched literally (regex metacharacters are escaped).
        Group 1 of a resulting match is the matched text.
        """
        pattern = r'(?<!\w)({0})(?!\w)'.format(re.escape(w))
        return re.compile(pattern, flags=re.IGNORECASE).search

    # sorting of matchObj using span, return sorted stationObj
    def sortByMatchObj(self, matchObjArr, stationObjArr):
        """Return ``stationObjArr`` ordered by where each match starts.

        ``matchObjArr[i]`` must be the match belonging to
        ``stationObjArr[i]``. Every station appears exactly once in the
        result; stations whose matches start at the same offset keep their
        relative input order (the sort is stable).
        """
        paired = sorted(zip(matchObjArr, stationObjArr),
                        key=lambda pair: pair[0].span(1)[0])
        return [station for _, station in paired]

    def replaceAbbsWords(self, sentence):
        """Expand abbreviations and the ``&amp;`` entity in ``sentence``.

        Every key of ``abb_replace`` is substituted, as a whole word and
        ignoring case, by its mapped value. Keys are inserted into the regex
        as-is, so they are treated as regex fragments.
        NOTE(review): keys are not escaped because the table may rely on
        regex syntax - confirm against abb_replacement.json.

        Returns:
            The rewritten sentence. If a substitution fails, the error is
            printed and the sentence is returned unchanged, so callers can
            carry on without abbreviation expansion.
        """
        try:
            # Normalise the HTML entity once, up front, so it is handled
            # even when the abbreviation table is empty.
            result = sentence.replace('&amp;', 'and')
            for key in self.abb_replace:
                result = re.sub(r"\b{}\b".format(key), self.abb_replace[key], result, flags=re.IGNORECASE)
            return result
        except (re.error, TypeError, AttributeError) as e:
            print("replaceAbbsWords Error: {}".format(e))
            return sentence

In [151]:
# Manual smoke test of both extractor classes. ExtractStation() loads its
# JSON files through ReadWrite and prints a summary, so those files must be
# readable when this runs.

# Testing - identifying by type of line as keywords
# The sentence contains the keyword "ewl"; the recorded output is "EW".
extStLine = ExtractStationLine()
result = extStLine.checkStationLine("This sentence contain key words for ewl some rubbish words train station line")
print(result)

# Testing - identifying station names as keywords
# Prints every station record whose name occurs in the sentence.
extStation = ExtractStation()
stationJson = extStation.extract("[nel] This sentence contain Mattar, tampines, upper changi in it and it should return the correct json object")
for item in stationJson:
    print(item)

# Testing - checking which group of stations belongs to which line
# Bare expression: the value is only shown when run as the last statement of
# a notebook cell (recorded result: 'DT').
extStLine.checkStationJson(stationJson)

EW
Total stations: 157
Keys: ch_name, id, name
{'id': 'DT25', 'name': 'Mattar', 'ch_name': '玛达'}
{'id': 'EW2;DT32', 'name': 'Tampines', 'ch_name': '淡滨尼'}
{'id': 'DT34', 'name': 'Upper Changi', 'ch_name': '樟宜上段'}


'DT'