In [6]:
# Import functions here, remember to add to requirements.txt if a package needs to be install via pip
import numpy as np
import datetime
import MySQLdb
import sys

from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense


# Production MySql settings
MYSQL_CREDS = {
    "host": "10.24.14.51",
    "port": 3306,
    "user": "RecruitSuggester",
    "passwd": "HPwtdJlaLq2whEH9",
    "db": "DeepLearning",
}

# Uncommit the 2 lines below for remote testing.
MYSQL_CREDS["host"] = "198.154.109.168" # Test IP - uncomment if testing from home
MYSQL_CREDS["port"] = 33306 # Test Port - uncomment if testing from home

MASTER_DATA_QUERY = """
SELECT
    Handle, Moniker, ForumID, SpectrumID, Country, State,
    Fluency0, Fluency1, Fluency2, Fluency3, Fluency4, Enlisted,
    ForumLastActive, ChatLastActive, CustomAvatar, InviteSent, Outcome
FROM
    MasterData
"""

COUNTRY_QUERY = "SELECT CountryID, CountryName from Countries"
LANGUAGE_QUERY = "SELECT LanguageID, LanguageName from Languages"


# # Ho0ber log function, added time stamps to print
def log(message):
    print("{}::  {}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), message))
    sys.stdout.flush()

class DeepLearning(object):
    def __init__(self):
        self.mysql_conn = MySQLdb.connect(**MYSQL_CREDS)
        self.languages = {}
        self.countries = {}
        self.pull_languages_and_countries()

    def pull_languages_and_countries(self):
        """
        Cache Languages and Countries tables into local dicts to save MySQL hits for
        known values. Any new values will need to be added to these when discovered.
        """
        log("Caching language and country data locally...")
        cursor = self.mysql_conn.cursor()

        cursor.execute(COUNTRY_QUERY)
        for cid,country in cursor.fetchall():
            self.countries[country] = cid

        cursor.execute(LANGUAGE_QUERY)
        for lid,language in cursor.fetchall():
            self.languages[language] = lid

    def get_or_create_country(self, country):
        # TODO - make this actually:
        # - Add new countries to MySQL
        # - Get the ID from the newly created country row
        # - Add row to local cache dict
        # - Return the new ID
        return self.countries.get(country, 0) if country else None

    def get_or_create_language(self, language):
        # TODO - make this actually:
        # - Add new languages to MySQL
        # - Get the ID from the newly created language row
        # - Add row to local cache dict
        # - Return the new ID
        return self.languages.get(language, 0) if language else None

    def clean(self, row):
        """
        Strip off the first two columns and numericize country and fluency fields.
        """
        row = list(row) # Make the row mutable by changing to a list

        # Change country into CountryID
        row[4] = self.get_or_create_country(row[4])

        # Change all fluency fields into LanguageIDs
        for col in range(6,11):
            row[col] = self.get_or_create_language(row[col])

        # Return the resulting row, but strip off handle and moniker
        return [col if col != '' else None for col in row[2:]]

    def pull_data(self):
        """
        This function pulls MasterData from MySQL and produces two numpy 2d arrays:
        1) sampleset - A numericized version of most columns
        2) samplestringset - Only handle and moniker as strings

        This can probably be changed to just one 2d array, but this was easier to
        interface with the train code as written.
        """
        cursor = self.mysql_conn.cursor()
        numrows = cursor.execute(MASTER_DATA_QUERY)
        results = cursor.fetchall()

        # Pull out just handle and moniker for samplestringset
        names = [[r[0], r[1]] for r in results]

        # Strip off the first two columns and numericize country and fluency fields
        numbers = [self.clean(row) for row in results]

        # I went for np.array directly rather than fromiter because fromiter was finicky
        # We might need to change that, but I'm uncertain
        sampleset = np.array(numbers)
        samplestringset = np.array(names)

        return sampleset, samplestringset

    def train(self, sampleset, samplestringset):
        """
        This is TemptedSaint's code, nearly verbatim. I can't comment on it
        other than to note that I changed a bunch of 16s to 15s after we cut
        out our internal identifier. I wasn't certain if any other numbers needed
        to be shifted as a result.

        I know I borked something, as the accuracy is 0% when I run this.
        """

        sampleset[sampleset == None] = 0
        trainset = np.empty((0))

        #removing of the 5 and converting 2 to 1
        #changing 1 to 0, 3 to 1 and 4 to 2
        #changing names to comparison values

        rowcount,colcount = sampleset.shape
        rawcount,x = sampleset.shape
        rowcount -=1
        colcount -=1
        while (rowcount > -1):
            handle = samplestringset[rowcount,0]
            moniker = samplestringset[rowcount,1]
            hanlen = len(handle)
            monlen = len(moniker)
            tempset = np.empty((0))
            # charecter count
            charcnt = int(0)
            # charecter comparison count
            compcnt = int(0)
            # total comparison float
            strcomp = float(0)
            if hanlen >= monlen:
                while (charcnt < monlen):
                    if (handle[charcnt] == moniker[charcnt]):
                        compcnt += 1
                    charcnt += 1
                strcomp = compcnt / charcnt
            else:
                while (charcnt < hanlen):
                    if handle[charcnt] == moniker[charcnt]:
                        compcnt += 1
                    charcnt += 1
                strcomp = compcnt/charcnt
            a = np.empty((0))
            a = np.append(a, strcomp)
            #this will create the entire new array with the strings accounted for in comparison with recruiting value at the end
            tempset = np.empty((0))
            tempset = np.append(tempset, values=sampleset[rowcount, :colcount])
            tempset = np.append(tempset, values=[strcomp])
            tempset = np.append(tempset, values=sampleset[rowcount, colcount])
            if ((rowcount+1) == rawcount):
                trainset = tempset
            else:
                trainset = np.vstack([trainset, tempset])
            rowcount -= 1

        log(trainset.shape)

        # converting the data set so that it can be used for training
        colcount += 1
        traindata,checkdata = trainset[:,:colcount],trainset[:,colcount]

        # load neural net
        model =load_model('adi.hd5')
        
        rowcount=checkdata.shape
        results = np.zeros((rowcount,2))
        prdctdata= model.predict(traindata)
        np.concatenate((prdctdata,checkdata), axis=0, out=results)
        np.savetxt('results.txt',results, delimiter=' ', newline='\n', header='', footer='', comments='# ', encoding=None)
        print (ocount)
        

    def run(self):
        """
        Main entry-point for the class. pull_data into numpy arrays, then pass them off to train
        """
        log("Pulling MasterData from MySQL...")
        sampleset, samplestringset = self.pull_data()

        log("Training model...")
        self.train(sampleset, samplestringset)

        log("Done.")


if __name__ == "__main__":
    DeepLearning().run()


2018-05-04 22:41:21::  Caching language and country data locally...
2018-05-04 22:41:21::  Pulling MasterData from MySQL...
2018-05-04 22:41:25::  Training model...
2018-05-04 22:49:17::  (74102, 16)


TypeError: 'tuple' object cannot be interpreted as an integer