In [2]:
## Importing necessary libraries and modules.
import warnings
import numpy as np
import pandas as pd
from pprint import pprint
from collections import OrderedDict
from InputData import loadDataFrameList

warnings.filterwarnings('ignore')

In [3]:
## Loading the list of dataframes returned by loadDataFrameList (imported from InputData above).
## NOTE(review): the next cell treats element 0 as season 2005-2006, so this is presumably one dataframe per season -- confirm.
DataFrames = loadDataFrameList()

In [4]:
## Isolating a single team for a season to test the created functions.

## Season 2005-2006 is taken from the first slot of the list.
DataFrame = DataFrames[0]

## Keep every fixture in which Manchester United took part, at home or away.
mufcData = DataFrame[DataFrame[['HomeTeam', 'AwayTeam']].eq('Man United').any(axis = 1)]

In [5]:
'''-----------------------------------     Adding Goal Difference as a Feature  --------------------------------------------'''

## Creating a function that computes the columns "HTGD" ( Home Team Goal Difference ) and "ATGD" ( Away Team Goal Difference ).
def computeTGD(DataFrame) :
    """Add pre-match goal-difference features to a season's match table, in place.

    For every team that appears in ``HomeTeam`` the team's matches are walked
    in row order and its per-match goal difference is accumulated (``MHTGD``
    when the team was at home, ``MATGD`` when it was away).  The running total
    *before* each match, divided by 100, is stored in ``HTGD`` for home rows
    and in ``ATGD`` for away rows.  ``GD`` is then ``HTGD - ATGD``.

    The number of teams and the number of matches per team are taken from the
    data instead of being fixed at 20 and 38.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Match table with the columns ``HomeTeam``, ``AwayTeam``, ``MHTGD`` and
        ``MATGD``.  Rows are presumably in chronological order -- the running
        total simply follows row order.

    Returns
    -------
    None
        The columns ``HTGD``, ``ATGD`` and ``GD`` are written to ``DataFrame``.
    """

    ## Initialising the values contained in the columns "HTGD" and "ATGD" . (Goal Difference)
    DataFrame['HTGD'] = np.nan
    DataFrame['ATGD'] = np.nan

    homeMatchGD = DataFrame['MHTGD'].values
    awayMatchGD = DataFrame['MATGD'].values

    for team in DataFrame['HomeTeam'].unique():

        ## Row masks for the matches of the team under observation.
        homeMask = (DataFrame['HomeTeam'] == team).values
        awayMask = (DataFrame['AwayTeam'] == team).values & ~homeMask
        playedMask = homeMask | awayMask

        ## True where the team was the home side, restricted to the matches it played.
        wasHome = homeMask[playedMask]

        ## "Matchwise Goal Difference" from the point of view of the team, in row order.
        matchGD = np.where(homeMask, homeMatchGD, awayMatchGD)[playedMask].astype(float)

        ## Goal difference accumulated *before* each match: 0 for the first one,
        ## then the running sum of all earlier matches.
        priorGD = np.concatenate(([0.0], np.cumsum(matchGD)[:-1]))

        ## Normalise. Dividing by a float keeps true division under Python 2 as well.
        priorGD = priorGD / 100.0

        ## Writing through .loc avoids chained assignment, which may silently hit a copy.
        if homeMask.any():
            DataFrame.loc[homeMask, 'HTGD'] = priorGD[wasHome]
        if awayMask.any():
            DataFrame.loc[awayMask, 'ATGD'] = priorGD[~wasHome]

    ## Filling in the column for "GD".
    DataFrame['GD'] = DataFrame['HTGD'] - DataFrame['ATGD']

In [6]:
'''-----------------------------------     Adding KPP as a Feature  --------------------------------------------'''

## Creating a function which computes the KPP (K-Past Performance) feature for Goals, Corners and Shots on Target.

def computeKPP(DataFrame, slidingWindowParameter):
    """Add K-Past-Performance (KPP) features for goals, corners and shots on target, in place.

    For every team that appears in ``HomeTeam`` and for each of its matches,
    the mean of the team's own statistic over its previous ``k`` matches is
    computed.  The first ``k`` matches of a team have no full window and stay
    NaN.  Home rows receive ``HGKPP``/``HCKPP``/``HSTKPP``, away rows receive
    ``AGKPP``/``ACKPP``/``ASTKPP``.  The differences home minus away are stored
    in ``GKPP``, ``CKPP`` and ``STKPP``.

    The number of teams and of matches per team are taken from the data
    instead of being fixed at 20 and 38.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Match table with ``HomeTeam``, ``AwayTeam``, ``FTHG``, ``FTAG``,
        ``HC``, ``AC``, ``HST`` and ``AST``.
    slidingWindowParameter : int
        Window length ``k`` (number of previous matches averaged), at least 1.

    Returns
    -------
    None
        The feature columns are written to ``DataFrame``.

    Raises
    ------
    ValueError
        If ``slidingWindowParameter`` is smaller than 1.
    """

    ## Set slidingWindowParameter to k.
    k = slidingWindowParameter
    if k < 1:
        raise ValueError('slidingWindowParameter must be at least 1, got %r' % (k,))

    ## (home statistic, away statistic, home feature, away feature)
    statColumns = (
        ('FTHG', 'FTAG', 'HGKPP', 'AGKPP'),
        ('HC', 'AC', 'HCKPP', 'ACKPP'),
        ('HST', 'AST', 'HSTKPP', 'ASTKPP'),
    )

    ## Initialising the KPP feature columns.
    for _, _, homeFeature, awayFeature in statColumns:
        DataFrame[homeFeature] = np.nan
        DataFrame[awayFeature] = np.nan

    for z, team in enumerate(DataFrame['HomeTeam'].unique()):

        ## Row masks for the matches of the team under observation.
        homeMask = (DataFrame['HomeTeam'] == team).values
        awayMask = (DataFrame['AwayTeam'] == team).values & ~homeMask
        playedMask = homeMask | awayMask

        ## True where the team was the home side, restricted to the matches it played.
        wasHome = homeMask[playedMask]

        for homeStat, awayStat, homeFeature, awayFeature in statColumns:

            ## The team's own statistic match by match, in row order.
            perMatch = np.where(homeMask, DataFrame[homeStat].values, DataFrame[awayStat].values)
            perMatch = perMatch[playedMask].astype(float).tolist()

            ## Mean over the k matches preceding match j; undefined for the first k matches.
            kpp = np.array([
                np.nan if j < k else sum(perMatch[j - k : j]) / float(k)
                for j in range(len(perMatch))
            ])

            ## Writing through .loc avoids chained assignment, which may silently hit a copy.
            if homeMask.any():
                DataFrame.loc[homeMask, homeFeature] = kpp[wasHome]
            if awayMask.any():
                DataFrame.loc[awayMask, awayFeature] = kpp[~wasHome]

        ## Same text as the original progress line; this form runs on Python 2 and 3.
        print('Computing KPP  %s %s' % (team, z))

    ## Filling in the columns for "GKPP, CKPP, STKPP".
    DataFrame['GKPP'] = DataFrame['HGKPP'] - DataFrame['AGKPP']
    DataFrame['CKPP'] = DataFrame['HCKPP'] - DataFrame['ACKPP']
    DataFrame['STKPP'] = DataFrame['HSTKPP'] - DataFrame['ASTKPP']

In [7]:
'''-----------------------------------     Adding Streak and Weighted Streak as a Feature  --------------------------------------------'''

## Creating a function which computes the Streak and Weighted Streak.

def computeStreak(DataFrame, slidingWindowParameter):
    """Add Streak and Weighted Streak features to a season's match table, in place.

    Each match gives a team 3 points for a win, 1 for a draw and 0 for a loss
    (derived from ``FTR``: ``'H'`` home win, ``'D'`` draw, ``'A'`` away win).
    For every team that appears in ``HomeTeam`` and each of its matches, the
    points of its previous ``k`` matches are combined into:

    * streak          = sum(points) / (3 * k)
    * weighted streak = sum(weight_i * points_i) / (1.5 * k * (k + 1)),
      where the oldest match in the window has weight 1 and the most recent
      one has weight ``k``.

    The first ``k`` matches of a team stay NaN.  Home rows receive ``HSt`` /
    ``HStWeighted``, away rows ``ASt`` / ``AStWeighted``; ``Streak`` and
    ``WeightedStreak`` hold home minus away.

    The number of teams and of matches per team are taken from the data
    instead of being fixed at 20 and 38.  A match whose ``FTR`` is not one of
    ``'H'``, ``'D'``, ``'A'`` counts as NaN points, so it keeps its position in
    the sequence instead of shifting every later match of that team.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Match table with ``HomeTeam``, ``AwayTeam`` and ``FTR``.
    slidingWindowParameter : int
        Window length ``k``, at least 1.

    Returns
    -------
    None
        The feature columns are written to ``DataFrame``.

    Raises
    ------
    ValueError
        If ``slidingWindowParameter`` is smaller than 1.
    """

    ## Set slidingWindowParameter to k.
    k = slidingWindowParameter
    if k < 1:
        raise ValueError('slidingWindowParameter must be at least 1, got %r' % (k,))

    ## Initialising the values in the columns "HSt, ASt, HStWeighted, AStWeighted".
    for column in ('HSt', 'ASt', 'HStWeighted', 'AStWeighted'):
        DataFrame[column] = np.nan

    ## Weights 1..k: the first match in the window weighs 1, the last one weighs k.
    weightList = [float(i + 1) for i in range(0, k)]

    ## Normalisation constants: the best possible plain and weighted sums over k matches.
    maxPoints = 3.0 * k
    maxWeightedPoints = 1.5 * k * (k + 1)

    ## Points earned in every match from the home side's and from the away side's point of view.
    pointsIfHome = DataFrame['FTR'].map({'H': 3.0, 'D': 1.0, 'A': 0.0}).astype(float).values
    pointsIfAway = DataFrame['FTR'].map({'A': 3.0, 'D': 1.0, 'H': 0.0}).astype(float).values

    for z, team in enumerate(DataFrame['HomeTeam'].unique()):

        ## Row masks for the matches of the team under observation.
        homeMask = (DataFrame['HomeTeam'] == team).values
        awayMask = (DataFrame['AwayTeam'] == team).values & ~homeMask
        playedMask = homeMask | awayMask

        ## True where the team was the home side, restricted to the matches it played.
        wasHome = homeMask[playedMask]

        ## Points the team earned match by match, in row order.
        matchPoints = np.where(homeMask, pointsIfHome, pointsIfAway)[playedMask].tolist()

        streak = []
        weightedStreak = []

        for j in range(len(matchPoints)):

            ## No full window of k earlier matches yet.
            if j < k:
                streak.append(np.nan)
                weightedStreak.append(np.nan)
                continue

            ## The k matches immediately before match j.
            matchPointsSlice = matchPoints[j - k : j]

            streak.append(sum(matchPointsSlice) / maxPoints)
            weightedStreak.append(
                sum(points * weight for points, weight in zip(matchPointsSlice, weightList))
                / maxWeightedPoints
            )

        streak = np.array(streak, dtype = float)
        weightedStreak = np.array(weightedStreak, dtype = float)

        ## Writing through .loc avoids chained assignment, which may silently hit a copy.
        if homeMask.any():
            DataFrame.loc[homeMask, 'HSt'] = streak[wasHome]
            DataFrame.loc[homeMask, 'HStWeighted'] = weightedStreak[wasHome]
        if awayMask.any():
            DataFrame.loc[awayMask, 'ASt'] = streak[~wasHome]
            DataFrame.loc[awayMask, 'AStWeighted'] = weightedStreak[~wasHome]

        ## Same text as the original progress line; this form runs on Python 2 and 3.
        print('Computing Streak and Weighted Streak  %s %s' % (team, z))

    ## Filling in the columns for "Streak and WeightedStreak".
    DataFrame['Streak'] = DataFrame['HSt'] - DataFrame['ASt']
    DataFrame['WeightedStreak'] = DataFrame['HStWeighted'] - DataFrame['AStWeighted']

In [8]:
'''-----------------------------------     Adding Form as a Feature  --------------------------------------------'''

## Creating a function which computes the Form.

def computeForm(DataFrame, stealingFraction):
    """Add the Form feature to a season's match table, in place.

    Every team starts with a form of 1.0.  Matches are processed in row order
    and ``HForm`` / ``AForm`` record the form each side carries *into* the
    match.  After the match the forms are updated with the stealing fraction
    ``k``:

    * home win (``FTR == 'H'``): home gains ``k * away``, away loses ``k * away``
    * away win (``FTR == 'A'``): away gains ``k * home``, home loses ``k * home``
    * draw     (``FTR == 'D'``): each side moves by ``k * (opponent - own)``

    ``Form`` is ``HForm - AForm``.

    Compared with the previous implementation this version
    * no longer references the undefined name ``dataFrame``;
    * does not stop at the hard-coded row label 370, so it works for any
      number of matches and any scheduling of the final fixtures;
    * does not look up rows with ``iloc`` using index labels, so it does not
      depend on a default integer index;
    * leaves a team's form unchanged (instead of falling back to 1.0 for its
      next match) when ``FTR`` holds none of ``'H'``, ``'D'``, ``'A'``.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Match table with ``HomeTeam``, ``AwayTeam`` and ``FTR``.  Rows are
        presumably in chronological order -- the update follows row order.
    stealingFraction : float
        Fraction ``k`` of form transferred by a result.

    Returns
    -------
    None
        The columns ``HForm``, ``AForm`` and ``Form`` are written to ``DataFrame``.
    """

    ## Hyper-Parameter k.
    k = stealingFraction

    ## Current form of every team seen so far; teams not seen yet have form 1.0.
    currentForm = {}

    ## Form carried into each match by the home and by the away side.
    homeFormBefore = []
    awayFormBefore = []

    ## Iterating over each match in the season.
    for homeTeam, awayTeam, result in zip(DataFrame['HomeTeam'], DataFrame['AwayTeam'], DataFrame['FTR']):

        homeForm = currentForm.get(homeTeam, 1.0)
        awayForm = currentForm.get(awayTeam, 1.0)

        homeFormBefore.append(homeForm)
        awayFormBefore.append(awayForm)

        ## Home win: positive update for the home team, negative update for the away team.
        if (result == 'H'):
            homeUpdate = homeForm + k * awayForm
            awayUpdate = awayForm - k * awayForm

        ## Away win: positive update for the away team, negative update for the home team.
        elif (result == 'A'):
            homeUpdate = homeForm - k * homeForm
            awayUpdate = awayForm + k * homeForm

        ## Draw: both teams move towards each other.
        elif (result == 'D'):
            homeUpdate = homeForm - k * (homeForm - awayForm)
            awayUpdate = awayForm - k * (awayForm - homeForm)

        ## Unknown result: nothing to transfer.
        else:
            continue

        ## Both updates were computed from the pre-match values, so the order of storing is irrelevant.
        currentForm[homeTeam] = homeUpdate
        currentForm[awayTeam] = awayUpdate

    ## Explicit index and dtype keep the columns aligned and float-typed even for an empty table.
    DataFrame['HForm'] = pd.Series(homeFormBefore, index = DataFrame.index, dtype = float)
    DataFrame['AForm'] = pd.Series(awayFormBefore, index = DataFrame.index, dtype = float)

    ## Filling in the column for "Form".
    DataFrame['Form'] = DataFrame['HForm'] - DataFrame['AForm']

In [16]:
'''-----------------------------------     Adding Fifa Ratings  --------------------------------------------'''

## Creating a function that adds the scraped fifa ratings to the dataframe.
def addFifaRatings(DataFrame):
    """Early draft of the FIFA-ratings merge.

    NOTE(review): a function with the same name is defined again in a later
    cell of this notebook; once that cell runs it replaces this definition.

    What this draft does, as written:
    * reads './Log.csv' and keeps the rows whose 'Season' equals the first
      season value found in ``DataFrame``;
    * adds eight NaN-initialised rating columns to the frame that was passed in;
    * rebinds the local name ``DataFrame`` to the result of ``replace`` so the
      team names follow the spelling of the ratings file (names are paired by
      their position in the two case-insensitively sorted lists);
    * loops over ``range(0, 1)``, i.e. only the first standardised team;
    * assigns the selections taken from ``infoRow`` without converting them
      to scalars;
    * prints the match indices of that team;
    * has no ``return`` statement.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Match table with 'Season', 'HomeTeam' and 'AwayTeam' columns.
    """

    ## Reading in the fifa ratings file.
    fifaRatings = pd.read_csv('./Log.csv', sep = ',')

    ## Singling out the season currently under observation.
    currSeason = DataFrame.Season.unique()[0]

    ## Isolating the respective season fifa ratings.
    fifaRatingsIsolated = fifaRatings[ fifaRatings['Season'] == currSeason]

    ## Initialising the values in the columns for the Fifa Team Rankings .
    ## These columns are added to the frame that was passed in by the caller.
    DataFrame['AOverall'] = np.nan
    DataFrame['HOverall'] = np.nan
    DataFrame['AAttack'] = np.nan
    DataFrame['HAttack'] = np.nan
    DataFrame['AMidfield'] = np.nan
    DataFrame['HMidfield'] = np.nan
    DataFrame['ADefense'] = np.nan
    DataFrame['HDefense'] = np.nan

    ## Creating a list of all the teams that played in that season (Non-Standard).
    TeamsNS = list((DataFrame).HomeTeam.unique())
    TeamsNS = sorted(TeamsNS, key = str.lower)

    ## Creating a list of all the teams that played in that season (Standard).
    TeamsS = list((fifaRatingsIsolated).Name.unique())
    TeamsS = sorted(TeamsS, key = str.lower)

    ## Replacing the non-standard names by the standard names in the dataframe.
    ## The i-th sorted non-standard name is mapped to the i-th sorted standard name.
    ## From here on the local name DataFrame refers to the frame returned by replace.
    DataFrame = DataFrame.replace(TeamsNS, TeamsS)

    ## Looping over the teams; range(0, 1) restricts the loop to the first team only.
    for z in range(0, 1):
        
        ## Creating a Temporary DataFrame where the team was either "Home" or "Away" .
        tempDF = DataFrame[ (DataFrame['HomeTeam'] == str(TeamsS[z]) ) | ( DataFrame['AwayTeam'] == str(TeamsS[z])) ]
        
        ## Parsing the attributes for the particular team under observation.
        ## Each of these is a column selection from infoRow, not a scalar.
        infoRow = fifaRatingsIsolated[fifaRatingsIsolated['Name'] == TeamsS[z]]
        Overall = infoRow['Overall'] 
        Defense = infoRow['Defense']
        MidField = infoRow['Midfield']
        Attack = infoRow['Attack']
        
         ## Creating a list for the index values of the games contained in the tempDF.
        gameIndices = tempDF.index.tolist()
        ## Debug output of the match indices.
        print gameIndices

        ## Creating two lists which contains the index number of those games wherein the team under observation was Home or Away.
        indexHome = []
        indexAway = []

        ## Segregate home and away match indices.
        for index, row in tempDF.iterrows():

            if (TeamsS[z] == row['HomeTeam']):
                 indexHome.append(index)

            elif (TeamsS[z] == row['AwayTeam']):
                indexAway.append(index)

        ## Appending the appropriate rating values to the dataframe.
        ## range(0, 38) requires the team to have at least 38 matches in tempDF.
        for j in range(0, 38):

            if (gameIndices[j] in indexHome):
                DataFrame['HOverall'][gameIndices[j]] = Overall
                DataFrame['HAttack'][gameIndices[j]] = Attack
                DataFrame['HMidfield'][gameIndices[j]] = MidField
                DataFrame['HDefense'][gameIndices[j]] = Defense

            elif (gameIndices[j] in indexAway):
                DataFrame['AOverall'][gameIndices[j]] = Overall
                DataFrame['AAttack'][gameIndices[j]] = Attack
                DataFrame['AMidfield'][gameIndices[j]] = MidField
                DataFrame['ADefense'][gameIndices[j]] = Defense

    ## Filling in the columns for "Overall, Attack, Midfield and Defense" (home minus away).
    DataFrame['Overall'] = DataFrame.apply(lambda row: row['HOverall'] - row['AOverall'], axis = 1)
    DataFrame['Attack'] = DataFrame.apply(lambda row: row['HAttack'] - row['AAttack'], axis = 1)
    DataFrame['Midfield'] = DataFrame.apply(lambda row: row['HMidfield'] - row['AMidfield'], axis = 1)
    DataFrame['Defense'] = DataFrame.apply(lambda row: row['HDefense'] - row['ADefense'], axis = 1)

In [51]:
'''-----------------------------------     Adding Fifa Ratings  --------------------------------------------'''

## Creating a function that adds the scraped fifa ratings to the dataframe.
def addFifaRatings(DataFrame, ratingsPath = './Log.csv'):
    """Return a copy of a season's match table with FIFA team ratings attached.

    The ratings file is filtered to the season found in ``DataFrame``.  Team
    names in ``HomeTeam`` / ``AwayTeam`` are rewritten to the spelling used in
    the ratings file; the two name lists are sorted case-insensitively and
    paired by position.  NOTE(review): this pairing is only right when both
    sources order the clubs identically -- an explicit name mapping would be
    safer.

    For each of Overall, Attack, Midfield and Defense the result has an
    away column (``A...``), a home column (``H...``) and the difference
    home minus away under the bare attribute name.  Ratings are truncated to
    integers and stored as floats; teams without a rating get NaN.

    Unlike the previous implementation the caller's frame is left untouched
    (it used to receive eight all-NaN columns), the name replacement is
    limited to the two team columns, and neither the number of teams nor the
    number of matches per team is hard-coded.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Match table with ``Season``, ``HomeTeam`` and ``AwayTeam``.
    ratingsPath : str or file-like, optional
        Location of the ratings CSV with the columns ``Season``, ``Name``,
        ``Overall``, ``Attack``, ``Midfield`` and ``Defense``.  Defaults to
        ``'./Log.csv'``.

    Returns
    -------
    pandas.DataFrame
        New frame with standardised team names and the rating columns.

    Raises
    ------
    ValueError
        If the number of teams differs between the two sources, or if the
        ratings file lists a team more than once for the season.
    """

    ## Reading in the fifa ratings file.
    fifaRatings = pd.read_csv(ratingsPath, sep = ',')

    ## Singling out the season currently under observation.
    currSeason = DataFrame.Season.unique()[0]

    ## Isolating the respective season fifa ratings.
    fifaRatingsIsolated = fifaRatings[fifaRatings['Season'] == currSeason]

    if fifaRatingsIsolated['Name'].duplicated().any():
        raise ValueError('ratings file lists a team more than once for season %r' % (currSeason,))

    ## Case-insensitive ordering; a lambda works for both byte and unicode strings.
    lowerCase = lambda name: name.lower()

    ## Teams as spelled in the match table (Non-Standard) and in the ratings file (Standard).
    TeamsNS = sorted(DataFrame['HomeTeam'].unique(), key = lowerCase)
    TeamsS = sorted(fifaRatingsIsolated['Name'].unique(), key = lowerCase)

    if len(TeamsNS) != len(TeamsS):
        raise ValueError(
            'cannot pair team names: %d teams in the match table but %d in the ratings for season %r'
            % (len(TeamsNS), len(TeamsS), currSeason)
        )

    standardName = dict(zip(TeamsNS, TeamsS))

    ## Work on a copy so the caller's frame is not modified.
    result = DataFrame.copy()

    ## Replacing the non-standard names by the standard names.
    for column in ('HomeTeam', 'AwayTeam'):
        result[column] = result[column].map(lambda name: standardName.get(name, name))

    ratingsByTeam = fifaRatingsIsolated.set_index('Name')
    attributes = ('Overall', 'Attack', 'Midfield', 'Defense')

    ## Attaching the ratings; away column first to keep the original column order.
    for attribute in attributes:
        lookup = ratingsByTeam[attribute].astype(int).to_dict()
        result['A' + attribute] = result['AwayTeam'].map(lookup).astype(float)
        result['H' + attribute] = result['HomeTeam'].map(lookup).astype(float)

    ## Filling in the columns for "Overall, Attack, Midfield and Defense" (home minus away).
    for attribute in attributes:
        result[attribute] = result['H' + attribute] - result['A' + attribute]

    return result

In [52]:
## Attach the FIFA ratings and preview the first rows instead of rendering the whole table.
df = addFifaRatings(DataFrame)
df.head(10)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AAttack,HAttack,AMidfield,HMidfield,ADefense,HDefense,Overall,Attack,Midfield,Defense
0,E0,2005-08-13,Aston Villa,Bolton Wanderers,2,2,D,2,2,D,...,79.0,84.0,77.0,75.0,70.0,77.0,3.0,5.0,-2.0,7.0
1,E0,2005-08-13,Everton,Manchester United,0,2,A,0,1,A,...,92.0,80.0,85.0,80.0,84.0,80.0,-6.0,-12.0,-5.0,-4.0
2,E0,2005-08-13,Fulham,Birmingham City,0,0,D,0,0,D,...,76.0,73.0,76.0,78.0,75.0,74.0,-1.0,-3.0,2.0,-1.0
3,E0,2005-08-13,Manchester City,West Bromwich,0,0,D,0,0,D,...,74.0,80.0,73.0,77.0,71.0,78.0,5.0,6.0,4.0,7.0
4,E0,2005-08-13,Middlesbrough,Liverpool,0,0,D,0,0,D,...,90.0,83.0,87.0,79.0,81.0,79.0,-5.0,-7.0,-8.0,-2.0
5,E0,2005-08-13,Portsmouth,Tottenham Hotspur,0,2,A,0,1,A,...,83.0,76.0,79.0,71.0,76.0,75.0,-5.0,-7.0,-8.0,-1.0
6,E0,2005-08-13,Sunderland,Charlton Athletic,1,3,A,1,1,D,...,70.0,72.0,77.0,66.0,73.0,65.0,-5.0,2.0,-11.0,-8.0
7,E0,2005-08-13,West Ham United,Blackburn Rovers,3,1,H,0,1,A,...,78.0,70.0,72.0,73.0,73.0,71.0,-1.0,-8.0,1.0,-2.0
8,E0,2005-08-14,Arsenal,Newcastle United,2,0,H,0,0,D,...,86.0,90.0,80.0,84.0,76.0,86.0,6.0,4.0,4.0,10.0
9,E0,2005-08-14,Wigan Athletic,Chelsea,0,1,A,0,0,D,...,87.0,75.0,88.0,68.0,85.0,72.0,-17.0,-12.0,-20.0,-13.0


In [66]:
'''-----------------------------------     Adding Time Till Last Match as a Feature  --------------------------------------------'''

## Creating a function which computes the Time Till Last Match.

def computeTTLM(DataFrame):
    """Add "Time Till Last Match" features to a season DataFrame, in place.

    For every match three columns are written:

    * ``HTTLM`` -- days since the home team's previous match in this frame.
    * ``ATTLM`` -- days since the away team's previous match in this frame.
    * ``TTLM``  -- ``HTTLM - ATTLM``.

    A team's first match in the frame has no predecessor and gets 0.
    The ``Date`` column is converted with ``pd.to_datetime`` as a side effect.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Season fixtures with ``Date``, ``HomeTeam`` and ``AwayTeam`` columns.
        Rows are assumed to be in chronological order with unique index
        labels (each gap is taken against the previous row of the same team).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the call can be chained.
        Existing callers that ignore the return value are unaffected.
    """

    ## Initialising the values in the columns "HTTLM" and "ATTLM".
    DataFrame['HTTLM'] = np.nan
    DataFrame['ATTLM'] = np.nan

    ## Converting the Date column to pandas datetime format.
    DataFrame['Date'] = pd.to_datetime(DataFrame['Date'])

    ## Creating a list of all the teams that played in that season.
    ## Teams are read from the home column, so a team that never hosts is not processed.
    Teams = list(DataFrame['HomeTeam'].unique())

    ## Iterating over the teams actually present instead of assuming 20 teams x 38 matches.
    for z, team in enumerate(Teams):

        ## Boolean masks over the whole season for the team under observation.
        isHome = DataFrame['HomeTeam'] == team
        isAway = DataFrame['AwayTeam'] == team

        ## Days between consecutive matches of this team; the first match has no predecessor -> 0.
        teamDates = DataFrame.loc[isHome | isAway, 'Date']
        daysSinceLast = teamDates.diff().dt.days.fillna(0)

        ## .loc aligns the Series on the index, so each match receives its own value
        ## (this replaces the chained-indexing assignment, which may write to a temporary copy).
        DataFrame.loc[isHome, 'HTTLM'] = daysSinceLast
        DataFrame.loc[isAway, 'ATTLM'] = daysSinceLast

        ## Progress message; emits the same text as the former Python 2 print statement.
        print('Computing Time till Last Match  %s %s' % (team, z))

    ## Filling in the column for "TTLM" (vectorised difference instead of a row-wise apply).
    DataFrame['TTLM'] = DataFrame['HTTLM'] - DataFrame['ATTLM']

    return DataFrame

In [27]:
## List of features whose initial values are NaN.
## NOTE(review): these columns are presumably produced by computeKPP / computeStreak -- verify.
## Defined once, outside the loop, because it does not depend on the season.
nanFeatures = ['GKPP', 'HGKPP', 'AGKPP', 'CKPP', 'HCKPP', 'ACKPP', 'STKPP', 'HSTKPP', 'ASTKPP', 'Streak', 'HSt', 'ASt', 'WeightedStreak', 'HStWeighted', 'AStWeighted']

## Seasons with the NaN rows removed, in the same order as DataFrames.
## Previously the result of dropna was only bound to the loop variable and thrown away on the
## next iteration; it is now kept here. The frames inside DataFrames are deliberately left with
## all of their rows, exactly as before, so later cells that rely on their index still work.
cleanedDataFrames = []

for i, dataFrame in enumerate(DataFrames):

    ## Match-wise goal difference seen from the home side and from the away side.
    dataFrame['MHTGD'] = dataFrame['FTHG'] - dataFrame['FTAG']
    dataFrame['MATGD'] = dataFrame['FTAG'] - dataFrame['FTHG']

    ## Computing the features. The return values are ignored, as before.
    computeTGD(dataFrame)
    computeTTLM(dataFrame)
    computeKPP(dataFrame, 6)
    computeStreak(dataFrame, 6)
    computeForm(dataFrame, 0.33)

    ## Dropping all rows containing NaN values for the above feature list and keeping the result.
    dataFrame = dataFrame.dropna(subset = nanFeatures)
    cleanedDataFrames.append(dataFrame)

    ## Progress indicator (same output under Python 2 and Python 3).
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12


<h1><center> ProtoTyping for designing Form Feature Function </center></h1>

In [16]:
## Hyper-Parameter k : the fraction of a form value that is transferred after a match.
k = 0.5

## Dummy dataframe. Note that this is the same object as DataFrames[8], not a copy.
dataFrame = DataFrames[8]

## Initialising the values contained in the columns "HForm" and "AForm".
dataFrame['HForm'] = 1.0
dataFrame['AForm'] = 1.0

## Global form dictionary : team name -> current form value.
gFormDict = {}

## Team name -> list of the index labels of that team's matches, in frame order.
teamMatchLookup = {}

## Team name -> number of that team's matches processed so far.
matchCounterDict = {}

## Creating a list of all the teams that played in that season.
Teams = list((dataFrame).HomeTeam.unique())

for teamName in Teams :

    ## Initialising values.
    gFormDict[teamName] = 1.0
    matchCounterDict[teamName] = 0

    ## Temporary dataframe with every match of the team under consideration.
    tempDF = dataFrame[(dataFrame['HomeTeam'] == str(teamName)) | ( dataFrame['AwayTeam'] == str(teamName))]

    ## Assigning match indices list to the relevant team.
    teamMatchLookup[teamName] = tempDF.index.tolist()


def _protoFormUpdate(result, homeForm, awayForm, k):
    """Return (homeUpdate, awayUpdate, homeFormula, awayFormula) for one match.

    result is the full-time result: 'H' (home win), 'A' (away win) or 'D' (draw).
    The winner gains k times the loser's form and the loser gives up k times its own form;
    after a draw both sides move towards each other by k times the difference.
    The two formula strings spell out the arithmetic for the log output.
    Returns None for any other result value, in which case no update is made.
    """
    if result == 'H':
        return (homeForm + k * awayForm,
                awayForm - k * awayForm,
                str(homeForm) + " + " + str(k) + " * " + str(awayForm),
                str(awayForm) + " - " + str(k) + " * " + str(awayForm))

    if result == 'A':
        return (homeForm - k * homeForm,
                awayForm + k * homeForm,
                str(homeForm) + " - " + str(k) + " * " + str(homeForm),
                str(awayForm) + " + " + str(k) + " * " + str(homeForm))

    if result == 'D':
        return (homeForm - k * (homeForm - awayForm),
                awayForm - k * (awayForm - homeForm),
                str(homeForm) + " - " + str(k) + " * (" + str(homeForm) + " - " + str(awayForm) + ")",
                str(awayForm) + " - " + str(k) + " * (" + str(awayForm) + " - " + str(homeForm) + ")")

    return None


def _protoNextMatch(teamName, teamMatchLookup, matchCounterDict):
    """Return the index label of the team's next match, or None after its last match."""
    fixtures = teamMatchLookup[teamName]
    played = matchCounterDict[teamName]
    return fixtures[played] if played < len(fixtures) else None


def _protoStoreForm(dataFrame, gFormDict, teamName, nextMatch, newForm):
    """Write newForm into the team's next match row (if there is one) and into gFormDict."""
    if nextMatch is not None:
        ## Label-based lookup; the team is either the home or the away side of its next match.
        if (dataFrame.loc[nextMatch, 'HomeTeam'] == teamName):
            dataFrame.loc[nextMatch, 'HForm'] = newForm

        elif (dataFrame.loc[nextMatch, 'AwayTeam'] == teamName):
            dataFrame.loc[nextMatch, 'AForm'] = newForm

    ## Update value in the dictionary.
    gFormDict[teamName] = newForm


## Iterating over each match in the season.
for index, row in dataFrame.iterrows():

    ## Print match number.
    print("Match number :  %s" % (index,))

    homeTeam = row['HomeTeam']
    awayTeam = row['AwayTeam']

    ## Update match counter for the playing teams.
    matchCounterDict[homeTeam] += 1
    matchCounterDict[awayTeam] += 1

    ## Form values of the teams before coming into the match.
    prevHomeForm = gFormDict[homeTeam]
    prevAwayForm = gFormDict[awayTeam]

    update = _protoFormUpdate(row['FTR'], prevHomeForm, prevAwayForm, k)

    ## Unknown result code : nothing to update for this match.
    if update is None:
        continue

    homeUpdate, awayUpdate, homeFormula, awayFormula = update

    ## Printing useful match information.
    print("Home Team :  %s" % (homeTeam,))
    print("Away Team :  %s" % (awayTeam,))
    print("Result :  %s" % (row['FTR'],))
    print("Home Team Form Value before match :  %s" % (prevHomeForm,))
    print("Away Team Form Value before match :  %s" % (prevAwayForm,))

    ## Next match index of the Home and Away Team.
    ## A team that has just played its last match has no next match (None). This per-team check
    ## replaces the former hard-coded stop at match 370, which assumed that the final ten rows
    ## of the season are every team's last match.
    nextMatchH = _protoNextMatch(homeTeam, teamMatchLookup, matchCounterDict)
    nextMatchA = _protoNextMatch(awayTeam, teamMatchLookup, matchCounterDict)
    print("Next Home Team Match Index :  %s" % (nextMatchH,))
    print("Next Away Team Match Index :  %s" % (nextMatchA,))

    ## Store the new form where each team will read it : in the row of its next match.
    _protoStoreForm(dataFrame, gFormDict, homeTeam, nextMatchH, homeUpdate)
    _protoStoreForm(dataFrame, gFormDict, awayTeam, nextMatchA, awayUpdate)

    ## Visually show the update.
    print("New Form Update for Home : " + homeFormula)
    print("New Form Update for Away : " + awayFormula)

    ## Print actual form values of the teams after the match.
    print("Home Team Form Value after the match :  %s" % (gFormDict[homeTeam],))
    print("Away Team Form Value after the match :  %s" % (gFormDict[awayTeam],))

    print(" ")

Match number :  0
Home Team :  Arsenal
Away Team :  Aston Villa
Result :  A
Home Team Form Value before match :  1.0
Away Team Form Value before match :  1.0
Next Home Team Match Index :  13
Next Away Team Match Index :  10
New Form Update for Home : 1.0 - 0.5 * 1.0
New Form Update for Away : 1.0 + 0.5 * 1.0
Home Team Form Value after the match :  0.5
Away Team Form Value after the match :  1.5
 
Match number :  1
Home Team :  Liverpool
Away Team :  Stoke
Result :  H
Home Team Form Value before match :  1.0
Away Team Form Value before match :  1.0
Next Home Team Match Index :  11
Next Away Team Match Index :  17
New Form Update for Home : 1.0 + 0.5 * 1.0
New Form Update for Away : 1.0 - 0.5 * 1.0
Home Team Form Value after the match :  1.5
Away Team Form Value after the match :  0.5
 
Match number :  2
Home Team :  Norwich
Away Team :  Everton
Result :  D
Home Team Form Value before match :  1.0
Away Team Form Value before match :  1.0
Next Home Team Match Index :  14
Next Away Team Ma

In [None]:
## Every match in which Liverpool appears on either side, to inspect the form values written above.
df = dataFrame[(dataFrame[['HomeTeam', 'AwayTeam']] == 'Liverpool').any(axis = 1)]
df

<h1><center> ProtoTyping for designing KPP Feature Function </center></h1>

In [None]:
## Hyper-Parameter k : size of the sliding window (number of previous matches averaged).
k = 2

## Initialising the values contained in the columns "HGKPP , HCKPP , HSTKPP" and "AGKPP , ACKPP , ASTKPP". (KPP Features).
for kppColumn in ['HGKPP', 'AGKPP', 'HCKPP', 'ACKPP', 'HSTKPP', 'ASTKPP']:
    mufcData[kppColumn] = np.nan

## Lists which contain Goals, Corners and Number of Shots on Target for the team under observation.
Goals = []
Corners = []
shotsonTarget = []

## Lists which contain the index labels of the games wherein the team was Home or Away.
indexHome = []
indexAway = []

## A single pass collects both the per-match statistics and the home / away index labels.
for index, row in mufcData.iterrows():

    if ('Man United' == row['HomeTeam']):
        Goals.append(float(row['FTHG']))
        Corners.append(float(row['HC']))
        shotsonTarget.append(float(row['HST']))
        indexHome.append(index)

    elif ('Man United' == row['AwayTeam']):
        Goals.append(float(row['FTAG']))
        Corners.append(float(row['AC']))
        shotsonTarget.append(float(row['AST']))
        indexAway.append(index)

## Creating lists to hold values for the corresponding KPP Features.
## Since these features are non existent for the first k matches, fill NaN for the first k values.
goalsKPP = [np.nan] * k
cornersKPP = [np.nan] * k
shotsOnTargetKPP = [np.nan] * k

## Adding appropriate values to the list.
## The number of computations performed is (n + 1 - k) where :
## n = number of matches actually recorded for the team (no longer assumed to be 38).
## k = sliding window hyper-parameter.
## Entry j (for j >= k) is therefore the average over matches j - k .. j - 1.
matchCount = len(Goals)

for i in range(0, (matchCount + 1 - k)):

    ## Sum the slice of records and normalize it by k.
    goalsKPP.append(sum(Goals[i : (i + k)])/k)
    cornersKPP.append(sum(Corners[i : (i + k)])/k)
    shotsOnTargetKPP.append(sum(shotsonTarget[i : (i + k)])/k)

## Creating a list for the index values of the games contained in mufcData.
gameIndices = mufcData.index.tolist()

## Appending the appropriate "KPP" values to the dataframe.
## zip stops at the shorter sequence, so a short season cannot run past the end of a list.
## .loc replaces the chained-indexing assignment, which may write to a temporary copy.
for matchIndex, goalsValue, cornersValue, shotsValue in zip(gameIndices, goalsKPP, cornersKPP, shotsOnTargetKPP):

    if (matchIndex in indexHome):
        mufcData.loc[matchIndex, 'HGKPP'] = goalsValue
        mufcData.loc[matchIndex, 'HCKPP'] = cornersValue
        mufcData.loc[matchIndex, 'HSTKPP'] = shotsValue

    elif (matchIndex in indexAway):
        mufcData.loc[matchIndex, 'AGKPP'] = goalsValue
        mufcData.loc[matchIndex, 'ACKPP'] = cornersValue
        mufcData.loc[matchIndex, 'ASTKPP'] = shotsValue

print(goalsKPP)
mufcData

<h1><center> ProtoTyping for designing Streak and Weighted Streak Feature Function </center></h1>

In [None]:
## Hyper-Parameter k : size of the sliding window (number of previous matches considered).
k = 2

## Initialising the values in the columns "HSt, ASt , HStWeighted , AStWeighted".
for streakColumn in ['HSt', 'ASt', 'HStWeighted', 'AStWeighted']:
    mufcData[streakColumn] = np.nan

## Points earned by the team under observation, keyed by full-time result, for each venue.
## 0 - Loss
## 1 - Draw
## 3 - Win
pointsWhenHome = {'A': 0.0, 'D': 1.0, 'H': 3.0}
pointsWhenAway = {'H': 0.0, 'D': 1.0, 'A': 3.0}

## List which contains the points assigned to the team after each of its matches.
matchPoints = []

## Weights assigned to each match according to the sliding window hyper-parameter.
## The first match in the window is assigned a weight of 1 and the last match a weight of k.
weightList = [(i + 1) for i in range(0, k)]

## Lists which contain the index labels of the games wherein the team was Home or Away.
indexHome = []
indexAway = []

## A single pass collects both the match points and the home / away index labels.
for index, row in mufcData.iterrows():

    if ('Man United' == row['HomeTeam']):
        indexHome.append(index)
        ## Results other than H / D / A contribute no points entry.
        if (row['FTR'] in pointsWhenHome):
            matchPoints.append(pointsWhenHome[row['FTR']])

    elif ('Man United' == row['AwayTeam']):
        indexAway.append(index)
        if (row['FTR'] in pointsWhenAway):
            matchPoints.append(pointsWhenAway[row['FTR']])

## Creating lists to hold values for the corresponding Streak and Weighted Streak Features.
## Since these features are non existent for the first k matches, fill NaN for the first k values.
streak = [np.nan] * k
weightedStreak = [np.nan] * k

## Adding appropriate values to the list.
## The number of computations performed is (n + 1 - k) where :
## n = number of matches actually recorded for the team (no longer assumed to be 38).
## k = sliding window hyper-parameter.
matchCount = len(matchPoints)

for i in range(0, (matchCount + 1 - k)):

    ## Obtaining the slice of records to be observed.
    matchPointsSlice = matchPoints[i : (i + k)]

    ## Sum the slice of records and normalize it by 3k (the maximum points available in the window).
    streakValue = sum(matchPointsSlice)/(3 * k)

    ## Multiply the slice by the weights.
    ## Sum the slice of records and normalize it by (3k(k+1))/2 (the maximum weighted points).
    weightedStreakValue = sum(list(np.array(matchPointsSlice) * np.array(weightList)))/((1.5) * k * (k + 1))

    ## Appending to the list of the corresponding features.
    streak.append(streakValue)
    weightedStreak.append(weightedStreakValue)

## Creating a list for the index values of the games contained in mufcData.
gameIndices = mufcData.index.tolist()

## Appending the appropriate "Streak" values to the dataframe.
## zip stops at the shorter sequence, so a short season cannot run past the end of a list.
## .loc replaces the chained-indexing assignment, which may write to a temporary copy.
for matchIndex, streakValue, weightedStreakValue in zip(gameIndices, streak, weightedStreak):

    if (matchIndex in indexHome):
        mufcData.loc[matchIndex, 'HSt'] = streakValue
        mufcData.loc[matchIndex, 'HStWeighted'] = weightedStreakValue

    elif (matchIndex in indexAway):
        mufcData.loc[matchIndex, 'ASt'] = streakValue
        mufcData.loc[matchIndex, 'AStWeighted'] = weightedStreakValue


print(streak)
mufcData

<h1><center> Prototype for designing Time Till Last Match Feature Function </center></h1>

In [23]:
## Parse the fixture dates so that consecutive matches can be subtracted from one another.
mufcData['Date'] = pd.to_datetime(mufcData['Date'])

## "TTLM" holds the number of days since the previous row of mufcData;
## the first row has no predecessor and is given 0.
mufcData['TTLM'] = mufcData['Date'].diff().dt.days.fillna(0)
mufcData

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,Season,TTLM
1,E0,2005-08-13,Everton,Man United,0,2,A,0,1,A,...,1.87,1.79,23,0.75,2.05,2.0,1.93,1.86,2005-2006,0.0
14,E0,2005-08-20,Man United,Aston Villa,1,0,H,0,0,D,...,2.1,1.97,25,-1.5,1.95,1.87,2.0,1.95,2005-2006,7.0
35,E0,2005-08-28,Newcastle,Man United,0,2,A,0,0,D,...,1.95,1.82,24,0.75,2.1,2.07,1.86,1.79,2005-2006,8.0
39,E0,2005-09-10,Man United,Man City,1,1,D,1,0,H,...,1.95,1.87,26,-1.25,2.11,2.03,1.86,1.83,2005-2006,13.0
52,E0,2005-09-18,Liverpool,Man United,0,0,D,0,0,D,...,1.72,1.66,24,0.0,2.03,1.98,1.92,1.85,2005-2006,8.0
60,E0,2005-09-24,Man United,Blackburn,1,2,A,0,1,A,...,2.0,1.86,25,-1.5,2.07,1.98,1.92,1.85,2005-2006,6.0
68,E0,2005-10-01,Fulham,Man United,2,3,A,2,3,A,...,1.8,1.72,27,0.75,2.0,1.94,1.96,1.92,2005-2006,7.0
79,E0,2005-10-15,Sunderland,Man United,1,3,A,0,1,A,...,1.85,1.76,24,1.0,2.0,1.92,1.97,1.91,2005-2006,14.0
90,E0,2005-10-22,Man United,Tottenham,1,1,D,1,0,H,...,1.81,1.75,27,-1.0,2.25,2.08,1.82,1.74,2005-2006,7.0
100,E0,2005-10-29,Middlesbrough,Man United,4,1,H,3,0,H,...,1.8,1.73,27,0.5,2.1,2.01,1.89,1.84,2005-2006,7.0
