CSC2515 Group Project

Members: Zhanwen Tan

In [None]:
import os
import pandas as pd
def load_df(file_name):
    """Read the CSV file at ``file_name`` and return it as a pandas DataFrame.

    Parameters
    ----------
    file_name : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame holding the parsed file contents.
    """
    # Single-component join, kept exactly as the notebook had it.
    csv_location = os.path.join(file_name)
    return pd.read_csv(csv_location)

In [34]:
def evaluate(data_df):
    """Print an exploratory summary of ``data_df`` to stdout.

    Reported in order: the frame's shape, its column names, the dtype of each
    column, the number of rows per ``matchType`` value, whether any cell is
    missing, and the rows that contain at least one missing value.

    ``data_df`` must have a ``matchType`` column. Nothing is returned.
    """
    # Shape of the frame
    frame_shape = data_df.shape
    print("The dimension of data is {}".format(frame_shape))

    # Column names
    column_names = data_df.columns.tolist()
    print("All the columns are listed as follow")
    print(column_names)

    # Column dtypes
    print("The type of each column is:")
    print(data_df.dtypes)

    # Number of rows for every match type
    rows_per_match_type = data_df.groupby(['matchType']).size()
    print("The count of each match type is:")
    print(rows_per_match_type)

    # Missing values: build the NaN mask once and reuse it for both reports
    missing_mask = data_df.isna()
    has_missing = missing_mask.any().any()
    print("Is there any missing value in data? Ans: {}".format(has_missing))
    rows_with_missing = data_df[missing_mask.any(axis=1)]
    print("The row that has missing value is:")
    print(rows_with_missing)
    #print(rows_with_missing.isna().any())

    # Optional diagnostics, left disabled:
    #data_df.describe()
    #data_df.corr().style.background_gradient(cmap='coolwarm').set_precision(3)

In [35]:
import numpy as np
def preprocess(data_df):
    """Clean the raw match frame and return it as a numeric numpy array.

    Steps, in order:
      1. Drop the identifier columns ``Id``, ``groupId`` and ``matchId``.
      2. Drop every row that contains a missing value.
      3. Keep only rows whose ``matchType`` is one of duo, duo-fpp, solo,
         solo-fpp, squad, squad-fpp.
      4. Merge each ``-fpp`` variant into its base type (duo / solo / squad).
      5. Replace ``matchType`` with integer category codes. ``pd.Categorical``
         sorts the categories, so the mapping is duo -> 0, solo -> 1,
         squad -> 2; the mapping is printed for reference.

    Progress information is printed to stdout. The input frame is not modified.

    Parameters
    ----------
    data_df : pandas.DataFrame containing at least the columns ``Id``,
        ``groupId``, ``matchId`` and ``matchType``.

    Returns
    -------
    numpy.ndarray with one row per remaining record and the remaining columns
    in their original order.
    """
    # Drop the identifier columns and all rows with missing values. dropna()
    # replaces a hard-coded row label, so this also works on frames with a
    # different index or with more than one incomplete row.
    data_df = data_df.drop(['Id', 'groupId', 'matchId'], axis=1).dropna()
    print("Does the data have missing values now? Ans:{}".format(data_df.isna().any().any()))

    # Keep only the standard match types. The explicit copy makes the column
    # assignments below operate on an independent frame instead of a slice.
    kept_match_types = ['duo', 'duo-fpp', 'solo', 'solo-fpp', 'squad', 'squad-fpp']
    data_df = data_df[data_df['matchType'].isin(kept_match_types)].copy()

    # Combine match types. Assign the result back: an in-place replace on a
    # selected column is chained assignment and is not guaranteed to update
    # the frame.
    data_df['matchType'] = data_df['matchType'].replace(
        {"duo-fpp": "duo", "solo-fpp": "solo", "squad-fpp": "squad"}
    )

    # Count the match types after preprocessing
    print("After preprocessing, the count of each match type is:")
    print(data_df.groupby(['matchType']).size())

    # Map the matchType column onto distinct integers
    match_type_categorical = pd.Categorical(data_df['matchType'])
    print(dict(enumerate(match_type_categorical.categories)))  # Print the mappings
    data_df['matchType'] = match_type_categorical.codes

    # Transform the pandas data frame into a numpy array
    data = np.array(data_df)

    return data

In [36]:
def partition(data_preprocessed, datasize_per_matchtype, match_type_index=12):
    """Split the preprocessed array into per-match-type feature/target sets.

    For each of solo, duo and squad, the first ``datasize_per_matchtype`` rows
    of that type are kept. The mixed set is the concatenation solo, duo, squad
    of those subsets. In every set the last column is the target ``y`` and all
    preceding columns (including the match type code) form ``X``.

    Parameters
    ----------
    data_preprocessed : 2-D numpy array as returned by ``preprocess``.
    datasize_per_matchtype : maximum number of rows kept per match type.
    match_type_index : column holding the integer match type code. The default
        12 is the position of ``matchType`` once ``Id``, ``groupId`` and
        ``matchId`` have been dropped.

    Returns
    -------
    Tuple ``(X_all, y_all, X_solo, y_solo, X_duo, y_duo, X_squad, y_squad)``.
    """
    # preprocess() encodes matchType with pd.Categorical, which orders the
    # categories alphabetically: duo -> 0, solo -> 1, squad -> 2.
    DUO_CODE, SOLO_CODE, SQUAD_CODE = 0, 1, 2

    match_type_codes = data_preprocessed[:, match_type_index]

    def first_rows_of(code):
        # Rows of one match type, capped at the requested subset size.
        return data_preprocessed[match_type_codes == code][:datasize_per_matchtype]

    data_solo = first_rows_of(SOLO_CODE)
    data_duo = first_rows_of(DUO_CODE)
    data_squad = first_rows_of(SQUAD_CODE)

    data_all = np.concatenate((data_solo, data_duo, data_squad))

    def split_xy(block):
        # Last column is the target, everything before it is a feature.
        return block[:, :-1], block[:, -1]

    X_all, y_all = split_xy(data_all)
    X_solo, y_solo = split_xy(data_solo)
    X_duo, y_duo = split_xy(data_duo)
    X_squad, y_squad = split_xy(data_squad)

    return X_all, y_all, X_solo, y_solo, X_duo, y_duo, X_squad, y_squad

In [37]:
#Common interface to calculate metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
def cal_metrics(y_predict,y_label,model_name):
    """Compute, print and return the regression metrics for one model.

    Parameters
    ----------
    y_predict : predicted target values.
    y_label : true target values.
    model_name : text shown in the report header.

    Returns
    -------
    Tuple ``(mae, mse, rscore)``: mean absolute error, mean squared error and
    R^2 score of ``y_predict`` against ``y_label``.
    """
    # The scikit-learn metric functions take the ground truth first.
    scores = (
        mean_absolute_error(y_label, y_predict),
        mean_squared_error(y_label, y_predict),
        r2_score(y_label, y_predict),
    )

    print("Below are metrics for {}".format(model_name))
    report_lines = ("MAE: {}", "MSE: {}", "R-Score: {}")
    for line_template, score in zip(report_lines, scores):
        print(line_template.format(score))

    return scores

In [44]:
#Baseline Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
def train_test_lin_reg(X_all, y_all, X_solo, y_solo, X_duo, y_duo, X_squad, y_squad):
    """Train and score the baseline linear regression on four datasets.

    A separate ``LinearRegression`` is fitted on the mixed data and on each of
    the solo, duo and squad subsets. Each dataset is split 80/20 with
    ``random_state=42``; the model is fitted on the training part and its
    predictions on the held-out part are reported through ``cal_metrics``.
    Finally the MAE, MSE and R-score of the three per-type models (the mixed
    model is excluded) are averaged and printed. Nothing is returned.
    """

    def fit_and_report(features, targets, model_name):
        # One split -> fit -> predict -> report cycle for a single dataset.
        feat_train, feat_test, target_train, target_test = train_test_split(
            features, targets, test_size=0.2, random_state=42
        )
        regressor = LinearRegression().fit(feat_train, target_train)
        return cal_metrics(regressor.predict(feat_test), target_test, model_name)

    # The mixed model is only reported; its scores do not enter the average.
    fit_and_report(X_all, y_all, "Linear Regression Model of mixed match type")

    solo_scores = fit_and_report(X_solo, y_solo, "Linear Regression Model of solo match type")
    duo_scores = fit_and_report(X_duo, y_duo, "Linear Regression Model of duo match type")
    squad_scores = fit_and_report(X_squad, y_squad, "Linear Regression Model of squad match type")

    # Metric-wise mean over the three sub-models, in (MAE, MSE, R-score) order.
    average_MAE, average_MSE, average_rscore = (
        (solo + duo + squad) / 3
        for solo, duo, squad in zip(solo_scores, duo_scores, squad_scores)
    )
    print("Average metrics over 3 sub-models is as follow")
    print("Average MAE:{}, Average MSE:{}, Average R-Score:{}".format(average_MAE,average_MSE,average_rscore))

In [39]:
#Random forest. Please implement and tune
#def train_test_random_forest(X_all, y_all, X_solo, y_solo, X_duo, y_duo, X_squad, y_squad):
    

In [40]:
#Neural network. Please implement and tune
#def train_test_nn(X_all, y_all, X_solo, y_solo, X_duo, y_duo, X_squad, y_squad):

In [41]:
#Other models

In [42]:
#Model comparison and draw diagram

In [None]:
if __name__ == '__main__':
    # End-to-end driver: load the CSV, print a summary of it, preprocess it,
    # build the per-match-type subsets, then train and score the baseline.

    # Load data (path is relative to the current working directory)
    FILE_LOCATION = 'train_V2.csv'
    data_df = load_df(FILE_LOCATION)
    
    # Print the exploratory summary of the raw data
    evaluate(data_df)
    
    # Preprocess the data into a numeric numpy array
    data_preprocessed = preprocess(data_df)

    # Debug aid: inspect the encoded match type column
    #print(data_preprocessed[:,12])
    
    # Partition the data; at most DATASET_SIZE_PER_TYPE rows are kept per match type
    DATASET_SIZE_PER_TYPE = 2000
    X_all, y_all, X_solo, y_solo, X_duo, y_duo, X_squad, y_squad = partition(data_preprocessed,DATASET_SIZE_PER_TYPE)
    # Debug aid: check the shapes of all partitions
    #print(X_all.shape, y_all.shape, X_solo.shape, y_solo.shape, X_duo.shape, y_duo.shape, X_squad.shape, y_squad.shape)
    
    # Train and test the baseline LinearRegression
    train_test_lin_reg(X_all, y_all, X_solo, y_solo, X_duo, y_duo, X_squad, y_squad)
    
    # Train and test random forest (not implemented yet)
    #train_test_random_forest(X_all, y_all, X_solo, y_solo, X_duo, y_duo, X_squad, y_squad)
    
    # Train and test neural network (not implemented yet)
    #train_test_nn(X_all, y_all, X_solo, y_solo, X_duo, y_duo, X_squad, y_squad)
    
    # TODO: draw the model comparison graph
    

The dimension of data is (4446966, 29)
All the columns are listed as follow
['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints', 'winPlacePerc']
The type of each column is:
Id                  object
groupId             object
matchId             object
assists              int64
boosts               int64
damageDealt        float64
DBNOs                int64
headshotKills        int64
heals                int64
killPlace            int64
killPoints           int64
kills                int64
killStreaks          int64
longestKill        float64
matchDuration        int64
matchType           object
maxPlace             int64
numGroups            int64
rankPoints           in