In [1]:
import numpy as np
import pandas as pd
import gc, sys
# Make sure the cyclic garbage collector is switched on before the heavy
# DataFrame work below.
gc.enable()

# Let pandas display up to 500 columns when a wide frame is shown.
pd.options.display.max_columns = 500

# Relative paths of the training and test CSV files read by the notebook.
train_dir = '../data/train_V2.csv'
test_dir = '../data/test_V2.csv'

In [2]:
def feature_engineering(is_train=True, debug=True, m_type='squad-fpp'):
    """Build the feature matrix for one match type.

    Reads the train or test CSV (module-level ``train_dir`` / ``test_dir``),
    keeps only rows whose ``matchType`` equals ``m_type``, adds ratio/sum
    features per player, and then aggregates them per (matchId, groupId)
    (mean / max / min plus the within-match percentile rank of each), per
    group size, per match mean and per match size.

    Parameters
    ----------
    is_train : bool
        True reads ``train_dir``, drops rows with ``maxPlace <= 1`` and
        returns one row per (matchId, groupId) together with the group-mean
        target. False reads ``test_dir`` and returns one row per player, in
        the same order as ``test_idx``.
    debug : bool
        Kept for backward compatibility. The original code read the full
        training file in both branches, so this flag has no effect.
    m_type : str
        Value of the ``matchType`` column to keep.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix (key columns removed).
    y : numpy.ndarray or None
        Mean ``winPlacePerc`` per (matchId, groupId) for training data,
        ordered like the rows of ``X``; None for test data.
    feature_names : list of str
        Column names of ``X``.
    test_idx : numpy.ndarray or None
        ``Id`` values of the kept test rows; None for training data.
    """
    group_keys = ['matchId', 'groupId']
    target = 'winPlacePerc'
    test_idx = None

    if is_train:
        print("processing train.csv")
        # Both `debug` settings read the complete file (see docstring).
        df = pd.read_csv(train_dir)
        df = df[df['maxPlace'] > 1]
    else:
        print("processing test.csv")
        df = pd.read_csv(test_dir)

    # Explicit copy: the columns added below must land on an independent
    # frame, not on a filtered view (avoids SettingWithCopyWarning).
    df = df[df['matchType'] == m_type].copy()

    if not is_train:
        test_idx = df['Id'].values

    # Per-player derived features. Divisions by zero yield inf/NaN here and
    # are cleaned up right after.
    df['headshotrate'] = df['kills'] / df['headshotKills']
    df['killStreakrate'] = df['killStreaks'] / df['kills']
    df['healthitems'] = df['heals'] + df['boosts']
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['headshotKills_over_kills'] = df['headshotKills'] / df['kills']
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['killsPerWalkDistance'] = df['kills'] / df['walkDistance']
    df["skill"] = df["headshotKills"] + df["roadKills"]

    # Map +/-inf to NaN and then every NaN to 0. The lowercase np.inf /
    # np.nan spellings are used because np.Inf, np.NINF and np.NaN no longer
    # exist in NumPy 2.0.
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    key_columns = ("Id", "matchId", "groupId", "matchType")
    features = [col for col in df.columns if col not in key_columns]

    y = None
    if is_train:
        y = np.array(df.groupby(group_keys)[target].agg('mean'), dtype=np.float64)
        features.remove(target)

    # groupby sorts by key, so these rows are in the same order as `y`.
    group_sizes = df.groupby(group_keys).size().reset_index(name='group_size')

    if is_train:
        # One output row per group.
        df_out = group_sizes[group_keys]
    else:
        # One output row per player so predictions line up with `test_idx`.
        df_out = df[group_keys]

    # Group-level statistic and its percentile rank among the groups of the
    # same match, for mean, max and min in turn.
    for stat in ('mean', 'max', 'min'):
        group_stat = df.groupby(group_keys)[features].agg(stat)
        group_rank = group_stat.groupby('matchId')[features].rank(pct=True).reset_index()
        # No name clash on this merge: the columns keep their raw names ...
        df_out = df_out.merge(group_stat.reset_index(), how='left', on=group_keys)
        # ... until the rank columns arrive and both sides receive a suffix.
        df_out = df_out.merge(group_rank, suffixes=('_' + stat, '_' + stat + '_rank'),
                              how='left', on=group_keys)

    df_out = df_out.merge(group_sizes, how='left', on=group_keys)

    # Per-match mean of every feature. All earlier columns already carry a
    # suffix, so nothing overlaps and these columns keep the raw feature
    # names (the "_match_mean" suffix is never applied). Left as is so the
    # returned column names stay unchanged.
    match_mean = df.groupby(['matchId'])[features].agg('mean').reset_index()
    df_out = df_out.merge(match_mean, suffixes=("", "_match_mean"), how='left', on=['matchId'])

    match_sizes = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(match_sizes, how='left', on=['matchId'])

    X = df_out.drop(columns=group_keys)
    feature_names = list(X.columns)

    # Release the large intermediates before returning.
    del df, df_out, group_sizes, group_stat, group_rank, match_mean, match_sizes
    gc.collect()

    print("have processed")
    return X, y, feature_names, test_idx

In [3]:
# matchType values iterated by the training loop, in processing order.
types = [
    'crashfpp', 'crashtpp',
    'flarefpp', 'flaretpp',
    'normal-duo', 'normal-duo-fpp',
    'normal-solo', 'normal-solo-fpp',
    'normal-squad', 'normal-squad-fpp',
    'solo', 'solo-fpp',
    'duo', 'duo-fpp',
    'squad', 'squad-fpp',
]

In [4]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
import time
# Train one MLP regressor per match type and collect the test predictions
# into a single submission file.
scaler = StandardScaler()

idxs = []
preds = []

for m_type in types:
    t0 = time.time()
    print("match type：{}".format(m_type))
    # Load the data for this match type
    x_train, y_train, train_columns, _ = feature_engineering(True, True, m_type)
    x_test, _, _, test_idx = feature_engineering(False, True, m_type)

    print('number train: {}'.format(len(x_train)))

    # Nothing to predict for this match type: skip the training entirely
    # (the scaler would also fail on an empty matrix).
    if len(x_test) == 0:
        print('no test rows for this match type, skipping')
        continue

    # Standardise. The scaler is fitted on the training data only and the
    # same mean/variance is applied to the test data; re-fitting on the test
    # set would feed the model features on a different scale than it was
    # trained on.
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Train
    clf = MLPRegressor(hidden_layer_sizes=(256, 128, 64, 32, ), activation='relu',
                       solver='adam', alpha=0.0001, batch_size=128, learning_rate='constant',
                       learning_rate_init=0.001, max_iter=500, shuffle=True, random_state=42,
                       verbose=True, early_stopping=True, validation_fraction=0.2,
                       n_iter_no_change=10)
    clf.fit(x_train, y_train)

    # Predict, clamping the outputs to the [0, 1] range
    y_pred = np.clip(clf.predict(x_test), 0, 1)

    # Collect the results
    idxs.extend(test_idx)
    preds.extend(y_pred)

    print('elapsed: {:.1f}s'.format(time.time() - t0))

submission = pd.DataFrame({'Id': idxs, 'winPlacePerc': preds})
submission.to_csv('submission.csv', index=False)

match type：crashfpp
processing train.csv


KeyboardInterrupt: 