In [None]:
import gc
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import random
random.seed(1337)
np.random.seed(1337)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Parts of this loading approach are credited to: https://www.kaggle.com/rejasupotaro/effective-feature-engineering
def load_train(path='../input/train_V2.csv'):
    """Read the training CSV and drop every match that has a missing target.

    Parameters
    ----------
    path : str, optional
        Location of the training CSV. Defaults to the path this notebook has
        always used, so existing ``load_train()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        All rows whose ``matchId`` does not belong to a match containing a
        null ``winPlacePerc``. The row index from the CSV is kept, so it may
        have gaps where rows were removed.
    """
    gc.collect()
    df = pd.read_csv(path)
    # A single null target removes the whole match, not just the offending row.
    invalid_match_ids = df.loc[df['winPlacePerc'].isna(), 'matchId'].unique()
    # `~` is the boolean NOT for masks; unary minus on boolean arrays is
    # rejected by NumPy and only works on a Series through a pandas special case.
    return df[~df['matchId'].isin(invalid_match_ids)]

def load_test(path='../input/test_V2.csv'):
    """Read the test CSV into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the test CSV. Defaults to the path this notebook has
        always used, so existing ``load_test()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The file contents, unfiltered.
    """
    # Free whatever earlier cells left behind before reading a large file.
    gc.collect()
    return pd.read_csv(path)

In [None]:
# Read both CSV files once; the feature-engineering cells below add columns to these frames.
df_train = load_train()
df_test = load_test()

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Each NumPy integer, unsigned-integer or float column is converted to the
    narrowest dtype of the same kind whose representable range contains the
    column's minimum and maximum. A column is never widened, and every other
    column (object/string, bool, categorical, datetime, extension dtypes) is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, optional
        When true, print the memory usage before and after. Off by default.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    The float branch only checks the value *range*. float16 keeps roughly
    three significant decimal digits, so values cast to it are rounded.
    """
    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes per NumPy dtype kind, narrowest first. The widest
    # dtype of each kind is omitted: converting to it could never save memory.
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': (np.float16, np.float32),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are handled. In particular
        # bool and unsigned columns must not fall through to the float branch,
        # which would widen them instead of shrinking them.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        is_float = col_type.kind == 'f'

        for candidate in candidates_by_kind[col_type.kind]:
            # Stop once the candidates are no narrower than the current dtype.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = np.finfo(candidate) if is_float else np.iinfo(candidate)
            # Inclusive bounds: a value equal to the dtype's limit still fits.
            # NaN or infinite extrema fail both comparisons, leaving the
            # column at its current dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# playersJoined: for every row, the number of rows (with a non-null matchId) that share its matchId.
# transform('count') broadcasts the per-match count back onto each row of that match.
df_train['playersJoined'] = df_train.groupby('matchId')['matchId'].transform('count')
df_test['playersJoined'] = df_test.groupby('matchId')['matchId'].transform('count')

In [None]:
# Weight that grows as fewer players joined the match: 1.0 at 100 players, 1.5 at 50, and so on.
train_scale = (100 - df_train['playersJoined']) / 100 + 1
# Add a '<column>Norm' copy of each listed column, multiplied by that weight.
for source_col in ('kills', 'damageDealt', 'maxPlace', 'matchDuration'):
    df_train[source_col + 'Norm'] = df_train[source_col] * train_scale
# Downcast only after the new columns exist so they are shrunk as well.
df_train = reduce_mem_usage(df_train)
df_train.head()

In [None]:
# Same match-size weighting as applied to the training frame: 1.0 at 100 players, 1.5 at 50.
test_scale = (100 - df_test['playersJoined']) / 100 + 1
# Add a '<column>Norm' copy of each listed column, multiplied by that weight.
for source_col in ('kills', 'damageDealt', 'maxPlace', 'matchDuration'):
    df_test[source_col + 'Norm'] = df_test[source_col] * test_scale
# Downcast only after the new columns exist so they are shrunk as well.
df_test = reduce_mem_usage(df_test)
df_test.head()

In [None]:
# Training feature frame: every column except Id / groupId / matchId and the target.
target = 'winPlacePerc'
drop_cols = ['Id', 'groupId', 'matchId', target]
# Preserves the original column order; `select` is reused below to build X_test.
select = [x for x in df_train.columns if x not in drop_cols]
X_train = df_train.loc[:, select]
X_train.head()

In [None]:
# Apply the column list derived from df_train so the test frame carries the same feature columns.
X_test = df_test.loc[:, select]
X_test.head()

In [None]:
# One-hot encode the remaining non-numeric columns of the training features.
X_train = pd.get_dummies(X_train)
# Encoding the two frames independently yields different dummy columns whenever a
# category occurs in only one of them. Force the test frame onto the training
# layout: dummy columns missing from the test data are added as all-zero, and
# columns unknown to the training data are dropped. That way everything fitted
# on X_train sees the same columns, in the same order, for X_test.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Sanity check: print (rows, columns) for both encoded frames; the column counts should agree.
print(X_train.shape)
print(X_test.shape)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Two-component PCA, fitted on the training features only.
# random_state is pinned to the same value as the seeds set at the top of the
# notebook, so that if scikit-learn selects a randomized SVD solver the components
# are identical when this cell is re-run on its own.
pca2 = PCA(n_components=2, random_state=1337)
pca2.fit(X_train)

In [None]:
# Project both feature frames with the PCA that was fitted on X_train.
P2_train = pca2.transform(X_train)
P2_test = pca2.transform(X_test)

In [None]:
# Marker-only plot of the first 100,000 projected training rows (presumably capped to keep drawing fast).
train_points = P2_train[:100000]
fig, ax = plt.subplots()
ax.plot(train_points[:, 0], train_points[:, 1], linestyle='', marker='o', markersize=0.7)
ax.set_xlabel('Component 0')
ax.set_ylabel('Component 1')
ax.set_title('PCA w/ Two Components on Training Data')
plt.show()

In [None]:
# Marker-only plot of the first 100,000 projected test rows (presumably capped to keep drawing fast).
test_points = P2_test[:100000]
fig, ax = plt.subplots()
ax.plot(test_points[:, 0], test_points[:, 1], linestyle='', marker='o', markersize=0.7)
ax.set_xlabel('Component 0')
ax.set_ylabel('Component 1')
ax.set_title('PCA w/ Two Components on Testing Data')
plt.show()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit k-means with 3, 4 and 5 clusters on the two PCA components of the training data.
# KMeans initialisation is random. An explicit random_state (the same value as the
# seeds set at the top of the notebook) keeps the cluster assignments and their
# label numbering stable when this cell is re-run without restarting the kernel.
kms3 = KMeans(n_clusters=3, random_state=1337).fit(P2_train)
kms4 = KMeans(n_clusters=4, random_state=1337).fit(P2_train)
kms5 = KMeans(n_clusters=5, random_state=1337).fit(P2_train)

In [None]:
# First 100,000 projected training rows, coloured by their 3-cluster k-means label.
shown_points = P2_train[:100000]
fig, ax = plt.subplots()
ax.scatter(shown_points[:, 0], shown_points[:, 1], c=kms3.labels_[:100000])
ax.set_xlabel('Component 0')
ax.set_ylabel('Component 1')
ax.set_title('Three Clusters')
plt.show()

In [None]:
# First 100,000 projected training rows, coloured by their 4-cluster k-means label.
shown_points = P2_train[:100000]
fig, ax = plt.subplots()
ax.scatter(shown_points[:, 0], shown_points[:, 1], c=kms4.labels_[:100000])
ax.set_xlabel('Component 0')
ax.set_ylabel('Component 1')
ax.set_title('Four Clusters')
plt.show()

In [None]:
# First 100,000 projected training rows, coloured by their 5-cluster k-means label.
shown_points = P2_train[:100000]
fig, ax = plt.subplots()
ax.scatter(shown_points[:, 0], shown_points[:, 1], c=kms5.labels_[:100000])
ax.set_xlabel('Component 0')
ax.set_ylabel('Component 1')
ax.set_title('Five Clusters')
plt.show()

In [None]:
def cluster_features(df, model, pca):
    """One-hot encode the cluster that ``model`` assigns to each row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame in the layout ``pca`` expects.
    model : fitted clustering estimator
        Must provide ``predict`` and ``cluster_centers_`` (e.g. a fitted KMeans).
    pca : fitted transformer
        Projects ``df`` into the space ``model`` was fitted on.

    Returns
    -------
    pandas.DataFrame
        One column per cluster of the model, named ``cluster_0`` ...
        ``cluster_{k-1}``, one row per row of ``df``. The result has a fresh
        0..n-1 index; it does not carry over ``df.index``.
    """
    projected = pca.transform(df)
    labels = model.predict(projected)
    n_clusters = len(model.cluster_centers_)
    # Declare every cluster id as a category up front. Encoding only the labels
    # that happen to occur would drop the column of any cluster with no member
    # in this frame, so two frames could end up with different feature columns.
    membership = pd.Series(pd.Categorical(labels, categories=list(range(n_clusters))))
    one_hot = pd.get_dummies(membership)
    one_hot.columns = ['cluster_' + str(k) for k in range(n_clusters)]
    return one_hot
    
def centroid_features(df, model, pca):
    """Euclidean distance from every row of ``df`` to each cluster centre.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame in the layout ``pca`` expects.
    model : fitted clustering estimator
        Must provide ``cluster_centers_`` in the space produced by ``pca``.
    pca : fitted transformer
        Projects ``df`` into the space ``model`` was fitted on.

    Returns
    -------
    pandas.DataFrame
        One ``distance_<k>`` column per cluster centre, one row per row of
        ``df``, with a fresh 0..n-1 index.
    """
    projected = np.asarray(pca.transform(df))
    new_df = pd.DataFrame()
    # Subtract each centre from the full projection, so this works for any
    # number of components rather than only the first two.
    for cluster, center in enumerate(model.cluster_centers_):
        new_df['distance_{}'.format(cluster)] = np.linalg.norm(projected - np.asarray(center), axis=1)
    return new_df

In [None]:
def norm_features(df):
    """Add the match-size features to ``df`` in place, downcast it, and return it.

    Columns written: ``playersJoined`` (rows per ``matchId``) and a
    ``<column>Norm`` version of kills, damageDealt, maxPlace and
    matchDuration, each multiplied by ``(100 - playersJoined) / 100 + 1``.
    The frame is then passed through ``reduce_mem_usage``.
    """
    df['playersJoined'] = df.groupby('matchId')['matchId'].transform('count')
    # The weight is the same for all four columns: 1.0 at 100 players, larger for smaller matches.
    scale = (100 - df['playersJoined']) / 100 + 1
    for source_col in ('kills', 'damageDealt', 'maxPlace', 'matchDuration'):
        df[source_col + 'Norm'] = df[source_col] * scale
    return reduce_mem_usage(df)

def one_hot_encode(df):
    """Return a copy of ``df`` with ``matchType`` replaced by one indicator column per value.

    The input frame is not modified; all other columns are passed through.
    """
    categorical_columns = ['matchType']
    encoded = pd.get_dummies(df, columns=categorical_columns)
    return encoded

def remove_categories(df):
    """Return ``df`` without the Id, groupId, matchId, matchType and winPlacePerc columns.

    Columns from that list that are absent are simply ignored; the remaining
    columns keep their original order.
    """
    excluded = ('Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc')
    kept = list(filter(lambda name: name not in excluded, df.columns))
    return df.loc[:, kept]

In [None]:
def _kmeans_features(df, builder, model, prefix):
    """Shared pipeline behind the kmeans_* functions: preprocess, build features, prefix the columns.

    ``builder`` is ``cluster_features`` or ``centroid_features``. The projection
    always uses the notebook-level ``pca2``. Note that ``norm_features`` writes
    its columns into ``df`` itself, so the caller's frame is modified.
    """
    prepared = remove_categories(one_hot_encode(norm_features(df)))
    res = builder(prepared, model, pca2)
    res.columns = res.columns.map(lambda x: prefix + str(x))
    return res


def kmeans_5_clusters(df):
    """One-hot cluster membership under the 5-cluster model; columns prefixed ``kms5_``."""
    return _kmeans_features(df, cluster_features, kms5, 'kms5_')


def kmeans_5_centroids(df):
    """Distance to each centre of the 5-cluster model; columns prefixed ``kms5_``."""
    return _kmeans_features(df, centroid_features, kms5, 'kms5_')


def kmeans_4_clusters(df):
    """One-hot cluster membership under the 4-cluster model; columns prefixed ``kms4_``."""
    return _kmeans_features(df, cluster_features, kms4, 'kms4_')


def kmeans_4_centroids(df):
    """Distance to each centre of the 4-cluster model; columns prefixed ``kms4_``."""
    return _kmeans_features(df, centroid_features, kms4, 'kms4_')


def kmeans_3_clusters(df):
    """One-hot cluster membership under the 3-cluster model; columns prefixed ``kms3_``."""
    return _kmeans_features(df, cluster_features, kms3, 'kms3_')


def kmeans_3_centroids(df):
    """Distance to each centre of the 3-cluster model; columns prefixed ``kms3_``."""
    return _kmeans_features(df, centroid_features, kms3, 'kms3_')

In [None]:
# Preview the 5-cluster one-hot features for the test frame.
# The call goes through norm_features, which rewrites its feature columns on df_test in place.
kmeans_5_clusters(df_test).head()