# Balance training data using Synthetic Sampling

In [13]:
from collections import Counter

import numpy as np
import pandas as pd
from joblib import dump
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the full, raw, labelled training set from disk.

df = pd.read_csv(filepath_or_buffer='../data/raw/2022_train.csv')

In [3]:
# Work on an independent deep copy so the raw frame `df` stays untouched.

df_cleaned = df.copy(deep=True)

In [4]:
# Drop the Id column.
# The result is rebuilt from the untouched raw frame `df` rather than mutated
# in place, so re-running this cell no longer raises KeyError on a column that
# an earlier run already removed.

df_cleaned = df.drop(columns=['Id'])

In [5]:
# Show the remaining column names as a plain Python list.
df_cleaned.columns.tolist()

['GP',
 'MIN',
 'PTS',
 'FGM',
 'FGA',
 'FG%',
 '3P Made',
 '3PA',
 '3P%',
 'FTM',
 'FTA',
 'FT%',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'TARGET_5Yrs']

In [6]:
# Separate the label column from the feature frame.
# `DataFrame.pop` mutates the frame, so a second run of the cell failed with
# KeyError. The label is read from the raw frame `df` (same rows and index as
# `df_cleaned`, which is derived from it) and the column is dropped without
# in-place mutation, making the cell safe to re-run.
TARGET_5Yrs = df['TARGET_5Yrs'].copy()
df_cleaned = df_cleaned.drop(columns=['TARGET_5Yrs'], errors='ignore')

In [7]:
# Hold out 20% of the rows as the test set; the seed is fixed for repeatability.
X_data, X_test, y_data, y_test = train_test_split(
    df_cleaned,
    TARGET_5Yrs,
    test_size=0.2,
    random_state=8,
)

In [11]:
# Summarise each split. `.info()` writes its report itself and returns None,
# so it is called bare; wrapping it in print() only appended a stray "None"
# line after every summary.
X_data.info()
y_data.info()
X_test.info()
y_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6400 entries, 3617 to 4547
Data columns (total 19 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   GP       6400 non-null   int64  
 1   MIN      6400 non-null   float64
 2   PTS      6400 non-null   float64
 3   FGM      6400 non-null   float64
 4   FGA      6400 non-null   float64
 5   FG%      6400 non-null   float64
 6   3P Made  6400 non-null   float64
 7   3PA      6400 non-null   float64
 8   3P%      6400 non-null   float64
 9   FTM      6400 non-null   float64
 10  FTA      6400 non-null   float64
 11  FT%      6400 non-null   float64
 12  OREB     6400 non-null   float64
 13  DREB     6400 non-null   float64
 14  REB      6400 non-null   float64
 15  AST      6400 non-null   float64
 16  STL      6400 non-null   float64
 17  BLK      6400 non-null   float64
 18  TOV      6400 non-null   float64
dtypes: float64(18), int64(1)
memory usage: 1000.0 KB
None
<class 'pandas.core.series.Series

In [16]:
# Class counts of the training and test labels, one Counter per line.
print(Counter(y_data), Counter(y_test), sep='\n')

Counter({1: 5326, 0: 1074})
Counter({1: 1343, 0: 257})


# Balance training dataset by SMOTE

In [19]:
from imblearn.over_sampling import SMOTE

In [20]:
# SMOTE oversampler with a fixed seed; 'auto' is the library default strategy,
# spelled out here for readability.
sm = SMOTE(sampling_strategy='auto', random_state=8)

In [21]:
# Resample the training split only; the held-out test split is not passed in.
X_SMOTE, y_SMOTE = sm.fit_resample(X=X_data, y=y_data)

In [24]:
# Inspect the resampled features and the resulting class counts.
# `.info()` prints its own report and returns None, so it is called bare;
# wrapping it in print() only appended a stray "None" line.
X_SMOTE.info()
print(Counter(y_SMOTE))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10652 entries, 0 to 10651
Data columns (total 19 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   GP       10652 non-null  int64  
 1   MIN      10652 non-null  float64
 2   PTS      10652 non-null  float64
 3   FGM      10652 non-null  float64
 4   FGA      10652 non-null  float64
 5   FG%      10652 non-null  float64
 6   3P Made  10652 non-null  float64
 7   3PA      10652 non-null  float64
 8   3P%      10652 non-null  float64
 9   FTM      10652 non-null  float64
 10  FTA      10652 non-null  float64
 11  FT%      10652 non-null  float64
 12  OREB     10652 non-null  float64
 13  DREB     10652 non-null  float64
 14  REB      10652 non-null  float64
 15  AST      10652 non-null  float64
 16  STL      10652 non-null  float64
 17  BLK      10652 non-null  float64
 18  TOV      10652 non-null  float64
dtypes: float64(18), int64(1)
memory usage: 1.5 MB
None
Counter({1: 5326, 0: 5326})


In [25]:
# Persist the SMOTE-balanced training set and the untouched test set.
# np.save appends the ".npy" extension to each path.
for _path, _array in (
    ('../data/processed/X_SMOTE', X_SMOTE),
    ('../data/processed/y_SMOTE', y_SMOTE),
    ('../data/processed/X_test_SMOTE', X_test),
    ('../data/processed/y_test_SMOTE', y_test),
):
    np.save(_path, _array)

In [27]:
from joblib import dump
from joblib import load

In [14]:
# Fit a feature scaler and persist it for reuse.
# No visible cell creates `scaler`, so this cell raised NameError on a fresh
# kernel (Restart & Run All).
# NOTE(review): fitting on the training split X_data is an assumption made to
# keep test rows out of the scaling statistics — confirm it matches the
# preprocessing the downstream models expect.
scaler = StandardScaler().fit(X_data)
dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

In [30]:
# Read the persisted scaler back from disk.
# joblib.load unpickles the file, so only load artefacts from a trusted source.
scaler_load = load(filename='../models/scaler.joblib')

In [15]:
from sklearn.model_selection import train_test_split

In [17]:
# Same 80/20 split with the same seed as the earlier split cell, so the
# names are rebound to an identical partition.
X_data, X_test, y_data, y_test = train_test_split(
    df_cleaned,
    TARGET_5Yrs,
    test_size=0.2,
    random_state=8,
)

In [18]:
# Carve a validation set (20% of the non-test rows) out of the training data.
X_train, X_val, y_train, y_val = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=8,
)

In [19]:
# Persist the train / validation / test splits.
# np.save appends the ".npy" extension to each path.
for _path, _array in (
    ('../data/processed/X_train', X_train),
    ('../data/processed/X_val', X_val),
    ('../data/processed/X_test', X_test),
    ('../data/processed/y_train', y_train),
    ('../data/processed/y_val', y_val),
    ('../data/processed/y_test', y_test),
):
    np.save(_path, _array)

# Upsample raw data to balance with respect to TARGET_5Yrs variable.
Follow this tutorial:
https://wellsr.com/python/upsampling-and-downsampling-imbalanced-data-in-python/

In [9]:
# Import required packages

from sklearn.utils import resample

In [29]:
# Take an independent deep copy of the raw frame for the upsampling work.

df_upsample = df.copy(deep=True)

In [30]:
# Number of rows in each TARGET_5Yrs category.

df_upsample.groupby(by='TARGET_5Yrs').size()

TARGET_5Yrs
0    1331
1    6669
dtype: int64

In [12]:
# Partition the raw data into one frame per TARGET_5Yrs value (0 and 1).

TARGET_0 = df_upsample.loc[df_upsample["TARGET_5Yrs"].eq(0)]
TARGET_1 = df_upsample.loc[df_upsample["TARGET_5Yrs"].eq(1)]
print(TARGET_0.shape, TARGET_1.shape, sep="\n")

NameError: name 'df_upsample' is not defined

In [32]:
# Draw TARGET_0 rows with replacement until there are as many as TARGET_1 rows.

TARGET_0_ups = resample(
    TARGET_0,
    replace=True,
    n_samples=TARGET_1.shape[0],
    random_state=8,
)

print(TARGET_0_ups.shape)

(6669, 21)


In [33]:
# Stack the TARGET_1 rows and the upsampled TARGET_0 rows into one frame,
# then report its shape and per-class row counts.
df_upsample = pd.concat(objs=[TARGET_1, TARGET_0_ups])

print(df_upsample.shape, df_upsample.groupby('TARGET_5Yrs').size(), sep='\n')

(13338, 21)
TARGET_5Yrs
0    6669
1    6669
dtype: int64


In [34]:
# Pull out the label, standardise the remaining feature columns and persist
# the fitted scaler. After this cell `df_upsample` is the scaled array returned
# by fit_transform, no longer a DataFrame.
# NOTE(review): the scaler is fitted on every upsampled row before the later
# train/test split, so test rows contribute to the scaling statistics.
TARGET_UPS = df_upsample['TARGET_5Yrs']

scaler_ups = StandardScaler()

df_upsample = scaler_ups.fit_transform(
    df_upsample.drop(columns=['Id', 'TARGET_5Yrs'])
)

dump(scaler_ups, '../models/scaler_ups.joblib')

['../models/scaler_ups.joblib']

In [37]:
# 80/20 split of the scaled, upsampled data with a fixed seed.
# NOTE(review): upsampling with replacement happens before this split, so
# copies of the same original row can land in both the train and test parts.
X_ups_data, X_ups_test, y_ups_data, y_ups_test = train_test_split(
    df_upsample,
    TARGET_UPS,
    test_size=0.2,
    random_state=8,
)

In [39]:
# Persist the upsampled train/test arrays.
# np.save appends the ".npy" extension to each path.
for _path, _array in (
    ('../data/processed/X_ups_data', X_ups_data),
    ('../data/processed/X_ups_test', X_ups_test),
    ('../data/processed/y_ups_data', y_ups_data),
    ('../data/processed/y_ups_test', y_ups_test),
):
    np.save(_path, _array)