# MLSMOTE

Python code implementing the [MLSMOTE](https://www.sciencedirect.com/science/article/abs/pii/S0950705115002737) algorithm was available here: https://github.com/niteshsukhwani/MLSMOTE. However, that code had a bug and did not use pandas efficiently. I fixed and modified it, and the result is below.

**If you find this notebook useful, please don't forget to upvote.**

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors

In [2]:
# Generate sample data
def create_dataset(n_sample=1000):
    """
    Create an unevenly distributed sample data set for multilabel
    classification using the make_classification function.

    args
    n_sample: int, number of samples requested from make_classification

    return
    X: pandas.DataFrame, feature vector dataframe with 10 features
    y: pandas.DataFrame, target vector dataframe with 5 one-hot label columns
       named class_0 ... class_4
    """
    # NOTE(review): the weights below sum to more than 1 -- confirm that the
    # resulting class proportions are the ones intended.
    X, y = make_classification(n_classes=5, class_sep=2,
                               weights=[0.1, 0.025, 0.205, 0.008, 0.9], n_informative=3, n_redundant=1, flip_y=0,
                               n_features=10, n_clusters_per_class=1, n_samples=n_sample, random_state=10)
    # Turn the integer class vector into one indicator column per class.
    y = pd.get_dummies(y, prefix='class')
    return pd.DataFrame(X), y

def get_tail_label(df: pd.DataFrame, ql=(0.05, 1.)) -> list:
    """
    Find the underrepresented targets.

    The per-label counts (column sums of df) are treated as a distribution.
    Labels whose count is not strictly between the ql[0] and ql[1] quantiles
    of that distribution are filtered out first. For the remaining labels an
    imbalance ratio (largest remaining count / label count) is computed, and
    the labels whose ratio is above the median ratio are returned.

    args
    df: pandas.DataFrame, target dataframe with one column per label
    ql: pair of floats, lower and upper quantile limits (default: (0.05, 1.))

    return
    tail_label: list, column names of the underrepresented labels
    """
    label_counts = df.sum(axis=0)
    lower = label_counts.quantile(ql[0])
    upper = label_counts.quantile(ql[1])
    # Both bounds are exclusive, so labels sitting exactly on a limit are dropped.
    kept = label_counts[(label_counts > lower) & (label_counts < upper)]
    # Imbalance ratio per label: the rarer the label, the larger the ratio.
    irlbl = kept.max() / kept
    threshold_irlbl = irlbl.median()
    return irlbl[irlbl > threshold_irlbl].index.tolist()

def get_minority_samples(X: pd.DataFrame, y: pd.DataFrame, ql=(0.05, 1.)):
    """
    Select the rows that carry at least one tail label.

    Calls get_tail_label on y, then keeps every row whose value is 1 in at
    least one of the returned label columns. Both returned frames have their
    index reset to 0..n-1.

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe
    ql: pair of floats, quantile limits forwarded to get_tail_label

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    """
    tail_labels = get_tail_label(y, ql=ql)
    # Vectorised row mask: True where any tail-label column equals 1.
    has_tail_label = (y[tail_labels] == 1).any(axis=1)
    index = y.index[has_tail_label]

    X_sub = X[X.index.isin(index)].reset_index(drop=True)
    y_sub = y[y.index.isin(index)].reset_index(drop=True)
    return X_sub, y_sub

def nearest_neighbour(X: pd.DataFrame, neigh) -> np.ndarray:
    """
    Give the indices of the `neigh` nearest neighbours of every instance.

    Fits scikit-learn's NearestNeighbors (euclidean metric, kd_tree
    algorithm) on X and queries it with the same X.

    args
    X: pandas.DataFrame (or array-like), points whose nearest neighbours
       are to be found
    neigh: int, number of neighbours returned per point

    return
    indices: array with one row per element of X holding the positional
             indices of its `neigh` nearest neighbours
    """
    nbs = NearestNeighbors(n_neighbors=neigh, metric='euclidean', algorithm='kd_tree').fit(X)
    # Only the indices are needed, so the distances are not requested.
    # NOTE(review): because X is queried against itself, the first column is
    # presumably each point's own position (distance 0) -- confirm for data
    # containing duplicate rows.
    return nbs.kneighbors(X, return_distance=False)

def MLSMOTE(X, y, n_sample, neigh=5):
    """
    Give the augmented data using the MLSMOTE algorithm.

    For every synthetic sample a reference row is drawn at random, one of
    its nearest neighbours is drawn at random, and the new feature vector is
    placed at a random position on the segment between the two. The new
    label vector has a 1 for every label present in the reference row's
    neighbourhood (the row itself plus its neighbours).

    args
    X: pandas.DataFrame, input feature vector DataFrame
    y: pandas.DataFrame, target vector DataFrame, row-aligned with X
    n_sample: int, number of newly generated samples
    neigh: int, number of nearest neighbours looked up per row (the first
           neighbour slot is skipped when choosing, so it must be >= 2)

    return
    new_X: pandas.DataFrame, augmented feature vector data
    target: pandas.DataFrame, augmented target vector data

    raises
    ValueError: if neigh is smaller than 2
    """
    if neigh < 2:
        raise ValueError("neigh must be at least 2 so that a neighbour other than the reference exists")
    indices2 = nearest_neighbour(X, neigh=neigh)
    n = len(indices2)
    # kneighbors returns positions, not index labels, so rows are addressed
    # positionally; this stays correct when X / y do not have a 0..n-1 index.
    features = X.to_numpy()
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0, n - 1)
        # Column 0 is skipped when choosing the interpolation partner.
        neighbor = random.choice(indices2[reference, 1:])
        all_point = indices2[reference]
        # A label is switched on when any row of the neighbourhood carries it.
        ser = y.iloc[all_point].sum(axis=0, skipna=True)
        target[i] = (ser > 0).to_numpy()
        ratio = random.random()
        # Interpolate from the reference TOWARDS the neighbour, so the new
        # point lies between the two rather than beyond the reference.
        gap = features[neighbor] - features[reference]
        new_X[i] = features[reference] + ratio * gap
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target

In [3]:
# Build the toy dataset, keep only the rows that carry a tail label, then
# generate 100 synthetic samples from that minority subset (neigh=5).
X, y = create_dataset()  # Creating the DataFrames
X_sub, y_sub = get_minority_samples(X, y)  # Getting the minority samples of those DataFrames
X_res, y_res = MLSMOTE(X_sub, y_sub, 100, 5)  # Applying MLSMOTE to generate the synthetic samples

In [4]:
# Preview the first rows of the synthetic target frame returned by MLSMOTE.
y_res.head()

Unnamed: 0,class_0,class_1,class_2,class_3,class_4
0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0


In [5]:
# Preview the first rows of the synthetic feature frame returned by MLSMOTE.
X_res.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.968825,-0.357867,2.154377,0.12635,1.329537,1.519442,1.01656,-1.050995,-0.563463,0.361751
1,-2.530707,-1.121641,1.848724,0.493394,-1.682661,-0.301992,-0.300126,-1.171308,-1.704773,1.578272
2,1.157707,-1.187426,2.411537,-2.516072,0.371667,1.704293,1.348147,-1.358077,-2.327619,2.168738
3,-1.481997,-0.48485,1.964237,0.088615,-1.071162,-0.181178,-0.485371,-1.492589,-1.912194,1.779095
4,-2.174688,-3.754205,1.530664,0.500295,2.816754,-0.409808,0.123382,-4.387697,-4.608847,4.595554


In [6]:
# Display the full feature frame produced by create_dataset.
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.700263,0.602477,-2.478509,0.433484,1.109780,-0.246891,0.525389,-1.902413,1.991422,-1.848972
1,2.267874,-0.350847,2.187252,-0.769722,0.601022,-0.234049,-1.844444,2.092082,2.299990,-2.585919
2,-1.207028,0.230167,-2.035403,-0.401339,0.595955,2.066251,-2.311038,-1.804673,2.197500,-2.106087
3,-1.326205,-1.417870,2.085587,-0.453313,0.457689,0.868153,0.784653,2.348514,2.546447,-2.829462
4,-0.781795,1.495530,-2.194913,-0.264658,0.906237,-0.633937,-0.907535,-2.706511,1.255595,-1.120307
...,...,...,...,...,...,...,...,...,...,...
995,-2.287050,-0.298746,1.586469,-0.464742,1.103363,1.050799,0.960210,-2.157864,2.600376,-2.887021
996,-0.027245,1.842409,1.793464,0.013021,0.980886,-0.037242,-0.309943,-2.104098,-2.168505,2.055803
997,0.833553,-0.130508,-1.954786,-0.074340,-0.410230,0.837448,-1.627014,-1.333642,2.935449,-2.876682
998,-1.088050,-0.213819,-2.207270,-1.106803,-1.096291,0.722919,0.833781,-2.433188,1.347741,-1.211900


In [7]:
# Display the full one-hot target frame produced by create_dataset.
y

Unnamed: 0,class_0,class_1,class_2,class_3,class_4
0,0,0,0,0,1
1,0,0,1,0,0
2,0,0,0,0,1
3,0,0,1,0,0
4,0,0,0,0,1
...,...,...,...,...,...
995,1,0,0,0,0
996,0,1,0,0,0
997,0,0,0,0,1
998,0,0,0,0,1


In [9]:
# Show (rows, columns) of the minority feature subset from get_minority_samples.
X_sub.shape

(25, 10)

In [11]:
# Display the full synthetic feature frame returned by MLSMOTE.
X_res

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.968825,-0.357867,2.154377,0.126350,1.329537,1.519442,1.016560,-1.050995,-0.563463,0.361751
1,-2.530707,-1.121641,1.848724,0.493394,-1.682661,-0.301992,-0.300126,-1.171308,-1.704773,1.578272
2,1.157707,-1.187426,2.411537,-2.516072,0.371667,1.704293,1.348147,-1.358077,-2.327619,2.168738
3,-1.481997,-0.484850,1.964237,0.088615,-1.071162,-0.181178,-0.485371,-1.492589,-1.912194,1.779095
4,-2.174688,-3.754205,1.530664,0.500295,2.816754,-0.409808,0.123382,-4.387697,-4.608847,4.595554
...,...,...,...,...,...,...,...,...,...,...
95,0.317542,1.001988,2.239175,-0.033305,-1.205573,1.502037,1.369951,-2.057153,-2.715552,2.581587
96,-2.392741,-0.899525,1.346543,-2.380844,0.057672,2.416931,-0.079693,-1.737105,-1.720925,1.638307
97,0.361624,-0.564823,2.398153,1.253172,0.562420,-0.511692,0.293064,-1.774424,-2.363050,2.202243
98,1.574557,1.524844,2.650075,0.032259,-2.524918,-2.807256,0.521508,1.037684,-0.838129,0.622136
