# VD Dataset with Gradual Drift 

## 1. Necessary Imports 

In [8]:
import pandas as pd
import numpy as np

import scipy.stats as stats
from scipy.stats import norm
import random
from numpy.random import seed
from numpy.random import randn



import warnings
warnings.filterwarnings('ignore')

## 2. Non-Drifted Dataset 

#### To keep a consistent copy of the dataset, the data is generated and saved in this notebook. For other experiments, the saved data will be read.

In [None]:
import numpy as np
import pandas as pd
import math
from sklearn.utils import shuffle

# --- Class 1: binomial features ---------------------------------------------
# Binomial parameters: n trials per draw, success probability p.
n = 10
p = 0.5

# 9600 rows (300 batches of size 32) x 5 features, all labelled 1.
X1 = np.random.binomial(n, p, size=(9600, 5))
y1 = np.ones(9600)

# --- Class 0: logistic features ---------------------------------------------
# NOTE(review): `loc` is assigned but never passed to the logistic call below,
# so the draws use the default location -- confirm whether loc=loc was intended.
loc = 0.38
X2 = np.random.logistic(size=(9600, 5))
y2 = np.zeros(9600)

# Stack the class-1 rows on top of the class-0 rows (features and labels alike).
X = np.vstack((X1, X2))
y = np.hstack((y1, y2))

# Wrap everything in a DataFrame: five feature columns plus the label column.
data = pd.DataFrame(X, columns=[f"col{k}" for k in range(1, 6)])
data['class'] = y

# Mix the two classes together, then renumber the rows 0..N-1.
data = shuffle(data).reset_index(drop=True)

# Store the label as an integer (1 / 0) instead of a float.
data["class"] = np.where(data["class"] == 1.0, 1, 0)

# Persist the dataset so later experiments can reload this exact copy.
data.to_csv('vd_nondrifted.csv', index=False)


In [9]:
# Reload the saved copy so the rest of the notebook works on the persisted data.
data = pd.read_csv('vd_nondrifted.csv')

In [10]:
# Display the loaded DataFrame as the cell output.
data

Unnamed: 0,col1,col2,col3,col4,col5,class
0,7.000000,6.000000,8.000000,6.000000,5.000000,1
1,-2.729465,0.395907,-1.244328,1.058099,0.717900,0
2,-0.144599,-0.698764,5.386546,0.453879,0.461973,0
3,8.000000,8.000000,4.000000,4.000000,6.000000,1
4,3.000000,4.000000,5.000000,9.000000,5.000000,1
...,...,...,...,...,...,...
19195,6.000000,2.000000,3.000000,7.000000,7.000000,1
19196,7.000000,9.000000,6.000000,6.000000,7.000000,1
19197,4.000000,5.000000,7.000000,5.000000,6.000000,1
19198,0.666549,-0.170226,3.552791,-2.485941,3.502892,0


In [11]:
# Count the rows per label to check the class balance.
data["class"].value_counts()

0    9600
1    9600
Name: class, dtype: int64

## 3. Drifted Dataset

In [None]:
# Build the drifted dataset: the last 640 non-drifted rows followed by 320
# freshly generated rows in which the class-1 features drift gradually.

# Binomial parameters for class 1. Declared here (same values as in the
# generation cell) so this cell also runs when that cell was skipped and
# `data` was loaded from the saved CSV instead.
n = 10
p = 0.5

# Reference window: the last 640 rows (20 batches of 32) of the non-drifted
# data, renumbered from 0.
non_drifted_data = data.tail(640).reset_index(drop=True)

# 320 candidate rows for class 1 (binomial features, label 1).
Xn1 = np.random.binomial(n, p, size=(320, 5))
yn1 = np.ones((320,))

# 320 candidate rows for class 0 (logistic features, label 0).
Xn2 = np.random.logistic(size=(320, 5))
yn2 = np.zeros((320,))

# Rows 0-319 are class 1, rows 320-639 are class 0.
X = np.concatenate((Xn1, Xn2))
y = np.concatenate((yn1, yn2))

# Gradual drift for class 1: every block of 32 rows gets one more feature
# column redrawn with the drifted success probability, so block k
# (k = 0..4) has its first k + 1 columns drifted.
p_drift1 = 0.8  # 0.5 * 1.6, i.e. 60% above the non-drifted p
for step in range(5):
    rows = slice(32 * step, 32 * (step + 1))
    X[rows, :step + 1] = np.random.binomial(n, p_drift1, size=(32, step + 1))

# Create a pandas dataframe with the data and labels.
data_n = pd.DataFrame(data=X, columns=['col1', 'col2', 'col3', 'col4', 'col5'])
data_n['class'] = y

# Only the first 160 rows of each class are used: the drifted class-1 rows
# and an equal number of class-0 rows. reset_index returns fresh frames, so
# nothing is modified in place on a slice of data_n.
data_n_1 = data_n.iloc[0:160].reset_index(drop=True)
data_n_2 = data_n.iloc[320:480].reset_index(drop=True)

# Interleave 16 class-1 rows with 16 class-0 rows so that every batch of 32
# is class-balanced. Pieces are collected first and concatenated once.
pieces = []
for i in range(0, 160, 16):
    pieces.append(data_n_1.iloc[i:i + 16])
    pieces.append(data_n_2.iloc[i:i + 16])
drifted_data = pd.concat(pieces, ignore_index=True)

# Store the label as an integer (1 / 0) instead of a float.
drifted_data["class"] = drifted_data["class"].astype(int)

# Non-drifted reference rows first, drifted rows after; renumber 0..N-1.
drifted_data = pd.concat([non_drifted_data, drifted_data], ignore_index=True)

drifted_data.to_csv('vd_drifted.csv', index=False)
