# Lab - Handling Data Imbalance in Classification Models


# Feature selection with p-values

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
#pd.set_option('display.max_columns',None)
#pd.set_option('display.max_columns')
#pd.set_option('display.max_rows',None)
#pd.set_option('display.max_rows')

In [2]:
numerical = pd.read_csv('numerical.csv')
numerical.shape

(95412, 315)

In [3]:
targets = pd.read_csv('target.csv')

In [4]:
categorical= pd.read_csv('categorical.csv')
categorical.shape

(95412, 22)

Look critically at the dtypes of numerical and categorical columns

In [5]:
pd.set_option('display.max_rows',None)
numerical.dtypes

TCODE         int64
AGE         float64
INCOME        int64
WEALTH1       int64
HIT           int64
MALEMILI      int64
MALEVET       int64
VIETVETS      int64
WWIIVETS      int64
LOCALGOV      int64
STATEGOV      int64
FEDGOV        int64
WEALTH2       int64
POP901        int64
POP902        int64
POP903        int64
POP90C1       int64
POP90C2       int64
POP90C3       int64
POP90C4       int64
POP90C5       int64
ETH1          int64
ETH2          int64
ETH3          int64
ETH4          int64
ETH5          int64
ETH6          int64
ETH7          int64
ETH8          int64
ETH9          int64
ETH10         int64
ETH11         int64
ETH12         int64
ETH13         int64
ETH14         int64
ETH15         int64
ETH16         int64
AGE901        int64
AGE902        int64
AGE903        int64
AGE904        int64
AGE905        int64
AGE906        int64
AGE907        int64
CHIL1         int64
CHIL2         int64
CHIL3         int64
AGEC1         int64
AGEC2         int64
AGEC3         int64


In [6]:
categorical.dtypes
pd.set_option('display.max_rows',9)

In [7]:
categorical.shape
#categorical.columns

(95412, 22)

In [8]:
#categorical.head()

In [9]:
#some categorical data is integer. 
#we want to turn them to object so that we can test/train as categorical
#BUT we have to make a logistic regression so we need all
#categorical= categorical.astype('object')
    #categorical.dtypes

Concatenate numerical and categorical back together again for your X dataframe.  Designate the TargetB as y

In [10]:
all_data = pd.concat((numerical,categorical),axis=1)
all_data.shape

(95412, 337)

In [11]:
X = all_data
y = targets['TARGET_B']

# Train Test

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

## Transformation
no need to transform y test y train

In [14]:
# Splitting test/train and numerical/categorical features
train_num = X_train[numerical.columns]
train_cat = X_train[categorical.columns]

test_num = X_test[numerical.columns]
test_cat = X_test[categorical.columns]

transform both train and test

In [15]:
scaler = MinMaxScaler() #Scale NUM
#Fit and transform TRAIN
train_num_scaled = scaler.fit_transform(train_num)
train_num_scaled = pd.DataFrame(train_num_scaled, columns=numerical.columns)
# transform TEST
test_num_scaled = scaler.transform(test_num)
test_num_scaled = pd.DataFrame(test_num_scaled, columns=numerical.columns)

# Encode

In [16]:
#let's make it a function
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import numpy as np

def onehot_or_ordinal(data, categorical_columns, threshold=5):
    """
    Encode categorical features using one-hot encoding if the number of unique values
    is below the threshold, otherwise use ordinal encoding.

    Parameters:
    - data : pandas Dataframe
    - categorical_columns : list of column names.
    - threshold: int, the threshold value to decide between one-hot and ordinal encoding.

    Returns:
    - encoded_data: DataFrame, data with encoded categorical features.
    """
#categorical_columns = [col for col in data.columns if data[col].dtype == 'object']
    encoded_data = data.copy()

    # Loop through categorical columns and encode them
    for col in categorical_columns:
        unique_values = data[col].nunique()
        if unique_values <= threshold:
            # Use one-hot encoding
            onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
            encoded_values = onehot_encoder.fit_transform(data[col].values.reshape(-1, 1))
            # Create new column names
            new_cols = [col + '_' + str(val) for val in range(unique_values)]
        else:
            # Use ordinal encoding
            ordinal_encoder = OrdinalEncoder()
            encoded_values = ordinal_encoder.fit_transform(data[col].values.reshape(-1, 1))
            new_cols = [col]
        # Add encoded values to the DataFrame
        encoded_df = pd.DataFrame(encoded_values, columns=new_cols, index=data.index)
        encoded_data = pd.concat([encoded_data, encoded_df], axis=1)
        # Drop the original categorical column
        encoded_data.drop(col, axis=1, inplace=True)

    return encoded_data

In [17]:
train_cat_encoded = onehot_or_ordinal(train_cat,['STATE', 'HOMEOWNR', 'GENDER', 'RFA_2R', 'RFA_2A', 'GEOCODE2', 'DOMAIN_A'],5)
train_cat_encoded.shape



(76329, 34)

In [18]:
test_cat_encoded = onehot_or_ordinal(test_cat,['STATE', 'HOMEOWNR', 'GENDER', 'RFA_2R', 'RFA_2A', 'GEOCODE2', 'DOMAIN_A'],5)
test_cat_encoded.shape



(19083, 34)

In [19]:
# again re-concatenate train_num and train_cat as X_train (as well as test_num and test_cat) as X_test
X_train = pd.concat([train_num_scaled, train_cat_encoded.reset_index(drop=True)], axis=1)
X_test = pd.concat([test_num_scaled, test_cat_encoded.reset_index(drop=True)], axis=1)

# Logistic regression
because target is binary

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9493790284546455


#  Imbalance
- Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.
- Each time fit the model and see how the accuracy of the model has changed.

In [22]:
y_train

26355    0
3034     1
21143    0
46939    0
        ..
61404    0
17730    0
28030    0
15725    0
Name: TARGET_B, Length: 76329, dtype: int64

In [30]:
#you resample only train set
trainset = pd.concat([X_train.reset_index(), y_train.reset_index()], axis=1)
trainset

Unnamed: 0,index,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,...,GEOCODE2_1,GEOCODE2_2,GEOCODE2_3,DOMAIN_A_0,DOMAIN_A_1,DOMAIN_A_2,DOMAIN_A_3,DOMAIN_A_4,index.1,TARGET_B
0,0,0.000028,0.845361,0.666667,1.000000,0.000000,0.010101,0.484848,0.383838,0.313131,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,26355,0
1,1,0.000000,0.624862,0.833333,0.888889,0.004149,0.000000,0.202020,0.323232,0.131313,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3034,1
2,2,0.000014,0.624862,0.666667,1.000000,0.000000,0.010101,0.242424,0.383838,0.242424,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,21143,0
3,3,0.000000,0.536082,0.000000,1.000000,0.000000,0.000000,0.626263,0.202020,0.464646,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,46939,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76325,76325,0.000042,0.907216,0.000000,0.222222,0.000000,0.000000,0.262626,0.282828,0.393939,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,61404,0
76326,76326,0.000389,0.432990,1.000000,0.888889,0.004149,0.000000,0.181818,0.595960,0.141414,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,17730,0
76327,76327,0.000028,0.670103,0.500000,0.000000,0.000000,0.010101,0.292929,0.313131,0.333333,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,28030,0
76328,76328,0.000000,0.453608,0.666667,1.000000,0.000000,0.000000,0.131313,0.424242,0.080808,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,15725,0


In [31]:
trainset['TARGET_B'].value_counts()/len(trainset['TARGET_B']) #imbalance

TARGET_B
0    0.949207
1    0.050793
Name: count, dtype: float64

In [32]:
from sklearn.utils import resample

category_0 = trainset[trainset['TARGET_B'] == 0] #majority
category_1 = trainset[trainset['TARGET_B'] == 1]

# undersampling

In [33]:
category_0_undersampled = resample(category_0,
                                   replace=False,
                                   n_samples = len(category_1)) # removes majority

In [34]:
trainset_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

In [35]:
trainset_downsampled['TARGET_B'].value_counts()
y_train_down = trainset_downsampled['TARGET_B']
X_train_down = trainset_downsampled.drop('TARGET_B',axis = 1)

## Model fit - undersampling

In [36]:
log_reg = LogisticRegression()
log_reg.fit(X_train_down, y_train_down)

In [39]:
y_pred = log_reg.predict(X_test.reset_index())
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


# oversampling

In [None]:
category_1_oversampled = resample(category_1,
                                  replace=True,
                                  n_samples = len(category_0)) #resample replace minority

In [None]:
trainset_oversampled = pd.concat([category_0, category_1_oversampled], axis=0)
trainset_oversampled['TARGET_B'].value_counts()

## Model fit - oversampling

In [None]:

y_train_over = train_oversampled['TARGET_B']
X_train_over = train_oversampled.drop('TARGET_B',axis = 1)

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train_over, y_train_over)

In [None]:
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("precision: ",precision_score(y_test,pred))