In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import seaborn as sn

In [72]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [73]:
# X = np.random.rand(100, 5)
# y = np.zeros(100)
# y[:10] = 1

In [74]:
from sklearn.datasets import load_breast_cancer
import numpy as np

breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# Generate an imbalanced version of the dataset with a 90/10 split
np.random.seed(42)
mask = np.random.rand(len(y)) < 0.9 # 90% of samples in one class
y[mask] = 0 # Assign the first class to the majority of samples
y[~mask] = 1 # Assign the second class to the minority of samples

print("Original class distribution:", np.bincount(breast_cancer.target))
print("Imbalanced class distribution:", np.bincount(y))

Original class distribution: [504  65]
Imbalanced class distribution: [504  65]


In [75]:
X_train, X_test, Y_train, Y_test = train_test_split( X, y, test_size=0.30, random_state=30)
lm = LinearRegression()
lm.fit(X_train, Y_train)
# model evaluation for training set
y_train_predict = lm.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))
r2 = r2_score(Y_train, y_train_predict)

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

Y_pred = lm.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, Y_pred)))
r2 = r2_score(Y_test, Y_pred)

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))

# model evaluation for testing set
print(Y_pred.shape)
Y_pred

pd.DataFrame(Y_pred,Y_test)

The model performance for training set
--------------------------------------
RMSE is 0.32118330558650104
R2 score is 0.04445627649852801


The model performance for testing set
--------------------------------------
RMSE is 0.3045933475029041
R2 score is -0.09390943379733496
(171,)


Unnamed: 0,0
0,-0.006630
0,-0.104398
0,0.189952
0,0.139792
0,0.207920
...,...
0,-0.002778
0,0.205309
1,0.133033
0,0.058521


In [76]:
unique, counts=np.unique(y, return_counts=True)
print(unique, counts)

[0 1] [504  65]


In [77]:
# Get the indices of the majority and minority classes
majority_class = np.where(y == 0)[0]
minority_class = np.where(y == 1)[0]

# Randomly select a subset of samples from the majority class
random_indices = np.random.choice(majority_class, size=len(minority_class), replace=False)

# Combine the minority class samples with the randomly selected majority class samples
undersampled_indices = np.concatenate([minority_class, random_indices])
X_undersampled = X[undersampled_indices]
y_undersampled = y[undersampled_indices]

# Check the class distribution
print(np.unique(y, return_counts=True))


(array([0, 1]), array([504,  65], dtype=int64))


In [78]:
# Randomly select a subset of samples from the minority class
random_indices = np.random.choice(minority_class, size=len(majority_class), replace=True)

# Combine the majority class samples with the randomly duplicated minority class samples
oversampled_indices = np.concatenate([majority_class, random_indices])
X_oversampled = X[oversampled_indices]
y_oversampled = y[oversampled_indices]

# Check the class distribution
print(np.unique(y_oversampled, return_counts=True))


(array([0, 1]), array([504, 504], dtype=int64))


In [79]:
X_train, X_test, Y_train, Y_test = train_test_split( X_oversampled, y_oversampled, test_size=0.30, random_state=30)
lm = LinearRegression()
lm.fit(X_train, Y_train)
# model evaluation for training set
y_train_predict = lm.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))
r2 = r2_score(Y_train, y_train_predict)

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

Y_pred = lm.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, Y_pred)))
r2 = r2_score(Y_test, Y_pred)

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))

# model evaluation for testing set
print(Y_pred.shape)
Y_pred

pd.DataFrame(Y_pred,Y_test)

The model performance for training set
--------------------------------------
RMSE is 0.46795565829606034
R2 score is 0.1239836440092964


The model performance for testing set
--------------------------------------
RMSE is 0.4793744043026675
R2 score is 0.0803098679806089
(303,)


Unnamed: 0,0
1,0.597459
1,0.532285
0,0.457037
0,0.496594
1,0.550705
...,...
0,0.451053
1,0.529600
1,0.708217
0,0.388759


In [80]:
num_class0 = np.sum(y == 0)
num_class1 = np.sum(y == 1)

# Undersample majority class
idx_class0 = np.where(y == 0)[0]
idx_class0_undersampled = np.random.choice(idx_class0, size=num_class1, replace=False)
idx_undersampled = np.concatenate([idx_class0_undersampled, np.where(y == 1)[0]])

# Create new undersampled dataset
X_undersampled = X[idx_undersampled]
y_undersampled = y[idx_undersampled]
print(np.unique(y_undersampled, return_counts=True))

(array([0, 1]), array([65, 65], dtype=int64))


In [81]:
X_train, X_test, Y_train, Y_test = train_test_split( X_undersampled, y_undersampled, test_size=0.30, random_state=30)
lm = LinearRegression()
lm.fit(X_train, Y_train)
# model evaluation for training set
y_train_predict = lm.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))
r2 = r2_score(Y_train, y_train_predict)

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

Y_pred = lm.predict(X_test)
rmse = (np.sqrt(mean_squared_error(Y_test, Y_pred)))
r2 = r2_score(Y_test, Y_pred)

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))

# model evaluation for testing set
print(Y_pred.shape)
Y_pred

pd.DataFrame(Y_pred,Y_test)

The model performance for training set
--------------------------------------
RMSE is 0.40078359281798837
R2 score is 0.34410442288894494


The model performance for testing set
--------------------------------------
RMSE is 0.5859752886795905
R2 score is -0.5451516752440824
(39,)


Unnamed: 0,0
0,0.598394
0,0.487797
0,0.78458
1,0.169097
0,0.073604
1,0.613222
0,0.410765
1,0.628866
1,-0.044136
1,0.521327
