In [2]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.11.0 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [11]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the ARFF file using scipy.io.arff
from scipy.io import arff
data, meta = arff.loadarff('polish_data/5year.arff')
df = pd.DataFrame(data)
df = df.dropna()

# Separating features (X) and target variable (y)
X = df.iloc[:, :-1]
y = df.iloc[:, -1].astype(int)

In [12]:
# Random Undersampling
# I performed Random UnderSampling to generate the balanced data 
# and train Decision Tree, and SVM Models.

rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus, test_size=0.2, random_state=42)

# Train Decision Tree and SVM models on undersampled data
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred)

svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred)
print(f'SVM model prediction using Undersampling', y_pred)
print(f'SVM model accuracy score', accuracy_svm)

SVM model prediction using Undersampling [0 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 1 0 0 0 1 1 1 0 1 0 1 0 0
 1 0 0 0]
SVM model accuracy score 0.7073170731707317


In [13]:
# Random Oversampling
# I performed Random OverSampling to generate the balanced data 
# and train Decision Tree, and SVM Models.
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_ros, y_ros, test_size=0.2, random_state=42)

# Train Decision Tree and SVM models on oversampled data
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred)

svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred)
print(f'SVM model prediction using Oversampling', y_pred)
print(f'SVM model accuracy score', accuracy_svm)

SVM model prediction using Oversampling [1 1 1 ... 0 1 1]
SVM model accuracy score 0.7073378839590444


In [14]:
# SMOTE
# I performed SMOTE to generate the balanced data and train Decision
# Tree, and SVM Models.
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train Decision Tree and SVM models on SMOTE data
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred)

svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred)
print(f'SVM model prediction using SMOTE', y_pred)
print(f'SVM model accuracy score', accuracy_svm)

SVM model prediction using SMOTE [1 0 1 ... 0 1 1]
SVM model accuracy score 0.7141638225255973


In [15]:
# Borderline-SMOTE
bsmote = BorderlineSMOTE()
X_bsmote, y_bsmote = bsmote.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_bsmote, y_bsmote, test_size=0.2, random_state=42)

# Train Decision Tree and SVM models on Borderline-SMOTE data
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred)

svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred)
print(f'SVM model prediction using Borderline-SMOTE', y_pred)
print(f'SVM model accuracy score', accuracy_svm)

SVM model prediction using Borderline-SMOTE [1 1 0 ... 0 1 0]
SVM model accuracy score 0.8122866894197952


In [16]:
# ADASYN
adasyn = ADASYN()
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_adasyn, y_adasyn, test_size=0.2, random_state=42)

# Train Decision Tree and SVM models on ADASYN data
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred)

svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred)
print(f'SVM model prediction using ADASYN', y_pred)
print(f'SVM model accuracy score', accuracy_svm)

SVM model prediction using ADASYN [0 1 1 ... 1 1 0]
SVM model accuracy score 0.68808911739503
