In [1]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from sklearn.preprocessing import LabelEncoder

# Read the generated dataset into `df`; the cells below rely on its
# 'downloads_category' column.
csv_path = 'data/generated_data/hf_11_24_generated.csv'
df = pd.read_csv(csv_path)

In [6]:
# Number of rows in each download category (displayed as the cell output)
df['downloads_category'].value_counts()

downloads_category
Very Low    867712
Mid         308881
High        286800
Low         271634
Name: count, dtype: int64

In [None]:
# --- Target -----------------------------------------------------------------
# Turn the category labels into integer codes; `le` is kept so the codes can
# be mapped back to the labels later.
le = LabelEncoder()
y = le.fit_transform(df['downloads_category'])

# --- Features ---------------------------------------------------------------
# Expand the selected columns into dummy indicator columns, dropping the first
# level of each encoded column.
features = ['task_group', 'author_category', 'language_category', 'location']
X = pd.get_dummies(df[features], drop_first=True)


def _bool_to_int(col):
    """Return `col` cast to int when its dtype is bool, otherwise unchanged."""
    return col.astype(int) if col.dtype == 'bool' else col


# Make sure boolean indicator columns are stored as integers.
X = X.apply(_bool_to_int)

# --- Class balance before resampling ----------------------------------------
original_counts = Counter(y)
print("Original class distribution:", original_counts)

# --- Oversampling with SMOTE ------------------------------------------------
# NOTE(review): X is built from get_dummies output. Plain SMOTE creates
# synthetic rows by interpolating between neighbours, so the generated rows may
# hold non-binary values in the indicator columns — confirm this is acceptable
# (imbalanced-learn also provides SMOTEN / SMOTENC for categorical data).
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X, y)
X_res, y_res = resampled

# --- Class balance after resampling -----------------------------------------
resampled_counts = Counter(y_res)
print("Resampled class distribution:", resampled_counts)

# took 60m 49.1s to run

Original class distribution: Counter({3: 867712, 2: 308881, 0: 286800, 1: 271634})




Resampled class distribution: Counter({3: 867712, 1: 867712, 0: 867712, 2: 867712})


In [None]:
# Class balance before resampling
print("Original class distribution:", Counter(y))

# Oversample with BorderlineSMOTE, using a fixed seed
borderline_smote = BorderlineSMOTE(random_state=42)
resampled_borderline = borderline_smote.fit_resample(X, y)
X_res_borderline, y_res_borderline = resampled_borderline

# Class balance after resampling
print("Resampled class distribution with BorderlineSMOTE:", Counter(y_res_borderline))

Original class distribution: Counter({3: 867712, 2: 308881, 0: 286800, 1: 271634})


