Data reading and initial processing

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Feature table; its `categoryId` column is what the following cells use as the label.
df = pd.read_csv("./data/stage_3_feature_importance.csv")
# Only the `categoryId` column of the ground-truth file is loaded.
# NOTE(review): rows are presumably aligned one-to-one with `df`, because a later
# cell selects rows of `y_gt` using row indices derived from `df` — confirm.
y_gt = pd.read_csv("./data/stage_3_ground_truth.csv", usecols=['categoryId'])

Data preparation

In [None]:
# Rows whose categoryId is 0 are treated as unlabelled; all other rows are labelled.
df_lbl = df[df.categoryId != 0]
df_unlbl = df[df.categoryId == 0]

# NOTE(review): presumably the validation set should be 30% of the whole data set
# while the unlabelled rows make up 80% of it, hence the 0.3 / 0.8 fraction that is
# taken out of the unlabelled part below — confirm.
VALIDATE_SIZE = 0.3
UNLBL_SIZE = 0.8

# Only the held-out part of the unlabelled rows is kept; the seed fixes the split.
_, df_test = train_test_split(df_unlbl, test_size=VALIDATE_SIZE / UNLBL_SIZE, random_state=42)

X_test = df_test.drop('categoryId', axis=1)
# X_test.index holds row *labels* inherited from df, so the ground truth is looked
# up by label (.loc). Positional lookup (.iloc) only gives the same rows while the
# frames carry a default RangeIndex.
y_test = y_gt.loc[X_test.index]

Measuring class sizes

In [None]:
from matplotlib import pyplot as plt
from collections import Counter


def plot_label_distribution(data: pd.Series, name: str = ''):
    """Draw a pie chart showing how often each label occurs in ``data``.

    Args:
        data: One label per sample (for example a ``categoryId`` column).
        name: Title displayed above the chart.

    Slices are drawn in sorted label order instead of first-appearance order,
    which keeps the layout stable between charts and makes them easier to
    compare. The figure is shown immediately; nothing is returned.
    """
    # dropna=False keeps missing labels visible as a single slice of their own.
    counts = pd.Series(data).value_counts(dropna=False).sort_index()

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_title(name)
    ax.pie(counts.to_numpy(), labels=list(counts.index), autopct='%1.1f%%')
    plt.show()


# Compare the label mix of the full dataset, the ground truth, and the
# labelled-only subset (rows whose categoryId is not 0).
for labels, title in (
    (df['categoryId'], 'Dataset label distribution'),
    (y_gt['categoryId'], 'Ground-truth label distribution'),
    (df_lbl['categoryId'], 'Dataset w/o 0 label distribution'),
):
    plot_label_distribution(labels, title)

Balancing dataset

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Separate the labelled rows into a feature matrix and the target column.
X_lbl = df_lbl.drop(columns='categoryId')
y_lbl = df_lbl['categoryId']

# Random over-sampling of the labelled rows; the seed keeps the draw repeatable.
oversampler = RandomOverSampler(random_state=42)
X_lbl_res, y_lbl_res = oversampler.fit_resample(X_lbl, y_lbl)
# Alternative sampler kept for experimentation:
# X_lbl_res, y_lbl_res = SMOTE(random_state=42).fit_resample(X_lbl, y_lbl)

plot_label_distribution(y_lbl_res, name='Resampled data label distribution')
print("Resampled data shape:", X_lbl_res.shape)

In [None]:
# Unlabelled rows: features are kept as they are, the 0 label is rewritten to -1.
# NOTE(review): presumably because -1 is the marker scikit-learn's semi-supervised
# estimators expect for unlabelled samples — confirm.
X_unlbl = df_unlbl.drop(columns='categoryId')
y_unlbl = df_unlbl['categoryId'].replace(to_replace=0, value=-1)

# Stack the resampled labelled rows on top of the unlabelled ones.
X_mixed = pd.concat([X_lbl_res, X_unlbl], axis=0)
y_mixed = pd.concat([y_lbl_res, y_unlbl], axis=0)

print("Final data size:", X_mixed.shape, y_mixed.shape)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Seed and parallelism live on the estimator itself, so every fit of `clf`
# (through the grid search or directly) is reproducible and uses all cores.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Hyper-parameter grid explored by the search.
# 'auto' is intentionally absent from max_features: for a classifier it was an
# alias of 'sqrt' (so it only duplicated work) and scikit-learn >= 1.3 rejects it.
parameters = {'max_depth': [20, 40, 60, None],
              'max_features': ['sqrt', 'log2'],
              'n_estimators': [200, 400, 600, 800]}

gs = GridSearchCV(clf, parameters, verbose=3)
# The search itself is disabled; uncomment to run it on the resampled labelled data.
# gs.fit(X_lbl_res, y_lbl_res)
# print(gs.best_params_)

In [None]:
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score
from sklearn.semi_supervised import LabelPropagation

# Semi-supervised experiment, currently disabled:
# model = LabelPropagation(kernel='knn', n_jobs=-1)
# model.fit(X_mixed, y_mixed)
# y_pred = model.predict(X_test)

# # clf.fit(X_mixed, model.transduction_)

# Hyper-parameter overrides, currently disabled
# (note: max_features='auto' is only accepted by older scikit-learn releases):
# clf.set_params(**{'max_depth': None, 'max_features': 'auto', 'n_estimators': 600, 'n_jobs': -1, 'random_state': 42})
# # clf.set_params(**gs.best_params_)

# Train on the resampled labelled rows, then predict the held-out rows.
# fit() returns the estimator itself, which is why the two calls can be chained.
y_pred = clf.fit(X_lbl_res, y_lbl_res).predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score:", accuracy)

report = classification_report_imbalanced(y_test, y_pred)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

try:
    # Legacy helper: deprecated in scikit-learn 1.0 and removed in 1.2.
    # Importing it unconditionally makes this cell fail on current releases,
    # so the name is only bound where the installed version still provides it.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    pass

# Confusion matrix of the predictions against the ground-truth labels.
fig, ax = plt.subplots(figsize=(10, 10))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
ax.set_title('Confusion matrix (test set)')
plt.show()
