# Eksploracja Danych - Projekt
Tomasz Kiljańczyk (136257)

Wojciech Lulek (136280)

# Data reading and initial processing

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Main dataset: features plus a `categoryId` column, which is read as a string
# so that category codes are compared as text (e.g. '0') further down.
df = pd.read_csv("./data/stage_3_feature_importance.csv", dtype={'categoryId': str})
# Ground truth: only the `categoryId` column is loaded, also as a string.
y_gt = pd.read_csv("./data/stage_3_ground_truth.csv", usecols=['categoryId'], dtype={'categoryId': str})

# Data preparation

In [None]:
# Show the distinct category IDs that occur in the ground truth.
y_gt['categoryId'].unique()

In [None]:
# Per-column non-null count of the ground-truth rows whose category is '0'.
y_gt[y_gt['categoryId'] == '0'].count()

## Removal of ground-truth entries with missing categories

In [None]:
# Boolean mask: True where the ground truth holds a category other than '0'.
non_missing_gt_categories = y_gt['categoryId'].ne('0')

# The same mask filters both tables; rows are then renumbered from 0 so the
# two tables keep matching row numbers.
y_gt = y_gt[non_missing_gt_categories].reset_index(drop=True)
df = df[non_missing_gt_categories].reset_index(drop=True)

## Test set preparation

In [None]:
# Hold-out sizing constants.
# NOTE(review): presumably UNLBL_SIZE is the fraction of rows that are
# unlabeled, so the ratio used below holds out VALIDATE_SIZE of the whole
# dataset - confirm against the data.
VALIDATE_SIZE = 0.3
UNLBL_SIZE = 0.8

# Rows whose category is '0' are treated as unlabeled; all others as labeled.
df_unlbl = df[df['categoryId'].eq('0')]
df_lbl = df[df['categoryId'].ne('0')]

# Seeded split of the unlabeled rows; only the second (test) part is kept here.
_, df_test = train_test_split(
    df_unlbl,
    test_size=VALIDATE_SIZE / UNLBL_SIZE,
    random_state=42,
)

X_test = df_test.drop(columns='categoryId')
# The index labels of the test rows are used as positions into `y_gt`; this
# relies on `df` and `y_gt` sharing the same 0..n-1 row numbering.
y_test = y_gt.iloc[X_test.index]

Measuring class sizes

In [None]:
from matplotlib import pyplot as plt
from collections import Counter


def plot_label_distribution(data: pd.Series, name: str = '') -> None:
    """Draw a pie chart of how often each label occurs in *data*.

    Args:
        data: Labels to count; each element contributes one occurrence.
            Callers pass a ``categoryId`` Series (iterating a DataFrame
            would count its column names instead of its values).
        name: Title displayed above the chart.
    """
    counts = Counter(data)

    plt.figure(figsize=(8, 8))
    plt.title(name)
    # Keys and values of the same Counter iterate in the same order, so every
    # wedge gets the matching label.
    plt.pie(list(counts.values()), labels=list(counts.keys()), autopct='%1.1f%%')
    plt.show()

In [None]:
# Label distribution of the main dataset's `categoryId` column.
plot_label_distribution(df['categoryId'], 'Dataset label distribution')

In [None]:
# Label distribution of the ground-truth categories.
plot_label_distribution(y_gt['categoryId'], 'Ground-truth label distribution')

In [None]:
# Label distribution of the labeled subset only (rows whose category is not '0').
plot_label_distribution(df_lbl['categoryId'], 'Dataset w/o 0 label distribution')

Balancing the dataset

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Separate the labeled rows into a feature table and the target column.
X_lbl = df_lbl.drop(columns='categoryId')
y_lbl = df_lbl['categoryId']

# Seeded random over-sampling of the labeled data.
# Alternative kept for reference: SMOTE(random_state=42).fit_resample(X_lbl, y_lbl)
X_lbl_res, y_lbl_res = RandomOverSampler(random_state=42).fit_resample(X_lbl, y_lbl)

# Show the class balance after resampling, then the size of the resampled table.
plot_label_distribution(y_lbl_res, name='Resampled data label distribution')
print("Resampled data shape:", X_lbl_res.shape)

In [None]:
# `categoryId` is loaded with dtype=str, so the unlabeled marker is the string
# '0'. Replacing the integer 0 matched nothing and left those rows marked '0';
# match the string instead. The new marker is kept as the string '-1' so the
# label column stays homogeneous with the (string) class labels.
# NOTE(review): scikit-learn's semi-supervised estimators expect the integer
# -1 for unlabeled samples - encode labels as integers before passing
# `y_mixed` to them.
y_unlbl = df_unlbl['categoryId'].replace('0', '-1')
X_unlbl = df_unlbl.drop('categoryId', axis=1)

# Balanced labeled rows first, followed by the unlabeled rows.
y_mixed = pd.concat([y_lbl_res, y_unlbl])
X_mixed = pd.concat([X_lbl_res, X_unlbl])

print("Final data size:", X_mixed.shape, y_mixed.shape)

# Classifiers

In [None]:
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
#
# clf = RandomForestClassifier()
# parameters = {'n_jobs': [-1],
#               'random_state': [42],
#               'max_depth': [20, 40, 60, None],
#               'max_features': ['auto', 'sqrt'],
#               'n_estimators': [200, 400, 600, 800]}
#
# gs = GridSearchCV(clf, parameters, verbose=3)
# gs.fit(X_lbl_res, y_lbl_res)
# print(gs.best_params_)

In [None]:
# Seed the forest with the same value (42) as the train/test split and the
# oversampler; an unseeded forest gives different scores on every run.
clf = RandomForestClassifier(random_state=42)
# Train on the balanced labeled data, then predict the held-out test rows.
clf.fit(X_lbl_res, y_lbl_res)
y_pred = clf.predict(X_test)

# Compare the predictions with the ground-truth categories of the test rows.
print("Accuracy score:", round(accuracy_score(y_test, y_pred), 4))
print(classification_report_imbalanced(y_test, y_pred))

## LabelSpreading

In [None]:
# from sklearn.semi_supervised import LabelSpreading
#
# model = LabelSpreading(kernel='knn', n_jobs=-1)
#
# model.fit(X_lbl_res, y_lbl_res)
# y_pred = model.predict(X_test)
#
# print("Accuracy score:", round(accuracy_score(y_test, y_pred), 4))
# print(classification_report_imbalanced(y_test, y_pred))

## SVC

In [None]:
# from sklearn.svm import SVC
#
# clf = SVC()
# clf.fit(X_lbl_res, y_lbl_res)
# y_pred = clf.predict(X_test)
# print(y_pred)
#
# print("Accuracy score:", round(accuracy_score(y_test, y_pred), 4))
# print(classification_report_imbalanced(y_test, y_pred))

## MLPClassifier

In [None]:
# from sklearn.neural_network import MLPClassifier
#
# clf = MLPClassifier()
# clf.fit(X_lbl_res, y_lbl_res)
# y_pred = clf.predict(X_test)
# print(y_pred)
#
# print("Accuracy score:", round(accuracy_score(y_test, y_pred), 4))
# print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# from sklearn.semi_supervised import LabelPropagation
#
# model = LabelPropagation(kernel='knn', n_jobs=-1)
#
# model.fit(X_lbl_res, y_lbl_res)
# y_pred = model.predict(X_test)
#
# print("Accuracy score:", accuracy_score(y_test, y_pred))
# print(classification_report_imbalanced(y_test, y_pred))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the most recent `y_pred` against the test ground truth,
# drawn on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred, ax=ax)
plt.show()
