In [150]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

df = pd.read_csv('npf_train.csv')

npf = df.copy()
class2 = np.array(["noevent","event"])
npf["class2"] = class2[(npf["class4"]!="nonevent").astype(int)]
class_type = np.array([0,1])
npf["class_type"] = class_type[(npf["class4"]!="nonevent").astype(int)]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFECV, RFE
import sklearn.preprocessing as preprocessing


X = npf.drop(["id","date","class4","partlybad","class2","class_type"],axis=1)
y = npf["class_type"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=101)

# Binary classification

In [146]:
# Preprocessing
estimator_RF = RandomForestClassifier(n_estimators=1500)
selector_RF = RFECV(estimator=estimator_RF, step=1, cv=5, scoring="accuracy")
#selector = RFE(estimator=SVC(kernel="linear"), step=1, n_features_to_select=5)
selector_RF = selector_RF.fit(X_train, y_train)
sup_RF = selector_RF.support_
ranking_RF = selector_RF.ranking_

X_cv_selected_RF = X_train[X_train.columns[sup_RF]]

X_trainScale = preprocessing.scale(X_train)
X_testScale = preprocessing.scale(X_test)

X_train_scale_selfeatures = preprocessing.scale(X_cv_selected_RF)
X_test_scale_selfeatures = preprocessing.scale(X_test[X_test.columns[sup_RF]])

In [151]:
m_RF = RandomForestClassifier(n_estimators=1500)
m_RF.fit(X_train_scale_selfeatures, y_train)
print(f"Binary accuracy: {m_RF.score(X_test_scale_selfeatures, y_test)}")
acc = lambda p: (y_test * np.round(p) + (1 - y_test) * (1 - np.round(p))).mean()
perp = lambda p: np.exp(-np.mean(np.log(y_test*p + (1 - y_test) * (1 - p))))

phat = m_RF.predict_proba(X_test_scale_selfeatures)[:,1]

print(f"RF perp: {perp(phat)}")

Binary accuracy: 0.8928571428571429
RF perp: 1.2841501653269833


# Multi-label classification

In [152]:
from sklearn.preprocessing import StandardScaler

npf["multi_class"] = np.where(npf["class4"]=="nonevent",1,0)
npf["multi_class"] = np.where(npf["class4"]=="Ia",2,npf["multi_class"])
npf["multi_class"] = np.where(npf["class4"]=="Ib",3,npf["multi_class"])
npf["multi_class"] = np.where(npf["class4"]=="II",4,npf["multi_class"])

data_multilabel = npf[npf["class4"]!="nonevent"]
#data_multilabel = npf

Xm = data_multilabel.drop(["id","date","class4","partlybad","class_type","multi_class", "class2"],axis=1)
ym = data_multilabel["multi_class"] 

In [153]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler #UnderSampler

_, Xm_test, _, ym_test = train_test_split(Xm,ym,test_size=0.3,random_state=101)

model_smote=SMOTE()
x_smote_resampled,y_smote_resampled=model_smote.fit_resample(Xm,ym)  #Oversampling
y_smote_resampled.value_counts()

model_RandomUnderSampler=RandomUnderSampler()   
x_RandomUnderSampler_resampled,y_RandomUnderSampler_resampled=model_RandomUnderSampler.fit_resample(X,y)  

Xm_train, _, ym_train, _ = train_test_split(x_smote_resampled,y_smote_resampled,test_size=0.5,random_state=101)
ym_train.value_counts()

3    62
2    59
4    48
Name: multi_class, dtype: int64

In [154]:
multi_RF = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=1500))
multi_RF.fit(Xm_train,ym_train)
predictions_RF = multi_RF.predict(Xm_test)

print(classification_report(ym_test,predictions_RF))
multi_score = multi_RF.score(Xm_test,ym_test)
print("score", multi_score)

perpm = lambda p: np.exp(-np.mean(np.log(ym_test*p + (1 - ym_test) * (1 - p))))

phat_m = multi_RF.predict_proba(Xm_test)[:,1]
print(f"RF perp: {perpm(phat_m)}")

              precision    recall  f1-score   support

           2       0.45      0.71      0.56         7
           3       0.59      0.83      0.69        23
           4       0.85      0.57      0.69        40

    accuracy                           0.67        70
   macro avg       0.63      0.71      0.64        70
weighted avg       0.73      0.67      0.67        70

score 0.6714285714285714
RF perp: 1.5095664459047136


  result = getattr(ufunc, method)(*inputs, **kwargs)


# Making and collecting predictions on the actual test data

In [175]:
npf_pred = pd.read_csv('npf_test_hidden.csv')
X_pred = npf_pred.drop(["id","date","class4","partlybad"],axis=1)
X_pred_binary = X_pred[X_pred.columns[sup_RF]]
ans = pd.DataFrame(columns=["class4", "p"])

X_pred_scale_selfeatures = preprocessing.scale(X_pred_binary)

# TODO: maybe can indeed train multi-label classifier on all instances, 
# get probability for each class separately!

binary_predictions = m_RF.predict(X_pred_scale_selfeatures)
ans["p"] = m_RF.predict_proba(X_pred_scale_selfeatures)[:,1]
label = binary_predictions == 1
ans["class4"] = np.where(label, "event", "nonevent")
ans["class4"] = np.where(ans["class4"]=="nonevent", "nonevent", multi_RF.predict(X_pred))
ans["class4"] = np.where(ans["class4"]=="2", "Ia", ans["class4"])
ans["class4"] = np.where(ans["class4"]=="3", "Ib", ans["class4"])
ans["class4"] = np.where(ans["class4"]=="4", "II", ans["class4"])

with open("answers.csv", "w") as ansfile:
    ansfile.write(f"{m_RF.score(X_test_scale_selfeatures, y_test)}\n")
    ans.to_csv(ansfile, index=False)


[4]
[0.62733333]
class4          II
p         0.627333
Name: 10, dtype: object


