In [56]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,auc
from sklearn.model_selection import train_test_split

In [57]:
# Load the training data. The CSV has no header row, so pandas assigns
# integer column names and a default 0..n-1 row index.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
path = "C:/Users/yusse/Documents/dataproject/trainData/32_train.csv"
df = pd.read_csv(path, header=None)

# Labels are positional: rows whose index is below N_CLASS_0_ROWS get label 0,
# every later row gets label 1. A vectorised comparison on the index replaces
# the previous per-row iterrows() loop and produces the same 0/1 values.
N_CLASS_0_ROWS = 2500
df = df.assign(label=(df.index >= N_CLASS_0_ROWS).astype("int64"))


In [58]:
# Separate labels and attributes.
# 'label' was appended as the last column, so every column before it is a feature.
X = df.iloc[:, :-1].values
y = df['label']

# Hold out 10% of the rows for validation.
# - stratify=y keeps the class proportions the same in both splits.
# - random_state fixes the shuffle so the split (and everything tuned on it)
#   is reproducible across kernel restarts.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [59]:
# Define the generic classifier that GridSearchCV uses as its template.
# Only the class weights are fixed here (class 1 is weighted more heavily than
# class 0); max_depth and max_samples are tuned afterwards.
# random_state is fixed so repeated runs of the search are comparable.
rf = RandomForestClassifier(
    oob_score=True,
    class_weight={0:1.43,1:3.33},
    random_state=42,
)

In [None]:
# Search max_depth x max_samples with GridSearchCV, ranking candidates by
# balanced accuracy (the `scoring` argument below) under 25-fold cross-validation.
param_grid = {
    "max_depth": list(range(5, 30, 5)),
    "max_samples": list(range(500, 4500, 500)),
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=25,       # 25-fold cross-validation
    n_jobs=-1,   # use every available core
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated scores:", grid_search.best_score_)

In [None]:
# Now use the out-of-bag (OOB) score as the measure and sweep max_samples.
# Because one parameter is varied at a time, max_depth has to be pinned: it is
# set to 15, which (per the original author's note) was the value the grid
# search selected most often, although the best depth varied between runs.
#
# scikit-learn rejects an integer max_samples larger than the number of
# training rows, so candidates above len(X_train) are dropped up front.
max_samples = [size for size in range(500, 5000, 500) if size <= len(X_train)]

oob_scores = []

# NOTE: `rf` is rebound on every iteration; after the loop it refers to the
# last forest fitted here, not to the grid-search template.
for sample in max_samples:
    rf = RandomForestClassifier(
        max_depth=15,
        oob_score=True,
        class_weight={0:1.43,1:3.33},
        max_samples=sample,
        random_state=42,  # same seed for every candidate -> differences come from max_samples
    )
    rf.fit(X_train, y_train)
    oob_scores.append(rf.oob_score_)

# Plot OOB score against the bootstrap sample size.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(max_samples, oob_scores, marker='o', linestyle='-', color='b', label='OOB Score')
ax.set_xlabel('Max Sample Size', fontsize=12)
ax.set_ylabel('OOB Score', fontsize=12)
ax.set_title('OOB Score vs Max Sample Size', fontsize=14)
ax.grid(True)
ax.legend(fontsize=10)
plt.show()

In [None]:
# Sweep max_depth over the same depths the grid search tested, using the
# out-of-bag (OOB) score as the measure. max_samples is pinned to 3500 here.
max_depths = grid_search.param_grid['max_depth']  # This is the range of depths tested in the grid

oob_scores = []

# NOTE: `rf` is rebound on every iteration; after the loop it refers to the
# forest fitted with the last depth in max_depths.
for depth in max_depths:
    rf = RandomForestClassifier(
        max_depth=depth,
        oob_score=True,
        class_weight={0:1.43,1:3.33},
        max_samples=3500,
        random_state=42,  # same seed for every candidate -> differences come from max_depth
    )
    rf.fit(X_train, y_train)
    oob_scores.append(rf.oob_score_)

# Plot OOB score against tree depth.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(max_depths, oob_scores, marker='o', linestyle='-', color='b', label='OOB Score')
ax.set_xlabel('Max Depth', fontsize=12)
ax.set_ylabel('OOB Score', fontsize=12)
ax.set_title('OOB Score vs Max Depth', fontsize=14)
ax.grid(True)
ax.legend(fontsize=10)
plt.show()


In [None]:
# Build the final random forest with the tuned hyperparameters and fit it on
# the training split. random_state is fixed so the reported numbers and the
# exported predictions are reproducible.
rf_final = RandomForestClassifier(
    max_depth=15,
    oob_score=True,
    class_weight={0:1.43,1:3.33},
    max_samples=4000,
    random_state=42,
)
rf_final.fit(X_train, y_train)

# Out-of-bag score of the fitted forest.
oob_final = rf_final.oob_score_
print(oob_final)

# Error on the held-out validation split. This was previously reported as
# "Training error", but it is computed from X_validation / y_validation.
y_validation_pred = rf_final.predict(X_validation)
accuracy = accuracy_score(y_validation, y_validation_pred)
validation_error = 1 - accuracy

print(f"Validation error: {validation_error}")


In [None]:
# ROC curve of the final model on the validation split.
# Use rf_final here: `rf` is whatever forest the last hyperparameter sweep left
# behind, not the tuned model that is evaluated and used for the predictions.
y_validation_prb = rf_final.predict_proba(X_validation)[:,1]  # probability of class 1
fpr, tpr, thresholds = roc_curve(y_validation, y_validation_prb, pos_label=1)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='blue', label=f"ROC curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], color='gray')  # chance-level diagonal for reference
ax.set_xlabel('Fpr')
ax.set_ylabel('Tpr')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()


In [69]:
# Predict labels for the unlabeled test file and export them, one prediction
# per row, with no index column and no header row.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
path2  = "C:/Users/yusse/Documents/dataproject/trainData/32_test.csv"
test_set = pd.read_csv(path2, header=None)

# The model was fitted on a plain ndarray (df.iloc[...].values), so pass the
# test features in the same form.
predictions = rf_final.predict(test_set.values)

out_df = pd.DataFrame(predictions)
# header=False is the documented way to suppress the header row.
out_df.to_csv('Group_32_Final_Predictions.csv', index=False, header=False)