In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the processed train and test CSVs.
train_df = pd.read_csv('../train_test_files/processed_train.csv')
test_raw = pd.read_csv('../train_test_files/processed_test.csv')

# Keep the identifier column aside: it is written to the output file later
# and is not passed to the model.
x = test_raw['response_id']

# Feature frame for prediction, built without mutating the frame read from disk.
test_df = test_raw.drop(columns=['response_id'])

In [3]:
# Base estimator handed to the grid search; the fixed seed makes runs repeatable.
SEED = 42
model = DecisionTreeClassifier(random_state=SEED)

In [4]:
# Hyperparameter search space for the decision tree.
# Candidate count: 1 * 3 * 3 * 3 * 2 * 3 * 1 = 162 combinations.
param_grid = dict(
    criterion=['gini'],               # split-quality measure
    max_depth=[10, 20, 30],           # cap on tree depth, to limit overfitting
    min_samples_split=[2, 10, 20],    # minimum samples needed to split an internal node
    min_samples_leaf=[1, 5, 10],      # minimum samples required at a leaf
    max_features=['sqrt', 'log2'],    # number of features considered per split
    max_leaf_nodes=[None, 20, 50],    # upper bound on leaf count (None = unbounded)
    class_weight=['balanced'],        # reweight classes to counter imbalance
)


In [5]:
# Exhaustive search over param_grid, ranking candidates by F1 under 5-fold CV.
search_options = dict(
    scoring='f1',   # metric used to compare candidates
    cv=5,           # number of cross-validation folds
    verbose=2,      # log one line per individual fit
    n_jobs=-1,      # use all available cores
)
grid_search = GridSearchCV(model, param_grid, **search_options)


In [6]:
# Split the training frame into features and target, then run the search.
X_train = train_df.drop(columns=['exit_status'])
y_train = train_df['exit_status']
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END class_weight=balanced, criterion=gini, max_depth=10, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END class_weight=balanced, criterion=gini, max_depth=10, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2; total time=   0.1s
[CV] END class_weight=balanced, criterion=gini, max_depth=10, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2; total time=   0.1s
[CV] END class_weight=balanced, criterion=gini, max_depth=10, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=20; total time=   0.1s
[CV] END class_weight=balanced, criterion=gini, max_depth=10, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=10, min_samples_split=10; total time=   0.1s
[CV] END class_weight=balanced, criterion=gini, max_depth=10, max_features=sqrt, max_leaf_nodes=20, min_samples_leaf=1, min_

In [7]:
# Predict test labels with the best estimator found by the search.
# GridSearchCV.predict delegates to this estimator; it exists because the
# search was built with the default refit=True.
best_tree = grid_search.best_estimator_
final = best_tree.predict(test_df)

In [8]:
# Translate numeric predictions to labels: 1 -> "Stayed", anything else -> "Left".
predictions = np.where(final == 1, "Stayed", "Left").tolist()

In [9]:
# Pair every response id with its predicted label, one row per test record.
output_df = x.to_frame(name='response_id').assign(Predictions=predictions)

In [10]:
# Write the predictions to disk. The output directory is created first so a
# missing folder cannot fail the notebook after the grid search has already run.
output_path = Path('../output/decisiontrees_output.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(output_path, index=False)

In [11]:
# Show the hyperparameter combination that won the search.
best_params = grid_search.best_params_
print(best_params)

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_samples_leaf': 5, 'min_samples_split': 2}


In [12]:
# In-sample predictions: score the tuned model on the rows it was trained on.
train_features = train_df.drop(columns=['exit_status'])
y_predict = grid_search.predict(train_features)

In [13]:
from sklearn.metrics import f1_score

In [14]:
# NOTE: y_predict was computed on the training rows, so this F1 is an in-sample
# (optimistic) figure, not an estimate of performance on unseen data.
print("the f1 score is :" , f1_score(train_df['exit_status'],y_predict))

# Mean cross-validated F1 of the selected parameters: the held-out-fold score
# the grid search used to choose the model, and the fairer number to report.
print("the cross-validated f1 score is :", grid_search.best_score_)

the f1 score is : 0.7304265658747301
