# Initial Training

In [3]:
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from itertools import combinations
from sklearn.utils import resample
import matplotlib.pyplot as plt
import time

campaign = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv')
mini_holdout = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test_mini.csv')
# campaign.info()

# Separate majority and minority classes
campaign_majority = campaign[campaign['y'] == campaign['y'].value_counts().idxmax()]
campaign_minority = campaign[campaign['y'] == campaign['y'].value_counts().idxmin()]

# Oversample
campaign_minority_oversampled = resample(
    campaign_minority,
    replace=True,                  # Sample with replacement
    n_samples=len(campaign_majority),    # Match majority class size
    random_state=42                # Reproducibility
)

# Combine back to a balanced dataset
campaign_balanced = pd.concat([campaign_majority, campaign_minority_oversampled])

# Shuffle the dataset
campaign_balanced = campaign_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
'''
##### Features #####

## Client data:
- age (numeric)
- job : type of job (categorical (12): "admin.", "blue-collar", "entrepreneur", "housemaid", "management", "retired", "self-employed", "services", "student", "technician", "unemployed", "unknown")
- marital : marital status (categorical (4): "divorced", "married", "single", "unknown"; note: "divorced" means divorced or widowed)
- education (categorical (8): "basic.4y", "basic.6y", "basic.9y", "high.school", "illiterate", "professional.course", "university.degree", "unknown")
- default: has credit in default? (categorical (3): "no", "yes", "unknown")
- housing: has housing loan? (categorical (3): "no", "yes", "unknown")
- loan: has personal loan? (categorical (3): "no", "yes", "unknown")

## Related with the last contact of the current campaign:
- contact: contact communication type (categorical (2): "cellular", "telephone")
- month: last contact month of year (categorical (12): "jan", "feb", "mar", ..., "nov", "dec")
- dayofweek: last contact day of the week (categorical (5): "mon", "tue", "wed", "thu", "fri")

## Other attributes:
- campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
- pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
- previous: number of contacts performed before this campaign and for this client (numeric)
- poutcome: outcome of the previous marketing campaign (categorical (3): "failure", "nonexistent", "success")

## Social and economic context attributes
- emp.var.rate: employment variation rate - quarterly indicator (numeric)
- cons.price.idx: consumer price index - monthly indicator (numeric)
- cons.conf.idx: consumer confidence index - monthly indicator (numeric)
- euribor3m: euribor 3 month rate - daily indicator (numeric)
- nr.employed: number of employees - quarterly indicator (numeric)

## Output variable (desired
- y - has the client subscribed a term deposit? (binary: "yes","no")
'''

features = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',\
            'contact', 'month', 'day_of_week',\
            'campaign', 'pdays', 'previous', 'poutcome',\
            'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
X = pd.get_dummies(campaign_balanced[features], drop_first=True)
y = campaign_balanced['y']

'''
##### This is how I tested a bunch of different models, so it's commented out.
##### The best one so far is below this block.
test_sizes = [0.3, 0.4, 0.5]
# 0.1: 90.073
# 0.2: 90.123
# 0.3: 90.196
# 0.4: 90.104
# 0.5: 90.056
# This is possibly worth refining.
params = {
    'criterion': ["gini", "entropy"], # gini seems better more often
    'splitter': ["best", "random"], # random is consistently better
    'max_depth': [3, 5, 10, 15, None], # 5 is consistently best
    'min_samples_split': [10, 20, 50, 100], # 10
    'min_samples_leaf': [5, 10, 20, 50], # 20
    'random_state': [0, 1, 2, 3, 4]
}
gs_cvs = []
total_start = time.time()
for ts in test_sizes:
    subtime_start = time.time()
    # ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ts, random_state=1)
    grid_search = GridSearchCV(DecisionTreeClassifier(), params, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)
    gs_cvs.append([ts, grid_search])
    # subtime = time.time() - subtime_start
    print(f'Test size {ts} done in {(time.time() - subtime_start):.2f} s.')
    # clf = DecisionTreeClassifier(random_state=5)
    # clf.fit(X_train, y_train)
    # print(clf.score(X_test, y_test))
print(f'All tests done in {(time.time() - total_start):.2f} s.')
print('Done.')
'''

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
clf = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=None, min_samples_split=10, min_samples_leaf=5, random_state=1)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean())

0.9364551039427886
0.8602430135217725


In [5]:
fig, ax = plt.subplots(figsize=(20, 20))
tree.plot_tree(clf, fontsize=10, feature_names=X.columns)
plt.show()

KeyboardInterrupt: 

In [6]:
mini_holdout_X = pd.get_dummies(mini_holdout[features], drop_first=True).reindex(columns = X.columns, fill_value=0)
pd.DataFrame([{"no": 0, "yes": 1}[label] for label in clf.predict(mini_holdout_X)], columns=['predictions']).to_csv("team5-module2-predictions.csv", index=False)

# New Section