In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

## Getting dataset

In [3]:
df = pd.read_csv('bank-full.csv', delimiter=';')

In [5]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [19]:
all_columns = df.columns

df.dtypes

## Features

In [25]:
# Specify the columns you want to exclude
exclude_columns = ['default', 'loan']

## Data preparation
 - Select only the features from above.
 - Check if the missing values are presented in the features.

In [26]:
# Use list comprehension to create a new list excluding the unwanted columns
selected_columns = [col for col in all_columns if col not in exclude_columns]
selected_columns

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [33]:
selected_columns = df[selected_columns]

In [37]:
selected_columns.isnull().mean()

age          0.0
job          0.0
marital      0.0
education    0.0
balance      0.0
housing      0.0
contact      0.0
day          0.0
month        0.0
duration     0.0
campaign     0.0
pdays        0.0
previous     0.0
poutcome     0.0
y            0.0
dtype: float64

## Question 1
 - What is the most frequent observation (mode) for the column education?

In [41]:
selected_columns['education']

0         tertiary
1        secondary
2        secondary
3          unknown
4          unknown
           ...    
45206     tertiary
45207      primary
45208    secondary
45209    secondary
45210    secondary
Name: education, Length: 45211, dtype: object

# Find the mode of the 'education' column in the selected_columns DataFrame
mode_education = selected_columns['education'].mode()[0]
mode_education

## Question 2
 - Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between  every pair of features.

 - What are the two features that have the biggest correlation?

In [48]:
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [51]:
# Assuming 'selected_columns' is your DataFrame
# List of numerical features
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Step 1: Filter the DataFrame to include only the numerical features
numerical_df = selected_columns[numerical]

# Step 2: Compute the correlation matrix
correlation_matrix = numerical_df.corr()

# Step 3: Stack the correlation matrix and find the highest correlation
correlation_values = correlation_matrix.stack().reset_index()
correlation_values.columns = ['Feature 1', 'Feature 2', 'Correlation']

# Exclude self-correlation
correlation_values = correlation_values[correlation_values['Feature 1'] != correlation_values['Feature 2']]

# Find the two features with the highest correlation
top_correlation = correlation_values.sort_values(by='Correlation', ascending=False).head(1)

# Print the result
top_correlation

Unnamed: 0,Feature 1,Feature 2,Correlation
47,previous,pdays,0.45482


## Target encoding
 - Now we want to encode the y variable.
 - Let's replace the values yes/no with 1/0.

In [54]:
# Target encoding with .loc to avoid SettingWithCopyWarning
selected_columns.loc[:, 'y'] = selected_columns['y'].replace({'yes': 1, 'no': 0})

In [58]:
from sklearn.model_selection import train_test_split

# Features and target variable
X = selected_columns.drop('y', axis=1)  # Features
y = selected_columns['y']  # Target variable

# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)  # 60% train, 40% temp
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 20% val, 20% test


In [59]:
from sklearn.feature_selection import mutual_info_classif

# Since we are working with categorical variables, ensure they are properly encoded
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Calculate mutual information scores
mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features='auto')

# Create a DataFrame for better visualization
mi_scores_df = pd.DataFrame(mi_scores, index=X_train_encoded.columns, columns=['MI Score'])
mi_scores_df = mi_scores_df.sort_values(by='MI Score', ascending=False)

# Round the scores to 2 decimals
mi_scores_df['MI Score'] = mi_scores_df['MI Score'].round(2)

# Display the mutual information scores
print(mi_scores_df)


                     MI Score
duration                 0.07
poutcome_success         0.03
pdays                    0.03
balance                  0.02
contact_unknown          0.02
previous                 0.01
housing_yes              0.01
age                      0.01
poutcome_unknown         0.01
month_may                0.01
month_oct                0.01
month_sep                0.00
day                      0.00
job_retired              0.00
job_blue-collar          0.00
campaign                 0.00
month_mar                0.00
marital_married          0.00
education_secondary      0.00
month_jul                0.00
education_tertiary       0.00
poutcome_other           0.00
month_aug                0.00
marital_single           0.00
job_student              0.00
month_jan                0.00
job_management           0.00
month_dec                0.00
job_unknown              0.00
month_feb                0.00
job_unemployed           0.00
contact_telephone        0.00
education_

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Check available columns in selected_columns
print(selected_columns.columns)

# Assuming selected_columns is your DataFrame with encoded target variable 'y'
X = selected_columns.drop('y', axis=1)  # Features
y = selected_columns['y']  # Target variable

# Modify the categorical columns list based on available columns
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 
                    'loan', 'contact', 'month', 'poutcome']

# Filter the categorical columns to include only those present in the DataFrame
categorical_cols = [col for col in categorical_cols if col in X.columns]

# Perform one-hot encoding on available categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Split the data into train, validation, and test sets
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train the Logistic Regression Model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val)

# Calculate the accuracy
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {round(accuracy, 2)}')


Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'y'],
      dtype='object')
Validation Accuracy: 0.9


In [69]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assuming selected_columns is your DataFrame and y has been target-encoded

# Step 1: Define the features, including 'marital'
features = ['age', 'balance', 'previous', 'marital']

# Step 2: One-hot encode categorical variables
X = pd.get_dummies(selected_columns[features], drop_first=True)

# Step 3: Separate the target variable
y = selected_columns['y']

# Step 4: Split the data into train/val/test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 5: Train the initial model using all the features
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
original_accuracy = accuracy_score(y_val, model.predict(X_val))

# Step 6: Feature elimination
results = {}

# Evaluate the model by dropping each feature one at a time
for feature in features:
    # Check if the feature exists in the columns after one-hot encoding
    if feature in X_train.columns:
        # Drop the feature
        X_train_temp = X_train.drop(feature, axis=1)
        X_val_temp = X_val.drop(feature, axis=1)
        
        model.fit(X_train_temp, y_train)
        accuracy_without_feature = accuracy_score(y_val, model.predict(X_val_temp))
        
        # Calculate the difference in accuracy
        difference = original_accuracy - accuracy_without_feature
        results[feature] = difference
    else:
        print(f"Feature '{feature}' not found after one-hot encoding.")

# Find the feature with the smallest difference
least_useful_feature = min(results, key=results.get)
least_difference = results[least_useful_feature]

# Output the result
print(f"The least useful feature is: {least_useful_feature} with a difference of {least_difference:.4f}")


Feature 'marital' not found after one-hot encoding.
The least useful feature is: previous with a difference of -0.0012


In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assuming selected_columns is your DataFrame and y has been target-encoded

# Step 1: One-hot encode categorical variables and separate features and target
features = ['age', 'balance', 'previous', 'marital']
X = pd.get_dummies(selected_columns[features], drop_first=True)
y = selected_columns['y']

# Step 2: Split the data into train/val/test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 3: Define the values for C
C_values = [0.01, 0.1, 1, 10, 100]
best_accuracy = 0
best_C = None

# Step 4: Train and evaluate models for each value of C
for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    # Calculate accuracy on the validation set
    accuracy = accuracy_score(y_val, model.predict(X_val))
    rounded_accuracy = round(accuracy, 3)
    
    print(f"C: {C}, Validation Accuracy: {rounded_accuracy}")
    
    # Update best accuracy and best C if the current one is better or equal and smaller
    if rounded_accuracy > best_accuracy or (rounded_accuracy == best_accuracy and (best_C is None or C < best_C)):
        best_accuracy = rounded_accuracy
        best_C = C

# Step 5: Output the best C and its accuracy
print(f"The best C is: {best_C} with a validation accuracy of: {best_accuracy}")


C: 0.01, Validation Accuracy: 0.883
C: 0.1, Validation Accuracy: 0.883
C: 1, Validation Accuracy: 0.883
C: 10, Validation Accuracy: 0.883
C: 100, Validation Accuracy: 0.883
The best C is: 0.01 with a validation accuracy of: 0.883
