### 1. Load libraries

I start by importing all the necessary libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### 2. Read in the data

Here I read in the dataset and save it in a dataframe. I also use some methods to visualise the data before I can preprocess it.

In [2]:
# Read the training set into a dataframe and preview its first five rows
train_df = pd.read_csv(filepath_or_buffer='trainingset.txt')
train_df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,44,JobCat9,single,secondary,no,29,yes,no,unknown,5,may,0,1,-1,0,unknown,TypeA
1,47,JobCat3,married,unknown,no,1506,yes,no,unknown,5,may,0,1,-1,0,unknown,TypeA
2,31,unknown,single,unknown,no,1,no,no,unknown,5,may,0,1,-1,0,unknown,TypeA
3,26,JobCat6,single,tertiary,no,447,yes,yes,unknown,5,may,0,1,-1,0,unknown,TypeA
4,42,JobCat4,divorced,tertiary,yes,2,yes,no,unknown,5,may,0,1,-1,0,unknown,TypeA


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
train_df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,27272.0,27272.0,27272.0,27272.0,27272.0,27272.0,27272.0
mean,39.880463,1361.079459,15.769287,0.0,2.739623,40.074105,0.578872
std,11.426248,3015.207142,8.300983,0.0,3.011097,100.200984,1.942882
min,16.0,-6847.0,1.0,0.0,1.0,-1.0,0.0
25%,31.0,73.0,8.0,0.0,1.0,-1.0,0.0
50%,37.0,447.0,16.0,0.0,2.0,-1.0,0.0
75%,48.0,1423.0,21.0,0.0,3.0,-1.0,0.0
max,95.0,98417.0,31.0,0.0,63.0,871.0,58.0


In [4]:
# Tally the missing (NaN) entries of every column
missing_mask = train_df.isna()
nan_values = missing_mask.sum()
print(nan_values)

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [5]:
# Per column, count the cells that hold the placeholder string 'unknown'
is_unknown = train_df == 'unknown'
unknown_counts = is_unknown.sum()
print(unknown_counts)

age              0
job            178
marital          0
education     1172
default          0
balance          0
housing          0
loan             0
contact       7897
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     22316
y                0
dtype: int64


In [6]:
# Number of distinct values held by each column of the dataset
unique_values = train_df.apply(pd.Series.nunique)
print(unique_values)


age            75
job            12
marital         3
education       4
default         2
balance      5939
housing         2
loan            2
contact         3
day            31
month          12
duration        1
campaign       41
pdays         499
previous       35
poutcome        4
y               2
dtype: int64


In [7]:
# Collect the distinct values of every categorical (object-typed) feature
categorical_columns = train_df.select_dtypes(include=['object']).columns
categorical_df = train_df[categorical_columns]
possible_values = categorical_df.apply(pd.Series.unique)
print(possible_values)

job          [JobCat9, JobCat3, unknown, JobCat6, JobCat4, ...
marital                            [single, married, divorced]
education              [secondary, unknown, tertiary, primary]
default                                              [no, yes]
housing                                              [yes, no]
loan                                                 [no, yes]
contact                         [unknown, cellular, telephone]
month        [may, jun, jul, aug, oct, nov, dec, jan, feb, ...
poutcome                    [unknown, other, failure, success]
y                                               [TypeA, TypeB]
dtype: object


In [8]:
# Show every distinct value of the 'job' column
job_categories = train_df['job'].unique()
print(job_categories)

['JobCat9' 'JobCat3' 'unknown' 'JobCat6' 'JobCat4' 'JobCat2' 'JobCat11'
 'JobCat7' 'JobCat8' 'JobCat10' 'JobCat1' 'JobCat5']


In [9]:
# How many rows carry each distinct 'poutcome' value
poutcome_column = train_df.loc[:, 'poutcome']
value_counts = poutcome_column.value_counts()

# Show the tally
print(value_counts)

poutcome
unknown    22316
failure     2998
other       1052
success      906
Name: count, dtype: int64


### 3. Data preprocessing

I defined a function called "preprocess_data" to prepare the data before fitting the model. This function can later be reused to prepare another dataframe created from the queries.txt file. One observation here is that I chose to first manually map some categorical values to integers to avoid creating too many columns with the get_dummies() method provided by pandas (this method automatically encodes categorical values). This also increased the accuracy of the model (by a very small margin, though).

In [10]:
def preprocess_data(df):
    """Turn a raw dataframe into the numeric feature table used by the model.

    The input dataframe is left untouched; all work happens on a copy, so the
    caller keeps its raw data and can call this function again on it.

    Steps:
      1. In every column containing the literal string 'unknown', replace it
         with that column's most frequent value. The mode is computed with
         'unknown' included, so a column whose most frequent value is itself
         'unknown' keeps those values; if that column is one-hot encoded in
         step 5, 'unknown' becomes a category of its own.
      2. Drop the 'duration' column.
      3. Map 'month', 'job', 'education', 'default', 'housing' and 'loan' to
         integers. Values without an entry in the mapping become NaN.
      4. Map the target 'y' to 0 ('TypeA') / 1 ('TypeB'); anything else
         becomes NaN.
      5. One-hot encode the remaining categorical columns with
         ``pd.get_dummies``.
      6. Move 'y' to the last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns 'duration', 'month', 'job',
        'education', 'default', 'housing', 'loan' and 'y'.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the encoded features followed by 'y'.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so the caller's dataframe is never modified
    df = df.copy()

    # Replace 'unknown' with the most frequent value in each column.
    # Columns without any 'unknown' are skipped: nothing would change, and it
    # avoids indexing an empty mode() on a column that holds only nulls.
    for column in df.columns:
        if not (df[column] == 'unknown').any():
            continue
        most_frequent_value = df[column].mode()[0]
        df[column] = df[column].replace('unknown', most_frequent_value)

    # Drop the column 'duration' as it has no value for the model
    df = df.drop(columns='duration')

    # Map months to integers
    month_mapping = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
    df['month'] = df['month'].map(month_mapping)

    # Map JobCat to integers
    job_mapping = {'JobCat1': 1, 'JobCat2': 2, 'JobCat3': 3, 'JobCat4': 4, 'JobCat5': 5, 'JobCat6': 6, 'JobCat7': 7, 'JobCat8': 8, 'JobCat9': 9, 'JobCat10': 10, 'JobCat11': 11}
    df['job'] = df['job'].map(job_mapping)

    # Map education to integers
    education_mapping = {'primary': 1, 'secondary': 2, 'tertiary': 3}
    df['education'] = df['education'].map(education_mapping)

    # Map yes/no values to integers
    yes_no_mapping = {'no': 0, 'yes': 1}
    for binary_column in ('default', 'housing', 'loan'):
        df[binary_column] = df[binary_column].map(yes_no_mapping)

    # Map target values to integers
    df['y'] = df['y'].map({'TypeA': 0, 'TypeB': 1})

    # Encode the rest of categorical variables
    df = pd.get_dummies(df)

    # Moves the target column back to the end of the dataframe
    y = df.pop('y')
    df['y'] = y

    return df

In [11]:
# Replace the raw frame with its preprocessed version. Because the name is
# reused, re-running this cell without first reloading the data with
# pd.read_csv fails: the 'duration' column that preprocess_data drops is gone.
train_df = preprocess_data(train_df)

### 4. Split Data

Here I split the data into a training set and a test set.

In [12]:
# The target is the 'y' column; every other column is a feature
y = train_df['y']
X = train_df.drop(columns=['y'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 5. Model Training

Given that this is a binary classification problem, Logistic Regression seems to be a good choice of model. I first tried scaling the data before training the model, but I found that increasing the maximum number of iterations gave me better results.

In [13]:
# Logistic Regression classifier with a raised iteration cap
model = LogisticRegression(max_iter=10_000)

# Fit it on the training portion of the data
model.fit(X=X_train, y=y_train)

### 6. Model Prediction and Evaluation - Test Data

Here I use the test data to see how accurate my model is.

In [14]:
# Predict the held-out rows and compare the result with their true labels
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f'Accuracy: {accuracy}')

Accuracy: 0.8894592117323556


### 7. Model Prediction - Queries

In the last step I use the queries.txt file to make predictions with the model I trained and write them to a file named with my student ID.

In [15]:
# Load the data
queries_df = pd.read_csv('queries.txt')

# Strip whitespace from the column names since the column "campaign" has a leading space
queries_df.columns = queries_df.columns.str.strip()

queries_df = preprocess_data(queries_df)

# Separate the features from the target variable
X_queries = queries_df.drop('y', axis=1)

# One-hot encoding is done independently on each file, so the dummy columns of
# the queries can differ from the training ones in set or order. Align them to
# the columns the model was fitted on: columns missing from the queries are
# added as 0, and columns never seen in training are dropped.
X_queries = X_queries.reindex(columns=X_train.columns, fill_value=0)

# Use the model to make predictions
predictions = model.predict(X_queries)

# Create a DataFrame with the query numbers (starting at 1) and predictions
solution_df = pd.DataFrame({
    'query': range(1, len(predictions) + 1),
    'prediction': predictions
})

# Map the target values back to their original form
solution_df['prediction'] = solution_df['prediction'].map({0: 'TypeA', 1: 'TypeB'})

# Save the DataFrame to a text file (no index column, no header row)
solution_df.to_csv('predictions.txt', index=False, header=False)

### 8. Post-Prediction Analysis

By looking at my resulting text file, I noticed that most of the time, when poutcome was equal to 'success', the predicted target was 'TypeB'. So I did the following analysis to confirm my theory.

In [16]:
# Count how often each label was predicted
type_counts = solution_df['prediction'].value_counts()

# Keep the query rows whose 'poutcome_success' dummy is set.
# astype(bool) works for both boolean and 0/1 integer dummy columns.
poutcome_success_df = X_queries[X_queries['poutcome_success'].astype(bool)]

# Select the matching predictions by index label; this relies on X_queries and
# solution_df sharing the same row index.
poutcome_success_predictions = solution_df.loc[poutcome_success_df.index]

# Count the number of instances where 'poutcome_success' is true and the result is 'TypeB'
num_typeB_success = (poutcome_success_predictions['prediction'] == 'TypeB').sum()

# Share of the 'poutcome_success' rows that were predicted 'TypeB'
# (NaN when there is no such row, instead of dividing by zero)
num_success_rows = len(poutcome_success_predictions)
proportion_typeB_success = (
    num_typeB_success / num_success_rows if num_success_rows else float('nan')
)

# .get avoids a KeyError when 'TypeB' was never predicted
print(f"Total number of 'TypeB': {type_counts.get('TypeB', 0)}")
print(f'Number of instances where poutcome_success is true and result is TypeB: {num_typeB_success}')
print(f'Proportion of instances where poutcome_success is true and result is TypeB: {proportion_typeB_success}')

Total number of 'TypeB': 545
Number of instances where poutcome_success is true and result is TypeB: 541
Proportion of instances where poutcome_success is true and result is TypeB: 0.8942148760330578
