In [2]:
# dataframe and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [3]:
train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')

In [4]:
def remove_duplicates(df, cols):
    # Drop specified columns
    df_clean = df.drop(columns=cols)
    
    # Calculate the total number of rows in the DataFrame
    total_rows = len(df_clean)
    
    # Calculate the number of duplicate rows
    duplicate_rows = df_clean.duplicated().sum()
    
    # Calculate the percentage of duplicate rows
    duplicate_percentage = (duplicate_rows / total_rows) * 100
    
    df_clean = df_clean.drop_duplicates()
    
    # Display the percentage of duplicate rows
    print(f"The number of duplicated rows after removing {cols} is {duplicate_rows}")
    print(f"The percentage of duplicate rows in the dataset after removing {cols} is {duplicate_percentage.round(5)}")
    print(f"The shape of the cleaned dataset after removing {cols} and duplicated rows is is {df_clean.shape}")
    
    # Check the distribution of the bank_account variable
    bank = df_clean['bank_account'].value_counts(normalize=True).round(5) * 100
    # Display the distribution
    print("Bank Account Distribution:")
    print(bank)
    
    return df_clean

In [5]:
def undersample_balance_dataset(df, minority_ratio=0.5, target_variable='bank_account'):
    """
    Undersamples the majority class in a DataFrame to balance the dataset.

    Parameters:
    - df: DataFrame containing the dataset.
    - minority_ratio: Ratio of the minority class in the balanced dataset. Default is 0.5.
    - target_variable: Name of the target variable. Default is 'bank_account'.

    Returns:
    - balanced_df: DataFrame containing the balanced dataset.
    """
    
  # Count the number of samples in each class
    class_counts = df[target_variable].value_counts(normalize = True)
    print(class_counts)

    # Determine the minority and majority classes
    minority_class = class_counts.idxmin()
    majority_class = class_counts.idxmax()

    # Separate the dataframe into minority and majority classes
    minority_df = df[df[target_variable] == minority_class]
    majority_df = df[df[target_variable] == majority_class]

    majority_size = int(minority_ratio*minority_df.shape[0]/(1-minority_ratio))
    # Sample from majority class to match minority class ratio
    majority_sampled = majority_df.sample(majority_size)
    
    # Concatenate minority and sampled majority class
    balanced_df = pd.concat([minority_df, majority_sampled])
    class_counts = balanced_df[target_variable].value_counts()
    print(class_counts)
    
    # Shuffle the balanced dataframe
    balanced_df = balanced_df.sample(frac=1).reset_index(drop=True)
    return balanced_df

In [6]:
df_clean3 = remove_duplicates(train, ['uniqueid','country', 'year'])

#balanced_df = undersample_balance_dataset(df_clean3, minority_ratio=0.60, target_variable='bank_account')

back = df_clean3.copy()

The number of duplicated rows after removing ['uniqueid', 'country', 'year'] is 5310
The percentage of duplicate rows in the dataset after removing ['uniqueid', 'country', 'year'] is 22.57269
The shape of the cleaned dataset after removing ['uniqueid', 'country', 'year'] and duplicated rows is is (18214, 10)
Bank Account Distribution:
bank_account
No     83.03
Yes    16.97
Name: proportion, dtype: float64


In [9]:
from sklearn.preprocessing import OneHotEncoder

has_bank_account = back.bank_account
balanced_df = back.drop("bank_account", axis=1)
categorical_cols = balanced_df.select_dtypes(include=['object']).columns

# Select only the columns with object dtype
categorical_df = balanced_df[categorical_cols]

ohe = OneHotEncoder(sparse=False, drop='first')  # drop='first' to avoid multicollinearity
encoded_features = pd.DataFrame(ohe.fit_transform(balanced_df[categorical_cols]))
encoded_features.columns = ohe.get_feature_names_out(categorical_cols)
bal_df_clean = balanced_df.drop(categorical_cols, axis=1, inplace=True)  # Drop original categorical columns
bal_df_clean = pd.concat([bal_df_clean, encoded_features], axis=1)  # Concatenate encoded features

newtrain = bal_df_clean.copy()

has_bank_account = has_bank_account.replace({'Yes': 1, 'No': 0})
newtrain = pd.concat([newtrain, has_bank_account.reindex(bal_df_clean.index)], axis=1)

#print(newtrain)
newtrain.to_csv('balanced_df_clean.csv', index=False)

baba = df_clean3.copy()
baba_bank = baba.bank_account
baba_bank = baba_bank.replace({'Yes': 1, 'No': 0})
baba = baba.drop("bank_account", axis=1)
categorical_cols = baba.select_dtypes(include=['object']).columns

categorical_df = baba[categorical_cols]
ohe = OneHotEncoder(sparse=False, drop='first')  # drop='first' to avoid multicollinearity
encoded_features = pd.DataFrame(ohe.fit_transform(baba[categorical_cols]))
encoded_features.columns = ohe.get_feature_names_out(categorical_cols)
baba_clean = baba.drop(categorical_cols, axis=1, inplace=True)  # Drop original categorical columns
baba_clean = pd.concat([baba_clean, encoded_features], axis=1)  # Concatenate encoded features

imba_train = pd.concat([baba_clean, baba_bank], axis=1)



In [None]:
#It is a yes/no question of classification so logistic regression makes sense 


# penalty: l1 shrinks coefficients of less important features towards zero or even zero
# use it if we believe some features are irrelevant
# l2 shrinks all coefficients towards zero but not necessarily zero
# reduces overall magnitude of coefficients and prevents overfitting 
# probably going to use this because EDA already chose important features 

#c model complexity 
#from 0.001 0.01 0.1 1 10 100 1000
#search for optimal values to get most of our training data


# sag or saga, liblinear if our data is sparse
# trial an error it 



Accuracy on train data:  0.7603947581297524
Accuracy on test data:  0.7639068564036222
              precision    recall  f1-score   support

          No       0.78      0.84      0.81       928
         Yes       0.73      0.64      0.69       618

    accuracy                           0.76      1546
   macro avg       0.76      0.74      0.75      1546
weighted avg       0.76      0.76      0.76      1546



In [65]:
def random_forest_tester(data, target, rseed):
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=rseed)
    dtree = RandomForestClassifier(random_state=rseed)
    dtree.fit(X_train, y_train)
    param_grid = {
        'n_estimators': np.arange(150, 501).astype(int),  #amount of trees, more trees more accuracy
        'max_depth': [None] + list(np.arange(5, 15).astype(int)),  # maximum depth of each trees, overfitting if too high
        'max_features': ['auto', 'sqrt', None],  # amount of features considered per split
        'max_leaf_nodes': list(np.arange(15, 31).astype(int)),  # maximum number of end nodes
        'min_samples_split': [3, 5, 12, 17],  # minimum samples required to split a node
        'bootstrap': [True, False]  # whether we bootstrap or not 
    }
    rs = RandomizedSearchCV(dtree, param_grid, n_jobs = -1, scoring="roc_auc", cv=5, n_iter = 15, verbose=0, random_state = 42)
    rs.fit(X_train, y_train)
    best_model = rs.best_estimator_

    y_train_pred = best_model.predict(X_train)
    print("Accuracy on train data: ", accuracy_score(y_train, y_train_pred))
    print(classification_report(y_train, y_train_pred))
    print("----------")
    y_test_pred = best_model.predict(X_test)
    print("Accuracy on test data: ", accuracy_score(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))


In [93]:
random_forest_tester(newtrain, has_bank_account, 42)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Accuracy on train data:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3689
           1       1.00      1.00      1.00      2492

    accuracy                           1.00      6181
   macro avg       1.00      1.00      1.00      6181
weighted avg       1.00      1.00      1.00      6181

----------
Accuracy on test data:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       947
           1       1.00      1.00      1.00       599

    accuracy                           1.00      1546
   macro avg       1.00      1.00      1.00      1546
weighted avg       1.00      1.00      1.00      1546



In [91]:
X = newtrain.drop("bank_account", axis=1)
y = newtrain.bank_account
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.utils.class_weight import compute_class_weight
class_weights = {0:0.7, 1:1.0}
weighted_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000, class_weight=class_weights)

lr_param_grid = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-4, 4, 4),
    'solver': ['liblinear', 'saga', 'lbfgs']
}

weighted_search = RandomizedSearchCV(weighted_model, lr_param_grid, n_jobs = -1, scoring="f1_macro", cv=3, n_iter = 10, verbose=2, random_state = 42)
weighted_search.fit(X_train, y_train)

weighted_best_params = weighted_search.best_params_

weighted_best_model = weighted_search.best_estimator_

weighted_train_rs_predictions = weighted_best_model.predict(X_train)
weighted_test_rs_predictions = weighted_best_model.predict(X_test)

print("Accuracy on train data: ", accuracy_score(y_train, weighted_train_rs_predictions))
print("Accuracy on test data: ", accuracy_score(y_test, weighted_test_rs_predictions))
print(classification_report(y_test, weighted_test_rs_predictions))


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ...C=0.046415888336127774, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END .............C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ...C=0.046415888336127774, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END .............C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .............C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .......C=21.54434690031882, penalty=l2, solver=saga; total time=   0.1s
[CV] END ...C=0.046415888336127774, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ...C=0.046415888336127774, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ...C=0.046415888336127774, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ............C=10000.0, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ...C=0.046415888336127774, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END C=0.046415888336127774, penalty=l2, sol

3 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/riael/.pyenv/versions/3.11.3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/riael/.pyenv/versions/3.11.3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/riael/.pyenv/versions/3.11.3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    

Accuracy on train data:  0.7502022326484388
Accuracy on test data:  0.7470892626131953
              precision    recall  f1-score   support

           0       0.81      0.76      0.79       947
           1       0.66      0.73      0.69       599

    accuracy                           0.75      1546
   macro avg       0.74      0.74      0.74      1546
weighted avg       0.75      0.75      0.75      1546



In [None]:
#can also try xgboost

In [None]:
def logistic_tester(data, target, random_seed, stratify=0):
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report
    
    if stratify == 1:
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=random_seed, stratify=target)
    else:
        X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=random_seed)
    

    model = LogisticRegression(penalty='l1', C=1, solver='saga', max_iter=1000)
    model.fit(X_train, y_train)
    
    # Make predictions for y_train
    y_train_pred = model.predict(X_train)
    
    # Print the accuracy of the predictions for train data
    print("Accuracy on train data:", accuracy_score(y_train, y_train_pred))
    
    # Print the classification report for the train data
    print("Classification report on train data:")
    print(classification_report(y_train, y_train_pred))
    
    print("----------")
    
    # Make predictions for y_test
    y_test_pred = model.predict(X_test)
    
    # Print the accuracy of the predictions for test data
    print("Accuracy on test data:", accuracy_score(y_test, y_test_pred))
    
    # Print the classification report for the test data
    print("Classification report on test data:")
    print(classification_report(y_test, y_test_pred))
