## Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
from sklearn.exceptions import ConvergenceWarning

In [2]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## Loading the dataset

In [3]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


## Splitting the dataset into features (X) and target (y)

In [4]:
X = data.drop(['ID_code', 'target'], axis=1)
y = data['target']

## Splitting the data into training and testing sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Logistic Regression Model (Increased Max Iterations)

In [6]:
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)
logreg_predictions = logreg_model.predict(X_test)

## Scaling Features and Logistic Regression Model

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
logreg_model_scaled = LogisticRegression(solver='liblinear')
logreg_model_scaled.fit(X_train_scaled, y_train)
logreg_predictions_scaled = logreg_model_scaled.predict(X_test_scaled)

## Decision Tree Model

In [9]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

## Logistic Regression Predictions with Probabilities

In [10]:
logreg_probabilities = logreg_model.predict_proba(X_test)[:, 1]
logreg_predictions_lr = pd.DataFrame({'Probability_LR': logreg_probabilities, 'Predicted_Class_LR': logreg_predictions})
logreg_predictions_lr['Predicted_Transaction_LR'] = (logreg_predictions_lr['Predicted_Class_LR'] == 1).astype(int)

## Decision Tree Predictions with Probabilities

In [11]:
dt_probabilities = dt_model.predict_proba(X_test)[:, 1]
dt_predictions_df = pd.DataFrame({'Probability_DT': dt_probabilities, 'Predicted_Class_DT': dt_predictions})
dt_predictions_df['Predicted_Transaction_DT'] = (dt_predictions_df['Predicted_Class_DT'] == 1).astype(int)

## Function to Evaluate Model Performance

In [12]:
def evaluate_model(model_name, y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    class_report = classification_report(y_true, y_pred)
    
    print(f'{model_name} Model:')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Confusion Matrix:\n{conf_matrix}')
    print(f'Classification Report:\n{class_report}')
    print('\n')


## Evaluating Model Performance

In [13]:
evaluate_model('Logistic Regression (Increased Max Iter)', y_test, logreg_predictions)
evaluate_model('Logistic Regression (Scaled Features)', y_test, logreg_predictions_scaled)
evaluate_model('Decision Tree', y_test, dt_predictions)

Logistic Regression (Increased Max Iter) Model:
Accuracy: 0.9128
Confusion Matrix:
[[35424   479]
 [ 3010  1087]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     35903
           1       0.69      0.27      0.38      4097

    accuracy                           0.91     40000
   macro avg       0.81      0.63      0.67     40000
weighted avg       0.90      0.91      0.89     40000



Logistic Regression (Scaled Features) Model:
Accuracy: 0.9131
Confusion Matrix:
[[35421   482]
 [ 2994  1103]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     35903
           1       0.70      0.27      0.39      4097

    accuracy                           0.91     40000
   macro avg       0.81      0.63      0.67     40000
weighted avg       0.90      0.91      0.90     40000



Decision Tree Model:
Accuracy: 0.8357
Confusion Matrix:
[[32542  3361]


## Displaying Predictions from Decision Tree

In [14]:
print('\nIdentifying customers who will make transactions using Decision Tree:')
print(dt_predictions_df[['Probability_DT', 'Predicted_Class_DT', 'Predicted_Transaction_DT']])


Identifying customers who will make transactions using Decision Tree:
       Probability_DT  Predicted_Class_DT  Predicted_Transaction_DT
0                 0.0                   0                         0
1                 0.0                   0                         0
2                 0.0                   0                         0
3                 0.0                   0                         0
4                 0.0                   0                         0
...               ...                 ...                       ...
39995             1.0                   1                         1
39996             0.0                   0                         0
39997             0.0                   0                         0
39998             0.0                   0                         0
39999             0.0                   0                         0

[40000 rows x 3 columns]


## Displaying Predictions from Logistic Regression

In [15]:
print('Identifying customers who will make transactions using Logistic Regression:')
print(logreg_predictions_lr[['Probability_LR', 'Predicted_Class_LR', 'Predicted_Transaction_LR']])

Identifying customers who will make transactions using Logistic Regression:
       Probability_LR  Predicted_Class_LR  Predicted_Transaction_LR
0            0.000654                   0                         0
1            0.005314                   0                         0
2            0.005321                   0                         0
3            0.069893                   0                         0
4            0.331407                   0                         0
...               ...                 ...                       ...
39995        0.040395                   0                         0
39996        0.009847                   0                         0
39997        0.257032                   0                         0
39998        0.400005                   0                         0
39999        0.012050                   0                         0

[40000 rows x 3 columns]
