In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
data = pd.read_csv('train.csv')

In [3]:
print(data.head()) 

   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0   7       0          2              2          5              1   
1   9       0          1              1          7              0   
2  13       0          5              4          9              1   
3  16       0          0              1          2              0   
4  17       0          0              2          0              1   

   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin  ...  \
0              0              0              1              0  ...   
1              0              0              0              1  ...   
2              0              0              0              1  ...   
3              0              1              0              0  ...   
4              0              1              0              0  ...   

   ps_calc_11  ps_calc_12  ps_calc_13  ps_calc_14  ps_calc_15_bin  \
0           9           1           5           8               0   
1           3           1 

In [4]:
data = data.dropna()  

In [5]:
data = pd.get_dummies(data, drop_first=True)

In [6]:
data = data.sample(frac=0.1, random_state=42)  

In [7]:
X = data.drop('target', axis=1)  
y = data['target']

In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  

In [9]:
pca = PCA(n_components=10)  
X_reduced = pca.fit_transform(X_scaled)  

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Initialize and train the model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [12]:
# Predictions
y_pred = lr_model.predict(X_test)

In [13]:
# Evaluate
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.9636287274254515
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     11472
           1       0.00      0.00      0.00       433

    accuracy                           0.96     11905
   macro avg       0.48      0.50      0.49     11905
weighted avg       0.93      0.96      0.95     11905



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Initialize and train the Random forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [15]:
# Predictions
y_pred_rf = rf_model.predict(X_test)

In [16]:
# Evaluate
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9636287274254515
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     11472
           1       0.00      0.00      0.00       433

    accuracy                           0.96     11905
   macro avg       0.48      0.50      0.49     11905
weighted avg       0.93      0.96      0.95     11905



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Initialize and train the Gradientboost model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

In [18]:
# Predictions
y_pred_gb = gb_model.predict(X_test)

In [19]:
# Evaluate
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))

Gradient Boosting Accuracy: 0.9634607307853843
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     11472
           1       0.00      0.00      0.00       433

    accuracy                           0.96     11905
   macro avg       0.48      0.50      0.49     11905
weighted avg       0.93      0.96      0.95     11905



In [20]:
# Initialixe and train the Decisiontree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

In [21]:
# Predictions
y_pred_dt = dt_model.predict(X_test)

In [22]:
# Evaluate
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

Decision Tree Accuracy: 0.9207895842083158


In [24]:
# Initialixe and train the SVM model
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

In [25]:
# Predictions
y_pred_svm = svm_model.predict(X_test)

In [26]:
# Evaluate
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Accuracy: 0.9636287274254515


In [35]:
# Initialixe and train the Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

In [36]:
# Predictions
y_pred_nb = nb_model.predict(X_test)

In [37]:
# Evaluate
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.9636287274254515


In [43]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'Gradient Boosting', 
              'Decision Tree', 'SVM', 'Naive Bayes'],
    'Accuracy': [accuracy_score(y_test, y_pred), accuracy_score(y_test, y_pred_rf),
                 accuracy_score(y_test, y_pred_gb), accuracy_score(y_test, y_pred_dt),
                 accuracy_score(y_test, y_pred_svm), accuracy_score(y_test, y_pred_nb)]
})

# Sort by accuracy in descending order and print the results
print(results.sort_values(by='Accuracy', ascending=False))



                 Model  Accuracy
0  Logistic Regression  0.963629
1        Random Forest  0.963629
4                  SVM  0.963629
5          Naive Bayes  0.963629
2    Gradient Boosting  0.963461
3        Decision Tree  0.920790


## Based on the accuracy scores, computational complexity, and interpretability, Logistic Regression and Random Forest are the most suitable models for production.

# challenges faced 
High Dimensionality: Used PCA to reduce 59 features to 10 principal components, retaining variance and speeding up training.

Large Dataset: Sampled 10% of the data (59,521 observations) to reduce computational cost while preserving dataset representativeness.

Imbalanced Features and Encoding: Applied one-hot encoding with drop_first=True to handle high cardinality and avoid multicollinearity.

Training Time: Reduced features with PCA, used StandardScaler for SVM normalization, and experimented with smaller datasets for faster benchmarking.

Data Consistency: Ensured transformations (PCA, scaling) were fitted on training data and applied consistently to both training and test datasets.