In [51]:
#inserting the files into content folder, AFTER adding age_car in India Dataset

In [52]:
#library function downloads
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

In [53]:
# Read Japan (training) data
jp = pd.read_excel("/content/JPN Data.xlsx")

# Read India (scoring) data
ind = pd.read_excel("/content/IN_Data.xlsx")

In [54]:
jp.head()

Unnamed: 0,ID,CURR_AGE,GENDER,ANN_INCOME,AGE_CAR,PURCHASE
0,00001Q15YJ,50,M,445344.0,439,0
1,00003I71CQ,35,M,107634.0,283,0
2,00003N47FS,59,F,502786.666667,390,1
3,00005H41DE,43,M,585664.0,475,0
4,00007E17UM,39,F,705722.666667,497,1


In [55]:
ind.head()

Unnamed: 0,ID,CURR_AGE,GENDER,ANN_INCOME,DT_MAINT,AGE_CAR
0,20710B05XL,54,M,1425390,2018-04-20,437
1,89602T51HX,47,M,1678954,2018-06-08,388
2,70190Z52IP,60,M,931624,2017-07-31,700
3,25623V15MU,55,F,1106320,2017-07-31,700
4,36230I68CE,32,F,748465,2019-01-27,155


In [56]:
#categorizing data as needed
def age_car_segment(x):
    if x < 200:
        return 1
    elif x < 360:
        return 2
    elif x < 500:
        return 3
    else:
        return 4

jp["AGE_CAR_SEG"] = jp["AGE_CAR"].apply(age_car_segment)
ind["AGE_CAR_SEG"] = ind["AGE_CAR"].apply(age_car_segment)

In [57]:
#declaring data types to use
features = ["CURR_AGE", "ANN_INCOME", "GENDER", "AGE_CAR_SEG"]
target = "PURCHASE"
X = jp[features]
y = jp[target]
categorical_cols = ["GENDER", "AGE_CAR_SEG"]
numeric_cols = ["CURR_AGE", "ANN_INCOME"]

In [58]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("logreg", LogisticRegression(max_iter=1000))
    ]
)

In [59]:
#training model
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)
model.fit(X_train, y_train)

y_val_prob = model.predict_proba(X_val)[:, 1]
y_val_pred = (y_val_prob >= 0.5).astype(int)

print("ROC-AUC:", roc_auc_score(y_val, y_val_prob))
print(classification_report(y_val, y_val_pred))


ROC-AUC: 0.7588637840655974
              precision    recall  f1-score   support

           0       0.62      0.68      0.65      5091
           1       0.74      0.69      0.72      6909

    accuracy                           0.68     12000
   macro avg       0.68      0.68      0.68     12000
weighted avg       0.69      0.68      0.69     12000



In [60]:
#fine tuning for most appropriate threshold
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]

for t in thresholds:
    y_pred_t = (y_val_prob >= t).astype(int)
    print(f"\n===== Threshold: {t} =====")
    print(classification_report(y_val, y_pred_t))


===== Threshold: 0.3 =====
              precision    recall  f1-score   support

           0       0.82      0.16      0.27      5091
           1       0.61      0.97      0.75      6909

    accuracy                           0.63     12000
   macro avg       0.72      0.57      0.51     12000
weighted avg       0.70      0.63      0.55     12000


===== Threshold: 0.4 =====
              precision    recall  f1-score   support

           0       0.68      0.46      0.55      5091
           1       0.68      0.84      0.75      6909

    accuracy                           0.68     12000
   macro avg       0.68      0.65      0.65     12000
weighted avg       0.68      0.68      0.66     12000


===== Threshold: 0.5 =====
              precision    recall  f1-score   support

           0       0.62      0.68      0.65      5091
           1       0.74      0.69      0.72      6909

    accuracy                           0.68     12000
   macro avg       0.68      0.68      0.68 

In [61]:
CHOSEN_THRESHOLD = 0.6

In [62]:
model.fit(X, y)


In [63]:
X_ind = ind[features]

ind["P_CLIENT_PROB"] = model.predict_proba(X_ind)[:, 1]
ind["P_CLIENT_FLAG"] = (ind["P_CLIENT_PROB"] >= CHOSEN_THRESHOLD).astype(int)


In [64]:
total_customers = len(ind)
potential_clients = ind["P_CLIENT_FLAG"].sum()
percentage = potential_clients / total_customers * 100

print("Total customers:", total_customers)
print("Estimated potential clients:", potential_clients)
print("Percentage:", round(percentage, 2), "%")


Total customers: 70000
Estimated potential clients: 64984
Percentage: 92.83 %


In [65]:
ind["P_CLIENT_PROB"].describe()


Unnamed: 0,P_CLIENT_PROB
count,70000.0
mean,0.862597
std,0.146118
min,0.261229
25%,0.799064
50%,0.920639
75%,0.969384
max,0.997636


In [66]:
#getting finalised In_data.xlsx
final_cols = [
    "ID",
    "CURR_AGE",
    "GENDER",
    "ANN_INCOME",
    "AGE_CAR",
    "PURCHASE"
]

ind_final = ind[final_cols]

ind_final.to_excel(
    "/content/IND Data.xlsx",
    index=False
)
