In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file found beneath the input directory,
# then print them one per line.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the competition's training and test tables.
train_csv_path = '/kaggle/input/sahaay-dc-training-assignment-8-part-1/train.csv'
test_csv_path = '/kaggle/input/sahaay-dc-training-assignment-8-part-1/test.csv'

# Read both tables into DataFrames.
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)


In [None]:
# Columns that must not be fed to the model as features.
id_column = 'row_ID'
label_column = 'target'

# Training features: everything except the label and the row identifier.
X = df_train.drop(columns=[label_column, id_column])
# Training label.
y = df_train[label_column]

# Test features: the test table has no label here, so only the identifier is removed.
X_test = df_test.drop(columns=[id_column])

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

In [None]:
# Model: standardise the features, then fit an elastic-net-regularised logistic regression.
logreg = Pipeline([
    # Regularised, gradient-based fitting is sensitive to feature scale, so standardise first.
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="elasticnet",
        l1_ratio=0.2,             # mix of the two penalties: 20% L1, 80% L2
        C=0.01,                   # inverse regularisation strength; small C = strong penalty
        solver="saga",            # the only solver that supports the elastic-net penalty
        max_iter=8000,            # generous cap so saga has room to converge
        # NOTE: n_jobs was dropped on purpose. LogisticRegression only parallelises across
        # classes in one-vs-rest mode, so for this binary target it had no effect.
        class_weight='balanced',  # reweight classes inversely to their frequency
        random_state=42           # saga shuffles the data, so fix the seed
    ))
])

In [None]:
# 5-fold stratified cross-validation; rows are shuffled with a fixed seed so the
# folds are the same on every run.
cv_settings = {"n_splits": 5, "shuffle": True, "random_state": 42}
skf = StratifiedKFold(**cv_settings)

In [None]:
# Per-fold search for the decision threshold that maximises validation F1.
#
# For every stratified fold the pipeline is refit on the training part, scored on the
# validation part, and each candidate threshold in `ts` is evaluated. The best threshold
# and its F1 are stored per fold in `thresholds` and `f1s`.

# Candidate decision thresholds. The grid is the same for every fold, so build it once.
ts = np.linspace(0.10, 0.25, 100)

thresholds = []  # best threshold found in each fold
f1s = []         # validation F1 reached at that threshold

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold+1}")

    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    # Refit the whole pipeline (scaler + classifier) on this fold's training rows only.
    logreg.fit(X_tr, y_tr)

    # Column 1 of predict_proba is used as the score that gets thresholded.
    y_val_prob = logreg.predict_proba(X_val)[:, 1]

    # F1 of the thresholded predictions for every candidate in the grid.
    fold_f1 = np.array(
        [f1_score(y_val, (y_val_prob >= t).astype(int)) for t in ts]
    )

    # argmax returns the first maximum (same tie-breaking as a strict ">" scan) and
    # always lands on a grid point, even if every candidate scores F1 == 0. The previous
    # sentinel of 0 lay outside the grid and could leak into the median.
    best_idx = int(np.argmax(fold_f1))
    best_t, best_f1 = ts[best_idx], fold_f1[best_idx]

    print(f"  Best threshold: {best_t:.3f}")
    print(f"  Best F1: {best_f1:.4f}")

    # An optimum on the boundary suggests the true optimum may lie outside the grid.
    if best_idx in (0, len(ts) - 1):
        print("  Note: best threshold is on the edge of the search grid; consider widening it")

    thresholds.append(best_t)
    f1s.append(best_f1)

In [None]:
# Summarise the cross-validation results and pick the threshold used for the test set.
thresholds = np.array(thresholds)
fold_f1_scores = np.asarray(f1s, dtype=float)

print("\nCV threshold summary")
print("Thresholds:", thresholds)
print("Mean:", thresholds.mean())
print("Median:", np.median(thresholds))
print("Std:", thresholds.std())

# Report the F1 scores gathered during CV as well. Each one was measured on the same
# fold its threshold was tuned on, so treat them as optimistic estimates.
print("\nCV F1 summary (at each fold's own best threshold)")
print("F1s:", fold_f1_scores)
print("Mean:", fold_f1_scores.mean())
print("Std:", fold_f1_scores.std())

# The median is less sensitive than the mean to a single outlying fold.
final_threshold = np.median(thresholds)
# Optional: scale final_threshold by a factor below 1 (e.g. 0.9) to bias toward recall.

In [None]:
# Refit the pipeline on every labelled row, then score the test features.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
fitted_model = logreg.fit(X, y)
test_probabilities = fitted_model.predict_proba(X_test)

# Column 1 is thresholded with the value chosen during cross-validation;
# presumably this is the positive class (second entry of classes_) — confirm.
y_test_prob = test_probabilities[:, 1]
y_test_pred = np.greater_equal(y_test_prob, final_threshold).astype(int)

In [None]:
# Build submission.csv from the competition's template and the test predictions.

# Load the submission template shipped with the competition data.
sample_sub = pd.read_csv("/kaggle/input/sahaay-dc-training-assignment-8-part-1/sample_submission.csv")

# Predictions are attached by position, so the template must match the test set row for
# row. Fail loudly here instead of silently writing misaligned labels.
assert len(sample_sub) == len(y_test_pred), (
    f"sample_submission has {len(sample_sub)} rows but there are {len(y_test_pred)} predictions"
)
# Only checked when the template carries the identifier column (assumed name: row_ID).
if "row_ID" in sample_sub.columns:
    assert (sample_sub["row_ID"].to_numpy() == df_test["row_ID"].to_numpy()).all(), (
        "sample_submission row_ID order differs from test.csv"
    )

# Create submission: copy the template so it stays untouched, then fill in the labels.
submission = sample_sub.copy()
submission["target"] = y_test_pred

# Save into the current working directory.
submission.to_csv("submission.csv", index=False)

# `os` is already imported in the notebook's first cell; no re-import needed here.
print(os.listdir())  # MUST show 'submission.csv'
assert "submission.csv" in os.listdir()

print("submission.csv correctly created in notebook root")