# In [None]:
# %% [code]
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
#import tensorflow

# Load data
train = pd.read_csv("/kaggle/input/playground-series-s5e3/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e3/test.csv")

# Model features: every training column except the row id and the target
features = train.drop(columns=['id', 'rainfall']).columns

# Report which test-set columns contain missing values (NaN). The columns
# themselves are present -- it is individual cells that are empty -- so the
# message states that explicitly instead of calling them "missing columns".
# len() is used on purpose: the truth value of a pandas Index is ambiguous.
missing_cols = test.columns[test.isnull().any()]
if len(missing_cols) > 0:
    print("Columns with missing values in test set:", list(missing_cols))
else:
    print("No missing values found in test set.")

# Work on copies so the raw frames stay untouched
train_imputed = train.copy()
test_imputed = test.copy()

# KNN imputation: the imputer is fitted on the training features only and the
# same fitted imputer is then applied to the test features.
# NOTE(review): KNNImputer measures neighbour distance on the raw feature
# values; if the features live on very different scales, consider scaling
# before imputing -- confirm against the data.
imputer = KNNImputer(n_neighbors=5)
train_imputed[features] = imputer.fit_transform(train[features])
test_imputed[features] = imputer.transform(test[features])

# %% [code]
# Sanity check: descriptive statistics of the imputed feature columns for
# both splits (text output only, no plots)
for heading, frame in (
    ("Train Data Summary:", train_imputed),
    ("\nTest Data Summary:", test_imputed),
):
    print(heading)
    print(frame[features].describe())

# %% [code]
# Model Training: Logistic Regression with 10-fold Stratified Cross-Validation

# Training matrix / target and the matrix to predict on
X, y = train_imputed[features], train_imputed["rainfall"]
X_test = test_imputed[features]

# Shuffled splitter with a fixed seed so the folds are reproducible
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof_preds  -> one out-of-fold probability per training row
# test_preds -> one column of test-set probabilities per fold
oof_preds = np.zeros(len(X))
test_preds = np.zeros((len(X_test), FOLDS))

# One model per fold: fit on the training part, score the held-out part,
# then predict the full test set
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"Training fold {fold} ...")

    model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
    model.fit(X.iloc[train_idx], y.iloc[train_idx])

    # Column 1 of predict_proba is the second entry of model.classes_
    # (presumably rainfall == 1 -- verify the target encoding)
    val_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
    oof_preds[val_idx] = val_proba
    fold_auc = roc_auc_score(y.iloc[val_idx], val_proba)
    print(f"Fold {fold} AUC: {fold_auc:.4f}")

    # Folds are numbered from 1, columns from 0
    test_preds[:, fold - 1] = model.predict_proba(X_test)[:, 1]

# AUC over all out-of-fold predictions taken together
overall_auc = roc_auc_score(y, oof_preds)
print(f"\nOverall OOF AUC: {overall_auc:.4f}")

# Final test prediction = row-wise mean of the per-fold predictions
final_test_preds = test_preds.mean(axis=1)
print("\nFinal test predictions (first 10 samples):")
print(final_test_preds[:10])

# %% [code]
# Build the submission from the competition's sample file: load it, replace
# its "rainfall" column with the averaged fold predictions, and write it out
sub = pd.read_csv(
    "/kaggle/input/playground-series-s5e3/sample_submission.csv"
).assign(rainfall=final_test_preds)
sub.to_csv("submission.csv", index=False)

# Confirmation plus a peek at the first rows
print("Submission file created!")
print(sub.head(3))


# %% [code]
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Load data
train = pd.read_csv("/kaggle/input/playground-series-s5e3/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e3/test.csv")

# Model features: every training column except the row id and the target
features = train.drop(columns=['id', 'rainfall']).columns

# Report which test-set columns contain missing values (example:
# 'winddirection'). The columns exist; individual cells are NaN, so the
# message states that explicitly instead of calling them "missing columns".
# len() is used on purpose: the truth value of a pandas Index is ambiguous.
missing_cols = test.columns[test.isnull().any()]
if len(missing_cols) > 0:
    print("Columns with missing values in test set:", list(missing_cols))
else:
    print("No missing values found in test set.")

# Work on copies so the raw frames stay untouched
train_imputed = train.copy()
test_imputed = test.copy()

# KNN imputation: the imputer is fitted on the training features only and the
# same fitted imputer is then applied to the test features.
# NOTE(review): KNNImputer measures neighbour distance on the raw feature
# values; if the features live on very different scales, consider scaling
# before imputing -- confirm against the data.
imputer = KNNImputer(n_neighbors=5)
train_imputed[features] = imputer.fit_transform(train[features])
test_imputed[features] = imputer.transform(test[features])

# %% [code]
# Exploratory Data Analysis (EDA)

# Class balance of the target
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="rainfall", data=train_imputed, ax=ax)
ax.set_title("Rainfall Distribution in Train Data")
plt.show()

# One boxplot per feature, split by the target value
for feature in features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x="rainfall", y=feature, data=train_imputed, ax=ax)
    ax.set_title(f"{feature} by Rainfall")
    fig.tight_layout()
    plt.show()

# Pairwise correlations between the (imputed) features
corr = train_imputed[features].corr()
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# %% [code]
# Model Training: Logistic Regression with 10-fold Stratified CV

# Inputs: imputed training features, target column, imputed test features
X = train_imputed[features]
y = train_imputed["rainfall"]
X_test = test_imputed[features]

# Seeded, shuffled stratified splitter
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold buffer (one value per training row) and per-fold test
# predictions (one column per fold)
oof_preds = np.zeros(len(X))
test_preds = np.zeros((len(X_test), FOLDS))

# Fit, validate and predict once per fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    print(f"Training fold {fold} ...")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # fit() returns the estimator itself, so construction and training chain
    model = LogisticRegression(
        solver='lbfgs', max_iter=1000, random_state=42
    ).fit(X_train, y_train)

    # Probability from column 1 of predict_proba for the held-out rows
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
    fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
    print(f"Fold {fold} AUC: {fold_auc:.4f}")

    # Fold numbers start at 1, column indices at 0
    test_preds[:, fold - 1] = model.predict_proba(X_test)[:, 1]

# AUC computed over every out-of-fold prediction at once
overall_auc = roc_auc_score(y, oof_preds)
print(f"\nOverall OOF AUC: {overall_auc:.4f}")

# Blend the folds: row-wise mean across the per-fold columns
final_test_preds = np.mean(test_preds, axis=1)
print("\nFinal test predictions (first 10 samples):")
print(final_test_preds[:10])

# %% [code]
# Prepare and save the submission file.
# The sample submission is used as the template and its "rainfall" column is
# overwritten positionally with the averaged fold predictions.
# NOTE(review): this assumes sample_submission.csv lists its rows in the same
# order as test.csv -- confirm.
sub = pd.read_csv("/kaggle/input/playground-series-s5e3/sample_submission.csv")
sub["rainfall"] = final_test_preds
sub.to_csv("submission.csv", index=False)
print("Submission file created!")
# Print explicitly: a bare `sub.head(3)` expression is only rendered when it
# is the last expression of a notebook cell and is silently discarded when
# the file runs as a script.
print(sub.head(3))
