# Imports

In [None]:
# Some Kaggle data sources are private, so log in through kagglehub first.
# Run this cell before any cell that downloads Kaggle data.
import kagglehub
kagglehub.login()


In [None]:
# Download the 'playground-series-s5e8' competition data through kagglehub and
# keep the returned value; later cells use it as the directory prefix for
# train.csv and test.csv.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing here.

playground_series_s5e8_path = kagglehub.competition_download('playground-series-s5e8')

# Confirmation message printed once the download call has returned.
print('Data source import complete.')


In [None]:
import os

# Print the current working directory; the CSV files written by later cells
# use relative paths and therefore land here.
cwd = os.getcwd()
print(cwd)

In [None]:
# Display the value returned by the kagglehub download call.
playground_series_s5e8_path

In [None]:
# Core analytics libraries used throughout the notebook.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# List every file found under the directory that kagglehub reported for the
# competition download. Walking `playground_series_s5e8_path` (rather than a
# hard-coded cache location) keeps this listing correct wherever the notebook
# runs; os.walk yields nothing, without raising, for a path that does not exist.

import os
for dirname, _, filenames in os.walk(playground_series_s5e8_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# On Kaggle, up to 20GB can be written to the current directory (/kaggle/working/) and is preserved as output when a version is created with "Save & Run All".
# On Kaggle, temporary files can also be written to /kaggle/temp/, but they are not saved outside of the current session.

In [None]:
# Load the competition's training and test tables from the download directory.
train_data = pd.read_csv(f'{playground_series_s5e8_path}/train.csv')
test_data = pd.read_csv(f'{playground_series_s5e8_path}/test.csv')

# Preview the first rows of the training table.
train_data.head()

In [None]:
# Preview the first rows of the test table.
test_data.head()

In [None]:
# Write unmodified copies of both tables to the working directory, without
# the pandas row index.
train_data.to_csv('binary_train.csv', index=False)
test_data.to_csv('binary_test.csv', index=False)

In [None]:
# Summarise the training table: column names, non-null counts and dtypes.
train_data.info()

In [None]:
# Summarise the test table: column names, non-null counts and dtypes.
test_data.info()

In [None]:
# Collect the categorical columns of the training table (dtype 'object' or
# 'category') and add up their distinct-value counts, which is the number of
# indicator columns one-hot encoding will create for them.
categorical_columns = []
explosion_tracker = 0
for column in train_data.columns:
    column_dtype = train_data[column].dtype
    if not (column_dtype == 'object' or column_dtype.name == 'category'):
        # Numerical column: nothing to report.
        continue

    print(f'{column} is a categorical column')
    distinct_values = train_data[column].nunique()
    categorical_columns.append(column)
    explosion_tracker += distinct_values
    print(f'Distinct column values are: {distinct_values}')
    print('********************************************************************************')

print(f'Total columns after one hot encoding will be : {explosion_tracker} + numerical columns')

## Performing One-Hot Encoding

In [None]:
# Every column except the first one (presumably the 'id' row identifier --
# confirm against the data). NOTE: this slice still contains the target 'y';
# later cells rely on that (they read train_full['y'] for the correlation
# analysis and for resampling), so it is kept on purpose.
input_columns = train_data.columns[1:]

# Refer to the target by name, as the rest of the notebook does, instead of by
# a positional index that silently picks the wrong column if the layout changes.
output_columns = 'y'

input_columns, output_columns

In [None]:
# Working frame with the columns named in input_columns, plus the target
# column as a separate object.
train_full = train_data[input_columns]
trainY = train_data[output_columns]

# Show both shapes as a quick sanity check.
train_full.shape, trainY.shape

In [None]:
# Replace each categorical column with integer 0/1 indicator columns.
train_full = pd.get_dummies(train_full, columns=categorical_columns, dtype=int)
# Shape after encoding.
train_full.shape

In [None]:
# Preview the one-hot encoded training frame.
train_full.head()

## Visualizing Correlation Matrix

In [None]:
# Count the rows for each value of the target column 'y'.
train_full['y'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all numeric columns of the encoded training
# frame (the frame still contains the target 'y').
corr_matrix = train_full.corr(numeric_only=True)

# Optional: uncomment to draw the full correlation matrix as an annotated
# heatmap.
# plt.figure(figsize=(8,6))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# plt.show()

In [None]:
# Correlation of every column with the target, largest value first. The
# target's own entry is part of this Series.
target_corr = corr_matrix['y'].sort_values(ascending=False)
target_corr, type(target_corr)

In [None]:
# Bar chart of each feature's correlation with the target; the target's own
# entry is dropped before plotting.
feature_correlations = target_corr.drop('y')
ax = feature_correlations.plot(kind='bar', color='skyblue')
ax.set_title('Correlation of Features with Target')
ax.set_ylabel('Correlation Coefficient')
# Fix the y-axis to the full correlation range.
ax.set_ylim(-1, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Keep the features whose correlation with the target is stronger than the
# threshold in either direction. Comparing the absolute value matters: a
# strongly negative correlation is as informative for a linear model as a
# positive one, and a plain `> threshold` test would discard it.
threshold = 0.05
# Drop the target's own entry first so it can never be selected as a feature.
feature_corr = target_corr.drop('y')
selected_features = feature_corr[feature_corr.abs() > threshold].index.tolist()
selected_features

# Logistic Regression

## Oversampling y=1 class

In [None]:
# Balance the two classes: rows with y == 1 are sampled with replacement until
# there are as many of them as rows with y == 0, then both groups are stacked.
from sklearn.utils import resample

target_values = train_full['y']
majority = train_full.loc[target_values == 0]
minority = train_full.loc[target_values == 1]
minority_oversampled = resample(
    minority,
    replace=True,
    n_samples=majority.shape[0],
    random_state=42,
)
df_balanced = pd.concat([majority, minority_oversampled])


In [None]:
# Class counts in the resampled frame.
df_balanced.y.value_counts()

In [None]:
## Logistic Regression
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler

# Hold out the evaluation rows BEFORE any resampling or scaling. Splitting an
# already oversampled frame puts copies of the same minority rows on both
# sides of the split, and fitting the scaler on all rows lets evaluation data
# shape the preprocessing; both make the reported scores optimistic.
train_part, holdout_part = train_test_split(
    train_full[selected_features + ['y']],
    test_size=0.3,
    random_state=42,
    stratify=train_full['y'],  # keep the class ratio identical in both parts
)

# Oversample the y == 1 rows of the training partition only, until they match
# the number of y == 0 rows. The held-out rows keep their original class mix.
train_majority = train_part[train_part['y'] == 0]
train_minority = train_part[train_part['y'] == 1]
train_minority_oversampled = resample(
    train_minority,
    replace=True,
    n_samples=len(train_majority),
    random_state=42,
)
train_balanced = pd.concat([train_majority, train_minority_oversampled])

# Logistic Regression on selected features
X = train_balanced[selected_features]
y = train_balanced['y']

# Standard scaling: statistics are learned from the training rows only and
# then re-applied unchanged to the held-out rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, y_train = X_scaled, y
X_test = scaler.transform(holdout_part[selected_features])
y_test = holdout_part['y']

# Model training
model = LogisticRegression(class_weight='balanced', max_iter=1000, solver='lbfgs')
model.fit(X_train, y_train)

# Predictions on the untouched hold-out rows
y_pred = model.predict(X_test)

# Evaluation
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


# Predicting Output

In [None]:
# One-hot encode the test table with the same categorical columns as training.
test_data_full = pd.get_dummies(test_data, columns=categorical_columns, dtype=int)

# Temporarily track 'id' alongside the model features. The membership check
# keeps the list free of duplicate 'id' entries when this cell is re-run.
if 'id' not in selected_features:
    selected_features.append('id')

# A category that never occurs in the test table produces no indicator column;
# reindex adds any such missing feature as an all-zero column instead of
# failing with a KeyError. 'id' is copied separately so that a missing 'id'
# column still raises rather than being silently filled with zeros.
feature_columns = [name for name in selected_features if name != 'id']
test_full = test_data_full.reindex(columns=feature_columns, fill_value=0)
test_full['id'] = test_data_full['id']
test_full.shape

In [None]:
# Take 'id' back out of the feature list so it holds model inputs only.
# Looping on membership makes the cell safe to re-run (no ValueError once 'id'
# is gone) and also clears duplicates left by repeated appends.
while 'id' in selected_features:
    selected_features.remove('id')
test_full.shape

In [None]:
# Scale the test features with the scaler that was fitted for the model, then
# predict a class label for every test row.
test_scaled = scaler.transform(test_full[selected_features])

# assign() returns a new frame, so the prediction column is not written into
# an object that pandas may treat as a slice of another frame (which triggers
# SettingWithCopyWarning).
# NOTE(review): model.predict yields hard 0/1 labels. If the competition is
# scored on a ranking metric such as ROC AUC, submitting
# model.predict_proba(test_scaled)[:, 1] would be more appropriate -- confirm.
test_full = test_full.assign(y=model.predict(test_scaled))

test_full.head()

In [None]:
# Write the submission file with the 'id' and predicted 'y' columns, without
# the pandas row index.
test_full[['id','y']].to_csv('submission.csv', index=False)