In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List every file that Kaggle mounted under the input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

### Reading the Data

In [None]:
# Load the training set, the competition test set and the sample submission.
train_df, test_df, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-sep-2021/train.csv",
        "/kaggle/input/tabular-playground-series-sep-2021/test.csv",
        "/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv",
    )
)

In [None]:
# Preview the first ten rows of the raw training frame.
train_df.head(10)

### Summary of the data

In [None]:
# One row per training column: dtype, value range, null count and two sample values.
Summary = pd.DataFrame(train_df.dtypes, columns=['Dtype']).assign(
    **{
        "max": train_df.max(),
        "min": train_df.min(),
        "Null": train_df.isnull().sum(),  # number of missing values per column
        "First": train_df.iloc[0],        # value in the first row
        "Second": train_df.iloc[1],       # value in the second row
    }
)
Summary

Nothing fancy here: just an overview of the training data (dtypes, value ranges, null counts and the first two rows) 😎

### Getting rid of the target variable and ID

In [None]:
# Keep the target aside, then remove it and the row identifier from the features.
y = train_df["claim"]
train_df = train_df.drop(columns=["id", "claim"])

In [None]:
# The row identifier is not a feature; remove it from the test set as well.
test_df = test_df.drop(columns=["id"])

### Feature Engineering stuff

In [None]:
# Row-wise summary statistics appended to the training frame as extra features.
# NOTE(review): each statistic is computed over the frame as it exists on that
# line, so later statistics also cover the engineered columns added above them,
# and re-running this cell changes the values. Confirm that this is intended.
train_df["max_value"] = train_df.max(axis=1)
train_df["min_value"] = train_df.min(axis=1)
train_df["num_missing_std"] = train_df.isna().std(axis=1).astype("float")
train_df["mean"] = train_df.mean(axis=1)
train_df["median"] = train_df.median(axis=1)
train_df["std"] = train_df.std(axis=1)
# DataFrame.mad was deprecated in pandas 1.5 and removed in 2.0. This is the
# same mean absolute deviation (NaNs skipped), written out explicitly.
train_df["mad"] = train_df.sub(train_df.mean(axis=1), axis=0).abs().mean(axis=1)
train_df["skew"] = train_df.skew(axis=1)
train_df["null_value"] = train_df.isnull().sum(axis=1)

In [None]:

# Row-wise summary statistics appended to the test frame, in the same order as
# for the training frame so both end up with identical columns.
# NOTE(review): each statistic is computed over the frame as it exists on that
# line, so later statistics also cover the engineered columns added above them,
# and re-running this cell changes the values. Confirm that this is intended.
test_df["max_value"] = test_df.max(axis=1)
test_df["min_value"] = test_df.min(axis=1)
test_df["num_missing_std"] = test_df.isna().std(axis=1).astype("float")
test_df["mean"] = test_df.mean(axis=1)
test_df["median"] = test_df.median(axis=1)
test_df["std"] = test_df.std(axis=1)
# DataFrame.mad was deprecated in pandas 1.5 and removed in 2.0. This is the
# same mean absolute deviation (NaNs skipped), written out explicitly.
test_df["mad"] = test_df.sub(test_df.mean(axis=1), axis=0).abs().mean(axis=1)
test_df["skew"] = test_df.skew(axis=1)
test_df["null_value"] = test_df.isnull().sum(axis=1)

In [None]:
# test_df.isnull().sum()
y.value_counts() # class counts of the target, to check how balanced it is

Okay, target variable is pretty balanced!

### Splitting the Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

# Hold out 10% of the training rows for validation, keeping the class ratio of
# `y` the same in both parts (stratified) and fixing the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    y,
    test_size=0.1,
    stratify=y,
    random_state=123,
)

### Data transformation

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values with the constant 0.
# (SimpleImputer(strategy='mean') was also tried; constant 0 worked better.)
numerical_transformer = SimpleImputer(strategy='constant', fill_value=0)

# The imputer returns plain arrays, so the column names are put back when the
# frames are rebuilt. It is fitted on the training split only.
imputed_train = pd.DataFrame(
    numerical_transformer.fit_transform(X_train),
    columns=train_df.columns,
)
imputed_test = pd.DataFrame(
    numerical_transformer.transform(X_test),
    columns=train_df.columns,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling parameters from the imputed training split, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(imputed_train)
X_train = scaler.transform(imputed_train)
X_test = scaler.transform(imputed_test)

### Model

In [None]:
# L2-regularised logistic regression (C = 0.01) fitted on the scaled training split.
lr = LogisticRegression(penalty='l2', C=0.01, random_state=123).fit(X_train, y_train)
lr

In [None]:
# Predicted class probabilities for the hold-out rows (2-D; column 1 is selected in the next cell).
y_pred = lr.predict_proba(X_test)

In [None]:
# Keep only column 1 of the probability matrix (the score used for the ROC curve).
# Recomputed from the model rather than slicing `y_pred` in place, so this cell
# can be re-run: slicing an already 1-D `y_pred` with [:, 1] raises IndexError.
y_pred = lr.predict_proba(X_test)[:, 1]

### Performance

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve on the hold-out split, then the area under it.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
holdout_auc = auc(fpr, tpr)
print(holdout_auc)

# Never mind the notes below; they are just a log of my progress!
# this is the score: 0.5254161343242256
# this is the score with logisticR: 0.530162419244593
# score with logistic regression: 0.5410502320900562
#  score with logistic regression with mean: Nevermind
# score with zero imputation: 0.5818869874137387
# score improved without removing extra features that I added 0.5895987572971035
# score improved without removing extra features and adding extra features 0.7981633906567501
# score with unique value feature added did not help 0.7907597973759513
# score now is 0.798165516631935
# score now is 0.7982741998661137
# score now is 0.803559088213053 with constant imputation and with extra features

In [None]:

# Apply the imputer (already fitted on the training split) to the competition test set.
imputed_test_df = pd.DataFrame(numerical_transformer.transform(test_df))
# Columns are not preserved when imputing, so restore the feature names. The
# original line assigned the frame's own columns back to itself, which did
# nothing and left integer labels that do not match the names the scaler was
# fitted with. `train_df` carries the same feature columns in the same order.
imputed_test_df.columns = train_df.columns

In [None]:
# Scale the imputed test set with the scaler fitted on the training split.
# NOTE(review): this rebinds `test_df` to the value returned by the scaler, so
# earlier cells that use `test_df` as a DataFrame cannot be re-run after this one.
test_df = scaler.transform(imputed_test_df)

In [None]:
# Column 1 of the predicted probabilities for the competition test set.
y_val = lr.predict_proba(test_df)[:, 1]

### Submission

In [None]:
# Start from the sample submission, replace its `claim` column with the model's
# predicted probabilities, write the file and show the first rows.
sub = pd.read_csv(
    "/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv"
).assign(claim=y_val)
sub.to_csv("lr.csv", index=False)
sub.head(10)

### If you found this notebook easy to follow, please give it an upvote, and leave a comment with suggestions for what I could try next.
### Happy Kaggling!