reference LR model : https://www.kaggle.com/mohammadkashifunique/tps-nov-logistic-regression-using-pytorch

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
import statsmodels.api as sm
import torch
import gc
import torch.nn as nn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# Load Data

In [None]:
# Read the competition files: labelled training rows, unlabelled test rows,
# and the sample submission that serves as the output template.
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
ss = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv')

# EDA

In [None]:
# Work on copies without the 'id' column; the original frames stay untouched.
train_df = train.drop(columns=['id'])
test_df = test.drop(columns=['id'])

Check for NULL values

In [None]:
# Count missing values per column and show only the columns that have any.
null_counts = train.isnull().sum()
null_counts[null_counts != 0]

There are no null values in the training data.

**HEATMAP**

Visualize and confirm the correlation between features. Although seemingly trivial, statistical analysis is a very important task.

If you have time, draw a scatter plot between features as well.

You can get something out of a visualized graph.

The features are split into groups of 10 columns so that each correlation heatmap stays readable.

Columns 1 to 10

In [None]:
# Correlation heatmap for the first ten columns of train_df (positions 0-9).
# `colormap` is defined here and reused by the following heatmap cells.
# NOTE(review): the title mentions "Diabetes", which looks carried over from
# another notebook - confirm before changing it.
colormap = plt.cm.PuBu
plt.figure(figsize=(15, 8))
plt.title("Diabetes Correlation of Features", y=1.05, size=15)
first_block_corr = train_df.iloc[:, 0:10].astype(float).corr()
sns.heatmap(
    first_block_corr,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor="white",
    annot=True,
    annot_kws={"size": 10},
)

Columns 11 to 20

In [None]:
# Correlation heatmap for the second group of ten columns of train_df.
# The slice starts at position 10: the previous heatmap covers 0:10, and the
# original 11:20 slice silently skipped the column at position 10.
plt.figure(figsize=(15, 8))
plt.title("Diabetes Correlation of Features", y=1.05, size=15)
sns.heatmap(
    train_df.iloc[:, 10:20].astype(float).corr(),
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor="white",
    annot=True,
    annot_kws={"size": 10},
)

Columns 21 to 30

In [None]:
# Correlation heatmap for the third group of ten columns of train_df.
# The slice starts at position 20 so that, together with 0:10 and 10:20, no
# column is left out; the original 21:30 slice skipped position 20.
plt.figure(figsize=(15, 8))
plt.title("Diabetes Correlation of Features", y=1.05, size=15)
sns.heatmap(
    train_df.iloc[:, 20:30].astype(float).corr(),
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor="white",
    annot=True,
    annot_kws={"size": 10},
)

**The correlation between the explanatory variables is very low. I think it would be suitable for use in the model.**👍

# EDA -> Features Engineering

**Multicollinearity**

First we check for multicollinearity, which is one of the most common problems when fitting a logistic regression (LR).

When multicollinearity exists, the explanatory power of the model decreases, and the model breaks when other variables are added.

After checking multicollinearity, features with VIF value of 10 or higher are removed.


There are several methods to remove multicollinearity. The main methods are PCA and VIF. I will use VIF

Note that you have to remove them one by one using 'loop'. Remove one and check the VIF value again.

P.S. This check is also useful for other linear models such as a linear SVM, but it does not need to be applied to tree-based models such as Random Forest or Decision Tree.

In [None]:
# Separate the label column from the explanatory variables.
train_df_y = train_df['target']
train_df_X = train_df.drop(columns='target')

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def show_vif(df):
    """Return the variance inflation factor (VIF) of every column of ``df``.

    Args:
        df: DataFrame whose columns are the explanatory variables.

    Returns:
        A DataFrame with one row per input column: ``'columns'`` holds the
        column name and ``'VIF'`` the value returned by
        ``variance_inflation_factor`` for that column.
    """
    design = df.values
    vif_values = [
        variance_inflation_factor(design, position)
        for position in range(design.shape[1])
    ]
    return pd.DataFrame({'columns': df.columns, 'VIF': vif_values})

In [None]:
# Display the VIF table for all explanatory variables of the training set.
show_vif(train_df_X)

In [None]:
def remove_multicollinearity(df):
    """Drop columns one at a time until no column has a VIF of 10 or more.

    On every pass the VIF table is recomputed with ``show_vif``, the number
    of columns with VIF >= 10 is printed, and the column with the largest
    VIF among them is printed and dropped.

    Args:
        df: DataFrame of explanatory variables. It is not modified; a reduced
            frame is returned instead.

    Returns:
        The DataFrame with the offending columns removed (``df`` itself when
        nothing had to be dropped).
    """
    while True:
        vif_table = show_vif(df)
        offenders = vif_table[vif_table['VIF'] >= 10]

        print(len(offenders))
        if offenders.empty:
            return df

        # VIFs change after each removal, so only the worst column is dropped
        # before the table is recomputed.
        ranked = offenders.sort_values(by='VIF', ascending=False)
        remove_column = ranked['columns'].iloc[0]
        print(f"remove_column: {remove_column}")
        df = df.drop(remove_column, axis=1)

In [None]:
# Run the iterative VIF filter on the training features; the result is the
# feature frame without any column whose VIF is 10 or higher.
train_removeVIF_df = remove_multicollinearity(train_df_X)

**There were no features with a VIF of 10 or higher.**

# EDA -> Select feature

**backward elimination**

* One of the stepwise regression analysis methods. 

* A method of simplifying the model by repeating the process of removing unnecessary independent variables one by one from the model including all variables

* Starting from the full model that includes all explanatory variables, the variable with the least explanatory power (here, the one with the largest p-value) is removed one at a time

In [None]:
# Backward elimination: start from all features and repeatedly drop the one
# with the largest OLS p-value until every remaining p-value is below
# `sl_remove`. After each removal the adjusted R-squared of the reduced model
# and the surviving feature list are recorded for plotting.
variables = list(train_df_X)  ## feature list

y = train['target']  ## label
# Copy the list: removing from an alias would also shrink `variables`, which
# is meant to keep the complete feature list.
selected_variables = variables.copy()  ## Initially, all variables are selected.
sl_remove = 0.05

sv_per_step = []  ## Variables selected for each step
adjusted_r_squared = []  ## Adjusted r_squared for each step
steps = []
step = 0

# Fit the full model once. Inside the loop the model is refitted only after a
# removal, and that fit supplies both the adjusted R-squared of the step and
# the p-values of the next iteration.
X = sm.add_constant(train[selected_variables])
ols_fit = sm.OLS(y, X).fit()
while len(selected_variables) > 0:
    p_vals = ols_fit.pvalues[1:]  # skip the first entry (the added constant)
    max_pval = p_vals.max()  ## Max p-value
    if not max_pval >= sl_remove:
        # No remaining p-value reaches the threshold: stop eliminating.
        break

    ## Exclude the variable whose p-value is largest
    remove_variable = p_vals.idxmax()
    selected_variables.remove(remove_variable)

    X = sm.add_constant(train[selected_variables])
    ols_fit = sm.OLS(y, X).fit()

    step += 1
    steps.append(step)
    adj_r_squared = ols_fit.rsquared_adj
    adjusted_r_squared.append(adj_r_squared)
    sv_per_step.append(selected_variables.copy())

In [None]:
# Plot the adjusted R-squared recorded after each elimination step. Every
# x tick is labelled with the step number followed by the features still
# selected at that step, one per line.
fig = plt.figure(figsize=(10, 10))
fig.set_facecolor('white')

font_size = 15
tick_labels = [
    f'step {s}\n' + '\n'.join(kept)
    for s, kept in zip(steps, sv_per_step)
]
plt.xticks(steps, tick_labels, fontsize=12)
plt.plot(steps, adjusted_r_squared, marker='o')

plt.ylabel('Adjusted R Squared', fontsize=font_size)
plt.grid(True)
plt.show()

Features that do not significantly affect the target and are therefore removed: `f0`, `f52`, `f72`, `f38`

In [None]:
# Remove the same four feature columns from both the training and test frames.
dropped_features = ['f0', 'f52', 'f72', 'f38']
train_refinded_data = train_df.drop(columns=dropped_features)
test_refinded_data = test_df.drop(columns=dropped_features)

In [None]:
# Preview the first rows of the reduced training frame.
train_refinded_data.head()

In [None]:
# Build the modelling inputs: X (features), y (label) and X_test (test features).
y = train_refinded_data['target']
X = train_refinded_data.drop(columns=['target'])
X_test = test_refinded_data.copy()

# The intermediate frames are no longer needed; drop the references and run
# the garbage collector after each deletion.
del train_refinded_data
gc.collect()
del test_refinded_data
gc.collect()

**Data Scaling**

In [None]:
# Standardise the features. The scaler is fitted on the training features
# only and the same transformation is then applied to the test features.
scaler = StandardScaler()

scaled_train = scaler.fit_transform(X)
scaled_test = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames with the original column names.
X = pd.DataFrame(scaled_train, columns=X.columns)
X_test = pd.DataFrame(scaled_test, columns=X_test.columns)

# Logistic Regression Modeling

In [None]:
%%time
# Train a logistic regression (one linear layer followed by a sigmoid) with
# 5-fold stratified cross-validation. Each fold trains a fresh model for
# EPOCHS full-batch Adam steps and records the training loss, validation loss
# and validation ROC AUC after every step.
EPOCHS = 100
KFold = StratifiedKFold(n_splits=5, random_state=786, shuffle=True)

# Take the input width from the data instead of hard-coding it, so the model
# keeps working if the set of dropped features changes.
n_features = X.shape[1]

for fold, (train_idx, valid_idx) in enumerate(KFold.split(X, y)):
    X_train, X_valid = X.iloc[train_idx].values, X.iloc[valid_idx].values
    y_train, y_valid = y.iloc[train_idx].values, y.iloc[valid_idx].values

    # Convert to float32 tensors; targets become column vectors so that they
    # match the (n_samples, 1) model output expected by BCELoss.
    X_train = torch.from_numpy(X_train.astype(np.float32))
    X_valid = torch.from_numpy(X_valid.astype(np.float32))
    y_train = torch.from_numpy(y_train.astype(np.float32).reshape(-1, 1))
    y_valid = torch.from_numpy(y_valid.astype(np.float32).reshape(-1, 1))

    model = nn.Sequential(
        nn.Linear(n_features, 1),
        nn.Sigmoid()
    )

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

    train_losses = np.zeros(EPOCHS)
    valid_losses = np.zeros(EPOCHS)
    scores = np.zeros(EPOCHS)

    # The validation labels do not change between epochs; convert them once.
    y_valid_np = y_valid.numpy()

    for ep in range(EPOCHS):
        # One full-batch optimisation step on the training fold.
        optimizer.zero_grad()

        outputs = model(X_train)
        loss = criterion(outputs, y_train)

        loss.backward()
        optimizer.step()

        # Evaluation needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            outputs_valid = model(X_valid)
            loss_valid = criterion(outputs_valid, y_valid)

        # Store this epoch's AUC in its own slot. The previous `scores += ...`
        # added the value to every element, which made the reported mean the
        # sum of the AUCs over all epochs.
        scores[ep] = roc_auc_score(y_valid_np, outputs_valid.numpy())

        train_losses[ep] = loss.item()
        valid_losses[ep] = loss_valid.item()

    # Both reported numbers are averages over all epochs of this fold.
    print(f"Fold: {fold + 1} Loss: {np.mean(valid_losses)} AUC: {np.mean(scores)}")
    # The curve labelled 'test loss' is the loss on the validation fold.
    plt.plot(train_losses, label='train loss')
    plt.plot(valid_losses, label='test loss')
    plt.legend()
    plt.show()

In [None]:
# Score the test set with `model` as left by the training cell, i.e. the
# model trained on the last cross-validation fold.
# X_test is rebound from a DataFrame to a float32 tensor here, so this cell
# cannot be re-run without recreating X_test first.
X_test = torch.from_numpy(X_test.values.astype(np.float32))
predictions = model(X_test).detach().numpy()

In [None]:
# Put the predictions into the sample-submission frame, write it to
# ./submission.csv without the index column, and preview the first rows.
ss['target'] = predictions
ss.to_csv('./submission.csv', index=False)
ss.head()