In [None]:
import os

import numpy as np 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit 
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Kaggle input paths for the Tabular Playground Series (Nov 2021) CSV files.
file_sub = '/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv'
# NOTE(review): `fale_train` looks like a typo for `file_train`; the name is
# kept as-is because the data-loading cell reads it.
fale_train = '/kaggle/input/tabular-playground-series-nov-2021/train.csv'
file_test = '/kaggle/input/tabular-playground-series-nov-2021/test.csv'
# Column holding the label, and column holding the row identifier.
target = 'target'
column_id = 'id'

In [None]:
def valida_dados(data):
    """Print the total number of missing cells found in ``data``.

    Missing values are counted per column with ``isna`` and then added up;
    the function only prints and returns nothing.
    """
    # Original author's observation: there are no null values in the data.
    missing_per_column = data.isna().sum()
    print('missing data', missing_per_column.sum())

def base_line_logistic(X_data, y_data):
    """Cross-validate a scaled logistic-regression baseline.

    Args:
        X_data: Feature matrix passed to ``cross_validate``.
        y_data: Target vector passed to ``cross_validate``.

    Returns:
        The dictionary produced by ``cross_validate``, scored with ROC AUC
        and including the training scores.
    """
    # Features are standardised inside the pipeline, so the scaler is refit
    # on the training part of every split.
    baseline = Pipeline([
        ('standart', StandardScaler()),
        ('nlp', LogisticRegression()),
    ])
    # Five random shuffle splits with a fixed seed.
    splitter = ShuffleSplit(n_splits=5, random_state=1)
    return cross_validate(
        baseline,
        X_data,
        y_data,
        cv=splitter,
        scoring='roc_auc',
        verbose=True,
        return_train_score=True,
    )

def _is_integer_valued(values):
    """Return True when every entry of the numeric Series ``values`` is a whole number."""
    if values.dtype.kind in 'biu':
        return True
    # NaN / +-inf cannot be stored in an integer dtype.
    if not np.isfinite(values).all():
        return False
    # Compare element by element: summing the signed fractional parts (as the
    # previous implementation did) lets e.g. 0.5 and -0.5 cancel out.
    return bool((values == np.trunc(values)).all())


def _smallest_int_dtype(mn, mx):
    """Return the narrowest NumPy integer dtype containing ``[mn, mx]``, or None."""
    # Exact Python ints avoid float rounding when comparing against 64-bit limits.
    lo, hi = int(mn), int(mx)
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes.

    For every NumPy-backed bool/int/float column:

    * missing values are replaced by ``min - 1`` (a sentinel below every
      observed value);
    * if every value is a whole number, the column is cast to the narrowest
      signed or unsigned integer dtype that holds its range (including the
      sentinel);
    * otherwise the column is cast to ``float32``.

    Columns with any other dtype (object, datetime, categorical, pandas
    extension dtypes) and columns that are empty or entirely missing are
    left untouched.

    Args:
        props: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``props`` object, after conversion. Memory usage before and
        after is printed.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")

    for col in props.columns:
        values = props[col]
        dtype = values.dtype
        if not (isinstance(dtype, np.dtype) and dtype.kind in 'biuf'):
            continue
        if values.isna().all():
            # Empty or all-missing column: there is no min/max to work with.
            continue

        mn = values.min()
        mx = values.max()

        # Integer dtypes cannot hold NaN, so fill it with a sentinel. The
        # minimum is updated as well, otherwise a non-negative column would
        # pick an unsigned dtype and the sentinel would wrap around.
        if values.isna().any():
            mn = mn - 1
            values = values.fillna(mn)
            # Assign back explicitly; a chained ``props[col].fillna(inplace=True)``
            # does not reach the frame under pandas Copy-on-Write.
            props[col] = values

        if _is_integer_valued(values):
            new_dtype = _smallest_int_dtype(mn, mx)
        else:
            new_dtype = np.float32

        # ``None`` means no integer dtype can hold the range; keep the column as is.
        if new_dtype is not None:
            props[col] = values.astype(new_dtype)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props
def sample_submission(model, X_test):
    """Write ``submission.csv`` with the model's second-column probabilities.

    The sample-submission file (``file_sub``) is read as a template, its
    ``target`` column is overwritten with the second column returned by
    ``model.predict_proba(X_test)``, and the id/target pair is saved without
    the index.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Test features, in the same row order as the template file.

    Raises:
        ValueError: If ``predict_proba`` does not return exactly two columns.
    """
    resultado = np.asarray(model.predict_proba(X_test))
    if resultado.ndim != 2 or resultado.shape[1] != 2:
        raise ValueError(
            'predict_proba must return two columns, got shape %s' % (resultado.shape,)
        )
    solution = pd.read_csv(file_sub)
    # Keep only the second probability column; no throwaway column is needed.
    solution[target] = resultado[:, 1]
    # Use the shared id-column constant rather than a hard-coded 'id'.
    solution[[column_id, target]].to_csv('submission.csv', index=False)

In [None]:
# Load the training set; `fale_train` (sic) is the training CSV path defined
# with the other file paths.
data = pd.read_csv(fale_train)

In [None]:
# Show summary statistics of the training frame as the cell output.
data.describe()

In [None]:
# Print the total number of missing values in the training frame.
valida_dados(data)

In [None]:
# Relative frequency of each value in the target column.
data[target].value_counts(normalize=True)

In [None]:
# Downcast column dtypes to save memory; the helper modifies the frame it is
# given and returns that same frame.
data = reduce_mem_usage(data)

In [None]:
# Baseline: split the label from the features (the id column is not a
# feature) and print the mean cross-validated ROC AUC.
y_data = data[target]
X_data = data.drop(columns=[target, column_id])
cross = base_line_logistic(X_data, y_data)
print(cross['test_score'].mean())

In [None]:
process = False
drop_features = []
dict_multa_info = {}
if process:
    mutual_info = mutual_info_classif(X_data, y_data)
    drop_features = []
    dict_multa_info = {}
    for i in range(len(columns)):
        dict_multa_info[columns[i]] = mutual_info[i]
        if mutual_info[i] == 0:
            drop_features.append(columns[i])
    print(dict_multa_info)
else:
    dict_multa_info = {
     'f6': 0.0,
     'f14': 0.0,
     'f15': 0.0,
     'f28': 0.0,
     'f29': 0.0,
     'f35': 0.0,
     'f36': 0.0,
     'f38': 0.0,
     'f39': 0.0,
     'f45': 0.0,
     'f48': 0.0,
     'f59': 0.0,
     'f65': 0.0,
     'f68': 0.0,
     'f72': 0.0,
     'f73': 0.0,
     'f74': 0.0,
     'f75': 0.0,
     'f77': 0.0,
     'f78': 0.0,
     'f83': 0.0,
     'f85': 0.0,
     'f86': 0.0,
     'f87': 0.0,
     'f88': 0.0,
     'f93': 0.0,
     'f95': 0.0,
     'f98': 0.0}
    for key, value in dict_multa_info.items():
        if value ==0:
            drop_features.append(key)
if not drop_features:
    X_data.drop(columns=drop_features, inplace=True)

In [None]:
cross = base_line_logistic(X_data, y_data)
cross['test_score'].mean()

In [None]:
data_test = pd.read_csv(file_test).drop(columns= [column_id]) 

In [None]:
if not drop_features:
    data_test.drop(columns=drop_features, inplace=True)

In [None]:
# Fit the scaled logistic-regression pipeline on the full training data and
# write the submission file from its predictions on the test frame.
model = Pipeline([
    ('standart', StandardScaler()),
    ('lr', LogisticRegression()),
]).fit(X_data, y_data)
sample_submission(model, data_test)