# Introduction

## This competition is mainly a way to get used to working with tabular data. I used scikit-learn to solve this problem. Let's dive in :)

In [None]:
import numpy as np
import pandas as pd
import sklearn
import os
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression,SGDRegressor,Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Loading Data

In [None]:
# Read the competition's train and test tables, in that order, from the
# Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-mar-2021/train.csv",
        "../input/tabular-playground-series-mar-2021/test.csv",
    )
)

In [None]:
# Preview the first rows of the training table (includes the "id" and "target" columns).
train_df.head()

In [None]:
# Preview the first rows of the test table.
test_df.head()

# Details of Data

In [None]:
# Separate the label from the features; "id" is dropped from both tables
# so that only feature columns remain.
labels = train_df["target"]
train_ds = train_df.drop(columns=["id", "target"])
test_ds = test_df.drop(columns=["id"])

print("--------DETAILS OF TRAINING AND TESTING DATA SIZE---------")
print("Number of instances in train data: ", len(train_ds))
print("Number of instances in test data: ", len(test_ds))
print("There are {} feature columns".format(test_ds.shape[1]))

# Columns stored as float64 are treated as numerical; every other dtype is
# treated as categorical. Column order follows test_ds.columns.
num_cols = [name for name in test_ds.columns if test_ds[name].dtype == "float64"]
cat_cols = [name for name in test_ds.columns if not (test_ds[name].dtype == "float64")]

print("\n--------DETAILS OF COLUMNS-------")
print("There are {} categorical and {} numerical columns.".format(len(cat_cols), len(num_cols)))
print("Categorical columns are: ", cat_cols)
print("Numerical columns are: ", num_cols)

# Visualization

## COMING SOON :)

# Preprocessing Data

In [None]:
# Encoder used for the categorical columns. The scaler is instantiated but
# scaling is currently disabled (no fit/transform call is made with it).
cat_encoder = LabelEncoder()
scaler = StandardScaler()

# Stack train and test so each categorical column is encoded with one
# mapping shared by both tables.
ds = pd.concat([train_ds, test_ds], axis=0)
for col in cat_cols:
    # The encoder is re-fitted per column; after the loop it holds the
    # mapping of the last categorical column only.
    encoded = cat_encoder.fit_transform(ds[col])
    ds[col] = encoded

# Split the stacked frame back by position: the first len(train_ds) rows are
# the training rows, the remainder are the test rows.
train_ds = ds.iloc[:len(train_ds), :]
test_ds = ds.iloc[len(train_ds):, :]

train_ds.head()

# Hyperparameter Tuning

### **I use StratifiedKFold to split the data into training and validation folds, search for the best hyperparameter (`n_estimators` of the random forest) with GridSearchCV, and then train with that value.**

In [None]:
SEED = 143
N_SPLITS = 10

# One seeded, shuffled stratified splitter, passed to the grid search below
# so that tuning runs on reproducible folds.
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# NOTE: an earlier experiment tuned LogisticRegression's C with the same
# GridSearchCV approach; it was replaced by the random forest below.

# Fix the forest's random_state so repeated runs give the same scores.
rf_clf = RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=SEED)
param = {'n_estimators': [150, 200, 225, 250, 275, 300]}

# Pass the splitter itself instead of cv=10: an integer cv would build a
# separate, unshuffled splitter and ignore SEED.
grid_search = GridSearchCV(rf_clf, param, scoring='roc_auc', refit=True, cv=kfold)
grid_search.fit(train_ds, labels)
print('Best roc_auc: {:.4}, with best params: {}'.format(grid_search.best_score_, grid_search.best_params_))

# Fresh, unfitted classifier configured with the tuned n_estimators; it is
# fitted fold by fold in the training section.
clf = RandomForestClassifier(
    n_estimators=grid_search.best_params_["n_estimators"],
    max_depth=7,
    n_jobs=-1,
    random_state=SEED,
)

# Training and prediction

In [None]:
# Running sum of the per-fold test probabilities; it is divided by N_SPLITS
# in the submission step to obtain the averaged prediction.
pred_test_full = 0
# Best validation ROC AUC seen across folds.
max_auc = 0

for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_ds, labels), start=1):
    print(' Running {} of KFold {}'.format(fold, kfold.n_splits))

    # split() yields positional indices, so select rows with .iloc; .loc
    # would only be correct when the index happens to be 0..n-1.
    xtr, xvl = train_ds.iloc[train_idx], train_ds.iloc[valid_idx]
    ytr, yvl = labels.iloc[train_idx], labels.iloc[valid_idx]

    clf.fit(xtr, ytr)

    # ROC AUC is a ranking metric: score it on the positive-class
    # probability, not on the hard 0/1 labels returned by predict().
    score = roc_auc_score(yvl, clf.predict_proba(xvl)[:, 1])
    print('ROC AUC score:', score)
    max_auc = max(max_auc, score)

    # Accumulate this fold's positive-class probabilities for the test set.
    pred_test = clf.predict_proba(test_ds)[:, 1]
    pred_test_full += pred_test

print('Best fold ROC AUC:', max_auc)

# Submission

In [None]:
# Average the summed fold predictions over the number of folds.
y_pred = pred_test_full / N_SPLITS

pred_csv = pd.read_csv("../input/tabular-playground-series-mar-2021/sample_submission.csv")
# Use bracket assignment: it always writes a DataFrame column, whereas
# attribute assignment (pred_csv.target = ...) sets a plain Python attribute
# if the column does not already exist.
pred_csv["target"] = y_pred

pred_csv.to_csv("submission.csv", index=False)

## That's it, you have done it✌👏.

**Kindly share any suggestions regarding training or preprocessing the data. I am always open to learning😊.**
###  Happy coding❤