### **Setting the view of the notebook**

In [None]:
from IPython.core.display import display, HTML, Javascript

# ----- Notebook Theme -----
# Palette for the notebook theme; only the first and last entries are read below.
color_map = ['#6166B3', '#e8eff6', '#0b2553']

# Last palette entry; substituted into the CSS template as the output-prompt colour.
prompt = color_map[-1]
# First palette entry; substituted into the CSS template for borders, headings,
# the input prompt and the selected-cell highlight.
main_color = color_map[0]

css_file = '''

    div #notebook {
    background-color: white;
    line-height: 20px;
    }

    #notebook-container {
    %s
    margin-top: 2em;
    padding-top: 2em;
    border-top: 4px solid %s; /* light orange */
    -webkit-box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
    box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
    }

    div .input {
    margin-bottom: 1em;
    }

    .rendered_html h1, .rendered_html h2, .rendered_html h3, .rendered_html h4, .rendered_html h5, .rendered_html h6 {
    color: %s; /* light orange */
    font-weight: 600;
    }

    div.input_area {
    border: none;
        background-color: %s; /* rgba(229, 143, 101, 0.1); light orange [exactly #E58F65] */
        border-top: 2px solid %s; /* light orange */
    }

    div.input_prompt {
    color: %s; /* light blue */
    }

    div.output_prompt {
    color: %s; /* strong orange */
    }

    div.cell.selected:before, div.cell.selected.jupyter-soft-selected:before {
    background: %s; /* light orange */
    }

    div.cell.selected, div.cell.selected.jupyter-soft-selected {
        border-color: %s; /* light orange */
    }

    .edit_mode div.cell.selected:before {
    background: %s; /* light orange */
    }

    .edit_mode div.cell.selected {
    border-color: %s; /* light orange */

    }
    '''
def to_rgb(h):
    """Convert a hexadecimal colour string into an ``(r, g, b)`` tuple.

    Args:
        h: Colour as six hex digits, e.g. ``'6166B3'``. A single leading
            ``'#'`` is accepted and ignored, so ``'#6166B3'`` works as well.

    Returns:
        Tuple of three integers, one per colour channel.

    Raises:
        ValueError: If any of the three two-character groups is not valid hex.
    """
    # Tolerate the CSS-style '#' prefix so callers do not have to slice it off.
    digits = h[1:] if h.startswith('#') else h
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))

# Translucent (alpha 0.1) variant of the main colour; used as the input-cell background.
main_color_rgba = 'rgba(%s, %s, %s, 0.1)' % (to_rgb(main_color[1:]))

# Fill the eleven placeholders of the CSS template and write the result to disk.
# The context manager closes the handle, which guarantees the content is flushed
# before nb() reads the file back.
with open('notebook.css', 'w') as css_out:
    css_out.write(css_file % (
        'width: 95%;',
        main_color,
        main_color,
        main_color_rgba,
        main_color,
        main_color,
        prompt,
        main_color,
        main_color,
        main_color,
        main_color,
    ))

def nb():
    """Return an ``HTML`` display object that injects ``notebook.css`` into the page.

    The stylesheet is read from ``notebook.css`` in the working directory and
    wrapped in a ``<style>`` element.
    """
    # Use a context manager so the read handle is closed deterministically.
    with open("notebook.css", "r") as css_in:
        return HTML("<style>" + css_in.read() + "</style>")
nb()

<div class="alert alert-block alert-info"> 😊If you found this to be interesting💡 and helpful✨, Please Upvote🔺🔺 and leave a comment💬 💡</div>

## Import Libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

In [None]:
pd.set_option('display.max_columns',None)
warnings.filterwarnings("ignore")

## Reading Data Frame

In [None]:
df = pd.read_csv("../input/tabular-playground-series-may-2022/train.csv")

In [None]:
df.head().style.background_gradient(cmap="Blues")

In [None]:
for col in df.select_dtypes('int64').columns:
    print(col,'\n',df[col].unique(),'\n\n')

#### **Checking info of features**

In [None]:
df.info()

#### Observations - 

* There are a total of Nine Hundred Thousand Records(900000)
* Out of all the columns, 16 are of type float, another 16 are of type int and 1 is of type object.
* f_07-f_18 and f_29-f_30 are discrete variables from range 1-16
* f_27 has a sequence of alphabets of 10 letters

#### **Checking the data description**

In [None]:
def colour_map(value):
    """Return a CSS ``color`` declaration that depends on the sign of ``value``.

    Negative values map to red, positive values to green, and everything else
    (zero, or values that compare neither below nor above zero) to black.
    """
    shade = 'red' if value < 0 else ('green' if value > 0 else "black")
    return "color: %s" % shade

In [None]:
df.describe().style.background_gradient(cmap="Blues")

In [None]:
df.describe().T.style.applymap(colour_map)

## Exploratory Data Analysis

#### Value Counts For Each Discrete Variables

In [None]:
def feature_value_counts(df,dtype='int64'):
    """Plot a count bar chart and a donut chart for every column of one dtype.

    Args:
        df: Frame whose columns are inspected.
        dtype: Dtype selector passed to ``DataFrame.select_dtypes``. The
            default ``'int64'`` selects the discrete columns of this data set.

    Returns:
        None. One figure per selected column is shown; the ``id`` column is
        skipped when present.
    """
    # Select the discrete columns (in this data set every int64 column is
    # discrete) and leave out the row identifier.
    cols = [col for col in df.select_dtypes(dtype).columns if col != 'id']

    for col in cols:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

        # --- Left panel: value counts as bars ---
        # The series is passed by keyword: recent seaborn releases no longer
        # accept the data vector as the first positional argument.
        sns.countplot(x=df[col], ax=ax1, palette='Set2')
        ax1.set_title(f"Count Of Values For Column {col}")
        ax1.set_ylabel("Count Of Value")
        ax1.set_xlabel("Values")
        # Write the height of each bar above it
        for p in ax1.patches:
            ax1.annotate("{:.1f}".format(p.get_height()),(p.get_x(),p.get_height()),rotation=90)

        # --- Right panel: value shares as a donut chart ---
        df[col].value_counts().plot(kind='pie',ax=ax2,shadow=True,cmap='tab20',autopct="%1.0f%%")
        ax2.set_title(f"Count Of Values For Column {col}")
        ax2.set_ylabel("")
        # A white circle drawn over the centre turns the pie into a donut
        circle = plt.Circle((0,0),.8,color='white')
        ax2.add_artist(circle)

        plt.tight_layout(pad=3)
        plt.show()
        # Release the figure so that looping over many columns does not
        # accumulate open figures in memory.
        plt.close(fig)

In [None]:
feature_value_counts(df)

#### Relation Of Each Discrete Variable With The Target Variable

In [None]:
# Discrete (int64) features. The row identifier and the target are removed by
# name: relying on the position of 'id' is fragile, and plotting the target
# against itself carries no information.
cols = df.select_dtypes('int64').columns.drop(['id', 'target'], errors='ignore')

# One pair of count plots (target = 0 / target = 1) per discrete feature
for col in cols:
    sns.catplot(data=df,x=col,col='target',kind='count',palette='cool')
    plt.show()
    

#### Distributions Of Continuous Variables

In [None]:
cols = df.select_dtypes('float').columns

# Iterate over every float column. No column is skipped here: 'id' is an
# integer column, so slicing off the first entry would drop a real feature.
for col in cols:
    # Create Subplots
    fig,(ax1,ax2,ax3) = plt.subplots(1,3,figsize=(15,7))

    # Plot 1 ---> Box plot of the feature per target class
    sns.boxplot(data=df,x="target",y=col,ax=ax1)

    # Plot 2 ---> Violin plot of the feature per target class
    sns.violinplot(data=df,x='target',y=col,ax=ax2)

    # Plot 3 ---> Scatter plot of the feature against the target
    sns.scatterplot(data=df,x=col,y='target',ax=ax3)

    plt.tight_layout(pad=3)
    plt.show()

#### Checking Correlation Between Variables

In [None]:
# Correlation of every numeric column with the target. Restricting the frame to
# numeric columns keeps the text column f_27 out of corr(), which recent pandas
# versions refuse to skip implicitly.
data = df.select_dtypes('number').corr().loc[:,['target']]

# Remove the row identifier and the target's self-correlation by name instead
# of by position.
data = data.drop(index=['id', 'target'], errors='ignore')

# Fetch Index and Values From Data
index = data.index
values = data.values.flatten()

# Set figure size, title and labels
fig,ax = plt.subplots(figsize=(20,8))
ax.set_title("CORELLATION OF EACH VARIABLE WITH TARGET")
ax.set_xlabel("Columns")
ax.set_ylabel("Correlation")

# Plot the bars on the axes created above: red for negative, green otherwise
plot = ax.bar(index,values,color=['red' if x<0 else 'green' for x in values])

# Annotate each bar with its (rounded) correlation
for p in ax.patches:
    ax.annotate("{:.1f}".format(p.get_height()),(p.get_x(),p.get_height()))

# Show plot
plt.show()

In [None]:
# Pairwise correlation of the numeric columns only; the text column f_27 is
# excluded explicitly because recent pandas versions do not drop it implicitly.
corr = df.select_dtypes('number').corr()

plt.figure(figsize=(20,10))
plt.title("CORELLATION BETWEEN FEATURES")
sns.heatmap(corr,annot=True,cbar=False,cmap="cool",fmt='.2f')

plt.show()

## Data Preparation

In [None]:
train = df.copy()

In [None]:
train.head()

In [None]:
train.drop(columns=['id','f_27'],inplace=True)

In [None]:
train.head()

## Train - Test - Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
Y = train['target']
X = train.drop(columns=['target'],inplace=False)

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.8,random_state=11)

## Modelling

#### Creating a function to check the metrics

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
def metrics(model,X_val,Y_val,nn=False,th=0.5):
    """Print a classification report and plot the confusion matrix for a model.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_val: Validation features passed to ``model.predict``.
        Y_val: True labels for ``X_val``.
        nn: Set to True when ``model.predict`` returns scores that still have
            to be thresholded; otherwise the predictions are used unchanged.
        th: Decision threshold applied when ``nn`` is True; scores greater
            than or equal to ``th`` become class 1.

    Returns:
        None. The report is printed and the confusion matrix is shown.
    """
    if nn:
        # Flatten the score array and apply the threshold in one vectorised step
        Y_pred = (np.asarray(model.predict(X_val)).ravel() >= th).astype(int)
    else:
        # Make predictions on the validation data
        Y_pred = model.predict(X_val)

    # scikit-learn expects the true labels first; with the arguments swapped the
    # report would show precision and recall interchanged.
    print(classification_report(Y_val,Y_pred))

    # confusion_matrix puts the true labels on the rows. It is transposed so
    # that rows are predictions and columns are actual labels, which is what the
    # axis labels below announce.
    cm = confusion_matrix(Y_val,Y_pred).T
    # Plot the matrix
    sns.heatmap(cm,cbar=False,cmap='cool',annot=True,fmt='d')
    # Give labels and title to the plot
    plt.title("Confusion Matrix")
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    # Show the plot
    plt.show()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train,Y_train)

In [None]:
lr.score(X_test,Y_test)

In [None]:
lr.score(X_train,Y_train)

In [None]:
metrics(lr,X_test,Y_test)

### XGBoost

In [None]:
import xgboost as xg

In [None]:
xgb = xg.XGBClassifier()

In [None]:
xgb.fit(X_train,Y_train)

In [None]:
xgb.score(X_train,Y_train)

In [None]:
xgb.score(X_test,Y_test)

In [None]:
Y_pred = xgb.predict(X_test)
Y_val = Y_test

In [None]:
metrics(xgb,X_test,Y_test)

## Let's do some feature engineering with f_27 and check the accuracy

### What kind of features can be build?

- Count of each alphabet
- Sum of position of each alphabet


In [None]:
# Distinct letters occurring in f_27, in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order, which replaces
# the manual membership checks and the hard-coded 26-letter early exit.
alpha_list = list(dict.fromkeys(
    alpha for val in df['f_27'].values for alpha in val
))

In [None]:
print(sorted(alpha_list))

In [None]:
len(alpha_list)

* There are a total of 20 alphabets from A to T

#### Let's get the count of each alphabet for each record

In [None]:
# List when we will append the count
total_counts = []
# List when we will append the sums of position
position_list = []

# Iterate throught each value
for val in df['f_27']:
    # Dictionary of alphabets where values represent counts
    alpha_dict_count = {'A':0, 'B':0, 'C':0, 'D':0, 'E':0, 'F':0, 'G':0, 'H':0, 'I':0, 'J':0, 'K':0, 'L':0, 'M':0, 'N':0, 'O':0, 'P':0, 'Q':0, 'R':0, 'S':0, 'T':0}
    # dictionary of aplhabets where values represent the position of an aplhabet
    alpha_dict_pos = {'A':0, 'B':0, 'C':0, 'D':0, 'E':0, 'F':0, 'G':0, 'H':0, 'I':0, 'J':0, 'K':0, 'L':0, 'M':0, 'N':0, 'O':0, 'P':0, 'Q':0, 'R':0, 'S':0, 'T':0}
    for position,alpha in enumerate(val):
        # Increment the count of that alphaet
        alpha_dict_count[alpha]+=1
        # Calculate summ of position of that alphabet
        alpha_dict_pos[alpha] += position
    # Add the values as a list to total_counts
    total_counts.append([x for x in alpha_dict_count.values()])
    # Add the values as a list to position_list
    position_list.append([x for x in alpha_dict_pos.values()])

In [None]:
# Preview only the first records; displaying the full list would print one
# entry per row of the data set.
position_list[:5]

In [None]:
# Preview only the first records; displaying the full list would print one
# entry per row of the data set.
total_counts[:5]

In [None]:
feature_count = pd.DataFrame(total_counts,columns=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T'])
feature_sum_pos = pd.DataFrame(position_list,columns=['As', 'Bs', 'Cs', 'Ds', 'Es', 'Fs', 'Gs', 'Hs', 'Is', 'Js', 'Ks', 'Ls', 'Ms', 'Ns', 'Os', 'Ps', 'Qs', 'Rs', 'Ss', 'Ts'])

### Concatenating it with train data

In [None]:
train = df.copy()

In [None]:
train = pd.concat([train,feature_count,feature_sum_pos],axis=1)

In [None]:
train.head()

In [None]:
train.shape

### Drop unnecessary features

In [None]:
train.drop(columns=['id','f_27'],inplace=True)

## Train - Test - Split

In [None]:
Y = train['target']
X = train.drop(columns=['target'],inplace=False)

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.8,random_state=11)

## Modelling

#### Logistic Regression

In [None]:
lr2 = LogisticRegression()

In [None]:
lr2.fit(X_train,Y_train)

In [None]:
lr2.score(X_train,Y_train)

In [None]:
lr2.score(X_test,Y_test)

In [None]:
metrics(lr2,X_test,Y_test)

#### XGB Classifier

In [None]:
xgb2 = xg.XGBClassifier()

In [None]:
xgb2.fit(X_train,Y_train)

In [None]:
xgb2.score(X_train,Y_train)

In [None]:
xgb2.score(X_test,Y_test)

In [None]:
metrics(xgb2,X_test,Y_test)

## Let's try to create latent features using Keras Embedding for f_27

In [None]:
from tensorflow.keras.preprocessing.text import one_hot

In [None]:
sequences = df['f_27'].values
sequences

In [None]:
# Separate the letters of every sequence with single spaces so that each letter
# is handled as its own token by one_hot. ' '.join works for any sequence
# length, whereas the manual loop hard-coded the index of the last letter.
n_sequences = [' '.join(sequence) for sequence in sequences]

### One Hot Representation

In [None]:
one_hot_repr = [one_hot(alphabet,30) for alphabet in n_sequences]

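* Each letter is mapped to an integer index by hashing it into a vocabulary of size 30. Because `one_hot` relies on hashing, different letters are not guaranteed to receive different indices.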

### Word Embedding Representation

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

#### Create a model with embedding layer

In [None]:
# Dimensions of features wanted
dimensions = 1
# Vocab size of alphabet
voc_size = 30
# Length of input
input_length = 10

In [None]:
# Create a sequential object
model = Sequential()
# Add an Embedding Layer
model.add(Embedding(voc_size,dimensions,input_length=input_length))
# Compilee Model
model.compile(optimizer='adam',loss='mse')

In [None]:
model.summary()

In [None]:
# Convert the nested Python list into a 2-D integer array first, so that
# predict receives a single array input of shape (rows, letters).
embedded_features = model.predict(np.asarray(one_hot_repr))

In [None]:
print(embedded_features.shape)
embedded_features

In [None]:
# Flatten the embedding output to one row per record. The row count is taken
# from the array itself instead of being hard-coded to the training-set size.
embedded_reshaped_features = pd.DataFrame(
    embedded_features.reshape(embedded_features.shape[0], -1),
    columns=[f"e{i}" for i in range(1,11)],
)

In [None]:
df.shape

In [None]:
train = df.copy()

In [None]:
train.drop(columns=['id','f_27'],inplace=True)

In [None]:
train = pd.concat([train,feature_count,feature_sum_pos,embedded_reshaped_features],axis=1)

In [None]:
train1 = train.copy()

In [None]:
con_col = train1.select_dtypes(float).columns

# Box plot of the cube root of every continuous column. The values are read
# from the untouched copy `train1`, so the plots stay the same even after
# `train` has been transformed, and the series is passed by keyword because
# recent seaborn releases no longer accept it positionally.
for col in con_col:
    sns.boxplot(x=np.cbrt(train1[col]))
    plt.show()

In [None]:
# Replace the continuous columns by their cube roots. The source values come
# from the untouched copy `train1`, which makes this cell idempotent:
# re-running it does not apply the cube root a second time.
train[con_col] = np.cbrt(train1[con_col])

In [None]:
train.head()

## Train Test Split

In [None]:
X = train.drop(columns=['target'],inplace=False)
Y = train['target']

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.8,random_state=11)

## Modelling

#### Logistic Regression

In [None]:
lr3 = LogisticRegression()

In [None]:
lr3.fit(X_train,Y_train)

In [None]:
lr3.score(X_train,Y_train)

In [None]:
lr3.score(X_test,Y_test)

In [None]:
metrics(lr3,X_test,Y_test)

#### XGB

In [None]:
import xgboost as xg

In [None]:
xg3 = xg.XGBClassifier()

In [None]:
xg3.fit(X_train,Y_train)

In [None]:
xg3.score(X_train,Y_train)

In [None]:
xg3.score(X_test,Y_test)

In [None]:
metrics(xg3,X_test,Y_test)

## Making Predictions On Test Set And Submit

In [None]:
def data_preparation(df, embedding_model):
    """Build the model-ready feature frame from a raw competition frame.

    Three feature groups are derived from the ``f_27`` letter string and
    appended to the remaining columns of ``df``:

    * per-letter occurrence counts (columns ``A`` .. ``T``),
    * per-letter sums of the 0-based positions (columns ``As`` .. ``Ts``),
    * the output of ``embedding_model`` for the hashed letters, flattened to
      one row per record (columns ``e1``, ``e2``, ...).

    Args:
        df: Frame containing at least the ``id`` and ``f_27`` columns. It is
            not modified.
        embedding_model: Object whose ``predict`` method receives the integer
            encoded sequences as a 2-D array and returns an array with one
            leading entry per record.

    Returns:
        A new DataFrame without ``id`` and ``f_27`` and with the derived
        feature columns appended, indexed like ``df``.

    Raises:
        KeyError: If a value of ``f_27`` contains a character outside A-T.
    """
    letters = list('ABCDEFGHIJKLMNOPQRST')
    sequences = df['f_27'].values

    feature_count, feature_sum_pos = _letter_count_and_position_features(
        sequences, letters, df.index
    )

    # Separate the letters with spaces so one_hot treats each one as a token
    n_sequences = [' '.join(sequence) for sequence in sequences]

    # Hash every letter into an integer index of a vocabulary of size 30
    one_hot_repr = [one_hot(alphabet,30) for alphabet in n_sequences]

    # Use the model that was passed in (not a notebook-level global) and hand
    # it a proper 2-D array instead of a nested Python list.
    embedded_features = embedding_model.predict(np.asarray(one_hot_repr))

    # Flatten to one row per record and name the columns e1, e2, ...
    n_rows = df.shape[0]
    flat_embedding = embedded_features.reshape(n_rows, -1)
    embedded_reshaped_features = pd.DataFrame(
        flat_embedding,
        columns=[f"e{i}" for i in range(1, flat_embedding.shape[1] + 1)],
        index=df.index,
    )

    # All derived frames share df's index, so the column-wise concat lines the
    # rows up correctly even when df does not carry a default RangeIndex.
    df_w_e_f = pd.concat(
        [df, feature_count, feature_sum_pos, embedded_reshaped_features],
        axis=1,
    )

    # Remove ID and f_27
    return df_w_e_f.drop(columns=['id','f_27'])


def _letter_count_and_position_features(sequences, letters, index):
    """Return the per-letter count frame and position-sum frame for ``sequences``."""
    total_counts = []
    position_list = []

    for val in sequences:
        # Fresh zero-initialised tallies for this record
        alpha_dict_count = dict.fromkeys(letters, 0)
        alpha_dict_pos = dict.fromkeys(letters, 0)
        for position, alpha in enumerate(val):
            # Count the occurrence and accumulate its 0-based position
            alpha_dict_count[alpha] += 1
            alpha_dict_pos[alpha] += position
        total_counts.append(list(alpha_dict_count.values()))
        position_list.append(list(alpha_dict_pos.values()))

    feature_count = pd.DataFrame(total_counts, columns=letters, index=index)
    feature_sum_pos = pd.DataFrame(
        position_list,
        columns=[f"{letter}s" for letter in letters],
        index=index,
    )
    return feature_count, feature_sum_pos

In [None]:
test = pd.read_csv("../input/tabular-playground-series-may-2022/test.csv")

In [None]:
test_id = test['id']

In [None]:
clean_data = data_preparation(df=test, embedding_model=model)

In [None]:
# xg3 was fitted on features whose continuous columns had been replaced by
# their cube roots, so the same transform has to be applied to the test
# features before predicting. A copy is used to keep this cell idempotent.
clean_data_cbrt = clean_data.copy()
clean_data_cbrt[con_col] = np.cbrt(clean_data_cbrt[con_col])
Y_pred = xg3.predict(clean_data_cbrt)

In [None]:
submission = pd.DataFrame({"id":test_id,'target':Y_pred})

In [None]:
submission.head(10)

In [None]:
submission['target'].value_counts()

## Let's Try Neural Network Approach

In [None]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import plot_model

In [None]:
# Fully connected binary classifier: the layer widths grow from 80 to 400
# units and shrink back to 20 before the single sigmoid output unit.
nn_model = Sequential([
    # Input layer: expects 80 features per record
    Dense(units=80,input_dim=80,activation='relu'),
    # Hidden layers
    Dense(units=200,activation='relu'),
    Dense(units=400,activation='relu'),
    Dense(units=200,activation='relu'),
    Dense(units=80,activation='relu'),
    Dense(units=20,activation='relu'),
    # Output layer: a single sigmoid unit
    Dense(units=1,activation='sigmoid'),
])

# Compile Model
nn_model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
train = data_preparation(df=df,embedding_model=model)

In [None]:
train.head()

In [None]:
train.shape

## Train Test Split

In [None]:
X = train.drop(columns=['target'],inplace=False)
Y = train['target']

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.8,random_state=11)

## Fit Model

In [None]:
nn_model.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=20)

In [None]:
Y_pred = [1 if x>0.5 else 0 for x in nn_model.predict(X_test)]

In [None]:
metrics(nn_model,X_test,Y_test,nn=True,th=0.45)

## Test on Test Data For Submission

In [None]:
test = pd.read_csv("../input/tabular-playground-series-may-2022/test.csv")

In [None]:
test_id = test['id']

In [None]:
clean_data = data_preparation(test,model)

In [None]:
clean_data.shape

In [None]:
Y_pred = [1 if x>0.5 else 0 for x in nn_model.predict(clean_data)]

In [None]:
submission_nn = pd.DataFrame({"id":test_id,'target':Y_pred})

In [None]:
submission_nn.head(10)

<div class="alert alert-block alert-info"> We did it !!!🎉🎉🎉 we have successfully performed data visualization📊, data cleaning🧹, feature engineering🔧 and trained🏋️ a classifier with an accuracy of ~95%✅</div>

<div class="alert alert-block alert-info"> 📌 I hope this was helpful✨, and thank you very much for checking this notebook. Please Upvote🔺🔺 and leave a comment💬 if you found this interesting💡 and useful.😀</div>