<a href="https://colab.research.google.com/github/RahulDogra-92/Supervised-Learning-Pipeline/blob/main/Supervised_Learning_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

In [64]:
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")  #Ignore the warnings
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [65]:
# Load the raw training data from the CSV sitting next to the notebook.
TRAIN_CSV_PATH = 'train_s3TEQDk.csv'
train = pd.read_csv(TRAIN_CSV_PATH)

In [66]:
# Preview the first three rows to sanity-check the columns that were loaded
train.head(n=3)

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0


In [67]:
# Per-column tally of missing entries (isna is the same check as isnull)
train.isna().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         29325
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64

In [68]:
# Impute missing Credit_Product entries with the column's most frequent value.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection `train['Credit_Product']`:
# that chained in-place form operates on an intermediate object and does not
# update `train` once pandas Copy-on-Write is active.
credit_product_mode = str(train['Credit_Product'].mode().values[0])
train['Credit_Product'] = train['Credit_Product'].fillna(credit_product_mode)

In [69]:
# Re-count missing entries per column after the imputation step
train.isna().sum()

ID                     0
Gender                 0
Age                    0
Region_Code            0
Occupation             0
Channel_Code           0
Vintage                0
Credit_Product         0
Avg_Account_Balance    0
Is_Active              0
Is_Lead                0
dtype: int64

Feature Engineering

In [70]:
# Sub-frame holding only the object-dtype (string) columns of `train`
train_categorical = train.select_dtypes(include='object')
train_categorical.head(n=3)

Unnamed: 0,ID,Gender,Region_Code,Occupation,Channel_Code,Credit_Product,Is_Active
0,NNVBBKZB,Female,RG268,Other,X3,No,No
1,IDD62UNG,Female,RG277,Salaried,X1,No,No
2,HD3DSEMC,Female,RG268,Self_Employed,X3,No,Yes


In [71]:
# Remove the ID column from the categorical frame before encoding.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# 'ID' is already gone.
train_categorical = train_categorical.drop(columns=['ID'], errors='ignore')
train_categorical.head(3)

Unnamed: 0,Gender,Region_Code,Occupation,Channel_Code,Credit_Product,Is_Active
0,Female,RG268,Other,X3,No,No
1,Female,RG277,Salaried,X1,No,No
2,Female,RG268,Self_Employed,X3,No,Yes


In [72]:
# Fit a separate LabelEncoder for every categorical column and keep each one
# in `label_encoders` (column name -> fitted encoder). Re-fitting one shared
# encoder per column would discard every mapping but the last, making it
# impossible to inverse-transform the codes or to encode new data the same way.
# NOTE: re-running this cell on the already-encoded frame re-fits the encoders
# on integer codes, so run it once after the selection cell.
label_encoders = {}
encoded_columns = {}
for column in train_categorical.columns:
    # `le` ends up bound to the encoder fitted on the last column, which is
    # the state the previous single-encoder version left it in.
    le = preprocessing.LabelEncoder()
    encoded_columns[column] = le.fit_transform(train_categorical[column])
    label_encoders[column] = le

# Rebuild the frame from the encoded arrays, preserving index and column order
train_categorical = pd.DataFrame(encoded_columns, index=train_categorical.index)
train_categorical.head(3)

Unnamed: 0,Gender,Region_Code,Occupation,Channel_Code,Credit_Product,Is_Active
0,0,18,1,2,0,0
1,0,27,2,0,0,0
2,0,18,3,2,0,1


In [73]:
# Show the dtype of every column in the encoded categorical frame
categorical_dtypes = train_categorical.dtypes
print(categorical_dtypes)

Gender            int64
Region_Code       int64
Occupation        int64
Channel_Code      int64
Credit_Product    int64
Is_Active         int64
dtype: object


In [74]:
# Replace the raw string columns in `train` with their label-encoded versions.
# Columns that share a name with `train_categorical` are dropped first
# (errors='ignore' covers the first run, where some are absent only if already
# removed), then any remaining object columns such as ID are excluded. This
# makes the cell idempotent: re-running it no longer appends a second copy of
# the encoded columns.
numeric_features = (
    train
    .drop(columns=train_categorical.columns, errors='ignore')
    .select_dtypes(exclude=['object'])
)
train = pd.concat([numeric_features, train_categorical], axis=1)
train.head()

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Is_Lead,Gender,Region_Code,Occupation,Channel_Code,Credit_Product,Is_Active
0,73,43,1045696,0,0,18,1,2,0,0
1,30,32,581988,0,0,27,2,0,0,0
2,56,26,1484315,0,0,18,3,2,0,1
3,34,19,470454,0,1,20,2,0,0,0
4,30,33,886787,0,0,32,2,0,0,0


In [75]:
# Store the Is_Lead target column with pandas' categorical dtype
train = train.astype({'Is_Lead': 'category'})

In [76]:
# Separate the target (Is_Lead) from the feature columns
y = train['Is_Lead']
X = train.drop(columns=['Is_Lead'])

Your First Pipeline

In [77]:
# Hold out 20% of the rows as a test set; random_state=1 pins the shuffle so
# the split is the same on every run. train_test_split comes from the
# notebook's import cell rather than being imported mid-notebook.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=1)

In [78]:
# Random forest with its seed pinned to 2, fitted on the training split.
# The fit call is the cell's last expression, so the estimator repr is shown.
rf_model = RandomForestClassifier(random_state=2)
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=2, verbose=0,
                       warm_start=False)

In [80]:
# Predict labels for the held-out test features with the fitted forest
rf_predictions = rf_model.predict(X=X_test)

In [83]:
# Accuracy of the random forest's predictions on the held-out test labels
print(accuracy_score(y_true=y_test, y_pred=rf_predictions))

0.7796113541560687


Model complexity and overfitting