In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# First look at the CSV, parsed with pandas' default settings

csv_path = '/kaggle/input/bank-telemarketing-campaign-case/bank_marketing_updated_v1.csv'
data = pd.read_csv(csv_path)

# Preview the first four rows
data.head(n=4)

In [None]:
# The first two rows of the file have discrepancies, so re-read the CSV
# and skip them while parsing

csv_path = '/kaggle/input/bank-telemarketing-campaign-case/bank_marketing_updated_v1.csv'
data = pd.read_csv(csv_path, skiprows=2)

# Preview the first four rows (original note: this parse looks good)
data.head(n=4)

1. After inspecting the dataset we can say that 'customerid' is of no importance in the analysis process, so we can drop it.
2. The 'jobedu' column should be separated into two different columns
3. We can create a separate column for year
4. We can change the datatypes of column 'age', 'month'


In [None]:
# 1. Remove the 'customerid' column from the frame in place
data.drop(columns=['customerid'], inplace=True)

# Preview the frame without that column
data.head()

Regular function:

> def square(x):
>     return x * x
    
Lambda function:

> lambda x: x * x


In [None]:
# 2. Break 'jobedu' into separate 'job' and 'education' columns with lambda functions

# Split every 'jobedu' string on commas once and reuse the pieces below
jobedu_parts = data['jobedu'].map(lambda text: text.split(','))

# First piece -> job
data['job'] = jobedu_parts.map(lambda pieces: pieces[0])

# Second piece -> education
data['education'] = jobedu_parts.map(lambda pieces: pieces[1])

# The combined column is no longer needed once both pieces are stored
data.drop(columns=['jobedu'], inplace=True)

# Show the result
data.head()

In [None]:
# Count the missing entries in every column
data.isna().sum()

In [None]:
# Discard, in place, every row that lacks a value in 'age' or in 'response'

required_columns = ['age', 'response']
data.dropna(subset=required_columns, inplace=True)

# Re-count the missing entries per column
data.isnull().sum()

In [None]:
# Fill the missing entries of the 'month' column with its most frequent value

# mode() returns a Series of the most frequent value(s); take the first one
month_mode = data['month'].mode()[0]

# Assign the filled column back to the frame. Calling fillna(..., inplace=True)
# on `data.month` operates on an intermediate Series: pandas warns about that
# pattern, and with Copy-on-Write enabled it leaves `data` unchanged.
data['month'] = data['month'].fillna(month_mode)

# Re-count missing values per column to confirm 'month' has none left
data.isnull().sum()

In [None]:
# 3. Break 'month' into a month part and a year part with lambda functions
# NOTE(review): the pieces are stored exactly as split(',') returns them, so any
# whitespace around the comma is kept - confirm whether stripping is wanted

# Split every 'month' string on commas once and reuse the pieces below
month_parts = data['month'].map(lambda text: text.split(','))

# First piece -> month1
data['month1'] = month_parts.map(lambda pieces: pieces[0])

# Second piece -> year
data['year'] = month_parts.map(lambda pieces: pieces[1])

# The combined column is no longer needed once both pieces are stored
data.drop(columns=['month'], inplace=True)

# Show the result
data.head()

In [None]:
# 4. Changing the datatypes of columns where necessary

# astype() returns a new Series and does not modify the frame, so the result
# has to be stored back for the cast to 32-bit integers to take effect
data['age'] = data['age'].astype('int32')

# Column dtypes after the cast
data.dtypes


### Handling Outliers:

Two types of outliers:
1. Univariate Outlier
2. Multivariate Outlier

We can handle outliers by dropping them, imputing replacement values, or leaving them as is.

### Analysis:

#### 1. Univariate Analysis

**a. Categorical Unordered Univariate Analysis**

In [None]:
# Job status category: number of records per job

# Compute the counts once and reuse them for both the chart and the table
job_counts = data['job'].value_counts()

ax = job_counts.plot.barh()
ax.set(title='Records per job', xlabel='Number of records', ylabel='Job')
plt.show()

# Only the last expression of a cell is displayed, so the table goes last
job_counts


In [None]:
# Share of each job, obtained with normalize=True (fractions that sum to 1)

# Compute the shares once and reuse them for both the chart and the table
job_shares = data['job'].value_counts(normalize=True)

ax = job_shares.plot.barh()
ax.set(title='Share of records per job', xlabel='Fraction of records', ylabel='Job')
plt.show()

# Only the last expression of a cell is displayed, so the table goes last
job_shares

**b. Categorical Ordered Univariate Analysis**

In [None]:
# Education variable: share of records per education level

# Compute the shares once and reuse them for both the chart and the table
education_shares = data['education'].value_counts(normalize=True)

ax = education_shares.plot.pie()
# Drop the y-label pandas derives from the Series name; the title is enough
ax.set(title='Share of records per education level', ylabel='')
plt.show()

# Only the last expression of a cell is displayed, so the table goes last
education_shares

In [None]:
# Month variable: number of records per value of 'month1'

# Compute the counts once and reuse them for both the chart and the table
month_counts = data['month1'].value_counts()

ax = month_counts.plot.barh()
ax.set(title='Records per month', xlabel='Number of records', ylabel='Month')
plt.show()

# Only the last expression of a cell is displayed, so the table goes last
month_counts

**c. Numerical Univariate Analysis**

In [None]:
# Summary statistics of the 'age' column
data['age'].describe()

#### 2. Bivariate Analysis

**a. Numeric-Numeric Analysis**

This analysis can be performed by using:
1. Scatter Plot
2. Pair Plot
3. Correlation Matrix

1. Scatter Plot

In [None]:
# Scatter plots for the salary, balance and age variables

# 1. Age (x-axis) against salary (y-axis)
data.plot(kind='scatter', x='age', y='salary')
plt.show()

In [None]:
# 2. Age (x-axis) against balance (y-axis)
data.plot(kind='scatter', x='age', y='balance')
plt.show()

In [None]:
# 3. Salary (x-axis) against balance (y-axis)
data.plot(kind='scatter', x='salary', y='balance')
plt.show()

2. Pair Plot

In [None]:
# Seaborn pair plot restricted to the three numeric columns of interest
pairplot_columns = ['salary', 'balance', 'age']
sns.pairplot(data, vars=pairplot_columns)
plt.show()

3. Correlation Matrix

In [None]:
# Pairwise correlation table for age, salary and balance
corr_columns = ['age', 'salary', 'balance']
data[corr_columns].corr()

In [None]:
# Draw the age / salary / balance correlation matrix as an annotated heatmap
corr_matrix = data[['age', 'salary', 'balance']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='Blues')
plt.show()

**b. Numeric-Categorical Analysis**

In [None]:
# Mean balance within each 'loan' group
balance_by_loan = data.groupby('loan')['balance']
balance_by_loan.mean()

In [None]:
# Median balance within each 'loan' group
balance_by_loan = data.groupby('loan')['balance']
balance_by_loan.median()

In [None]:
# Box plot of balance for each 'loan' group

# Name the x/y variables by keyword: recent seaborn releases no longer accept
# them positionally, and the keyword form also works on older releases
sns.boxplot(data=data, x='loan', y='balance')
plt.show()

**c. Categorical-Categorical Analysis**

In [None]:
# Encode the response as a numeric flag:
# 1 = response is 'yes'
# 0 = any other value

said_yes = data['response'] == 'yes'
data['response_rate'] = np.where(said_yes, 1, 0)

# Number of rows for each flag value
data['response_rate'].value_counts()

In [None]:
# Mean of the 0/1 response flag within each marital status, drawn as a bar chart
rate_by_marital = data.groupby('marital')['response_rate'].mean()
rate_by_marital.plot.bar()
plt.show()