In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Variables

*  age (numeric)
*  job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student", "blue-collar","self-employed","retired","technician","services")
*  marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
*  education (categorical: "unknown","secondary","primary","tertiary")
*  default: has credit in default? (binary: "yes","no")
*  balance: average yearly balance, in euros (numeric)
*  housing: has housing loan? (binary: "yes","no")
*  loan: has personal loan? (binary: "yes","no")

### related with the last contact of the current campaign:
*  contact: contact communication type (categorical: "unknown","telephone","cellular")
*  day: last contact day of the month (numeric)
*  month: last contact month of year (categorical: "jan", "feb", "mar", …, "nov", "dec")
*  duration: last contact duration, in seconds (numeric)

### other attributes:
*  campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
*  pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
*  previous: number of contacts performed before this campaign and for this client (numeric)
*  poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

### Output variable (desired target):
* y - has the client subscribed to a term deposit? (binary: "yes","no")

In [None]:
# Load the semicolon-separated training file into `df`, then preview the first rows.
df = pd.read_csv(
    '../input/banking-dataset-marketing-targets/train.csv',
    sep=";",
)
df.head()

In [None]:
# Preview the last rows of the DataFrame (tail() called with its default row count).
df.tail()

### Understanding the data

* shape
* data type of all the columns
* missing values in all columns
* basic stats of all columns
* distinct values in all columns

#### Shape

In [None]:
# Dimensions of the DataFrame.
df.shape # (There are 45211 rows and 17 columns)

#### Data type of all the columns

In [None]:
df.dtypes # The dtypes property lists the data type of each column in the DataFrame.


In [None]:
# Selecting one column of a DataFrame (here via attribute access, df.marital)
# gives a Series; a Series represents a single column of a DataFrame.
# type() is called to show the class of the selected object.
type(df.marital)

#### Missing values in all columns

In [None]:
# Report how many missing entries each column contains.
print("Missing values in all the columns:", df.isna().sum(), sep="\n")


In [None]:
# Count the missing values in each column and list the columns
# from the highest count to the lowest (ascending=False).
df.isnull().sum().sort_values(ascending=False)

#### Basic stats of all columns

In [None]:
# Basic summary statistics from describe() with its default arguments.
df.describe()

In [None]:
# Summary statistics again, this time passing include='all' to describe().
df.describe(include='all')

In [None]:
#### Distinct values in all columns

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Distinct-value count with the axis passed explicitly (axis=0);
# presumably the same result as calling nunique() with no arguments - confirm.
df.nunique(axis=0)

############################################################################################

### Basic EDA

* Number of records in different job types 
* Number of records in different marital status 
* Number of records in different education types 
* Number of records in different default types 
* Number of records in different y types 

#### Number of records in different job types 

In [None]:
# Number of records for each job type.
# size() counts the rows of every group directly, so the result does not depend
# on whether `balance` contains missing values, and the count column is labelled
# `count` instead of `balance`.
df.groupby('job').size().reset_index(name='count')

#### Number of records in different marital status 

In [None]:
# Number of records for each marital status.
# size() counts the rows of every group directly, so the result does not depend
# on whether `balance` contains missing values, and the count column is labelled
# `count` instead of `balance`.
df.groupby('marital').size().reset_index(name='count')

#### Number of records in different education types 

In [None]:
# Number of records for each education level.
# size() counts the rows of every group directly, so the result does not depend
# on whether `balance` contains missing values, and the count column is labelled
# `count` instead of `balance`.
df.groupby('education').size().reset_index(name='count')

#### Number of records in different default types 

In [None]:
# Number of records for each value of the credit-default flag.
# size() counts the rows of every group directly, so the result does not depend
# on whether `balance` contains missing values, and the count column is labelled
# `count` instead of `balance`.
df.groupby('default').size().reset_index(name='count')

#### Number of records in different y types 

In [None]:
# Number of records for each value of the target variable y.
# size() counts the rows of every group directly, so the result does not depend
# on whether `balance` contains missing values, and the count column is labelled
# `count` instead of `balance`.
df.groupby('y').size().reset_index(name='count')

############################################################################################

### Slicing the data

* Select rows with age above 25 and less than 45 and select age, balance and duration columns to create a separate dataframe. Calculate avg values of these columns.
* Create a mapping for month column and create a new month column eg jan-1, feb-2 .....
* Create a cross tab with loan and housing loan column, loan and y & housing and y


#### Select rows with age above 25 and less than 45 and select age, balance and duration columns to create a separate dataframe. Calculate avg values of these columns.

In [None]:
# Keep only the rows whose age is strictly greater than 25 and strictly less than 45.
df_temp_new = df.loc[(df['age'] < 45) & (df['age'] > 25)]

In [None]:
# Keep only the age, balance and duration columns of the age-filtered rows.
df_new = df_temp_new[['age', 'balance', 'duration']]

# Average value of each selected column, as the section heading asks for.
print("Average values of age, balance and duration:")
print(df_new.mean())

# Preview the first rows of the reduced DataFrame.
df_new.head()

In [None]:
# Distinct values that occur in the month column.
df.month.unique()

#### Create a mapping for month column and create a new month column eg jan-1, feb-2 .....

In [None]:
# Translate each three-letter month abbreviation into its calendar number
# (jan -> 1, feb -> 2, ..., dec -> 12) and store it in a new `month_new` column.
df['month_new'] = df['month'].map({
    month: number
    for number, month in enumerate(
        ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec'],
        start=1,
    )
})
df.head()

#### Create a cross tab with loan and housing loan column, loan and y & housing and y

In [None]:
# Cross-tabulate personal loan (rows) against housing loan (columns).
pd.crosstab(index=df['loan'], columns=df['housing'])

In [None]:
# Cross-tabulate personal loan (rows) against the target y (columns).
pd.crosstab(index=df['loan'], columns=df['y'])

In [None]:
# Cross-tabulate the target y (rows) against housing loan (columns).
pd.crosstab(index=df['y'], columns=df['housing'])

############################################################################################

### Visualization

* Distribution on age
* Distribution on balance
* Distribution on duration
* Bar plot of jobs
* Bar plot of marital
* Bar plot of education

#### Distribution on age

In [None]:
import seaborn as sns 

In [None]:
# Count plot of the age column: one bar per distinct age value.
sns.countplot(x='age',data=df)

#### Distribution on balance

In [None]:
# `plt` is used below but is not imported anywhere else in the notebook,
# so the import is done here to avoid a NameError.
import matplotlib.pyplot as plt

# A very wide figure is requested because the count plot draws one bar
# per distinct balance value.
plt.figure(figsize=(100, 18))
sns.countplot(x='balance', data=df)
plt.show()

#### Distribution on duration

In [None]:
# Count plot of the duration column: one bar per distinct duration value.
# NOTE(review): duration is numeric (seconds), so this may produce a very large
# number of bars; a histogram may be easier to read - confirm.
sns.countplot(x='duration',data=df)

#### Bar plot of jobs

In [None]:
# seaborn is imported again here (it is already imported in an earlier cell).
import seaborn as sns
# Bar plot with job on the x axis and age on the y axis.
# NOTE(review): the bar height is an aggregate of age within each job
# (presumably seaborn's default estimator, the mean), not a record count - confirm.
sns.barplot(x="job", y="age", data=df)

#### Bar plot of marital

In [None]:
# Bar plot with marital status on the x axis and age on the y axis.
# NOTE(review): the bar height is an aggregate of age within each category
# (presumably seaborn's default estimator, the mean), not a record count - confirm.
sns.barplot(x="marital", y="age", data=df)

#### Bar plot of education

In [None]:
# Bar plot with education on the x axis and age on the y axis.
# NOTE(review): the bar height is an aggregate of age within each category
# (presumably seaborn's default estimator, the mean), not a record count - confirm.
sns.barplot(x="education", y="age", data=df)

############################################################################################

### Stats

* Perform t test/ anova on y * age
* Perform t test/ anova on y * balance
* Perform chi sq on y * job
* Perform chi sq on y * education

#### Perform t test/ anova on y * age

In [None]:
 # Cast the age column to int. The result is only displayed, not assigned,
 # so `df` itself is left unchanged by this cell.
 # NOTE(review): this line starts with a stray leading space - confirm the
 # notebook tolerates it, or remove the space.
 df.age.astype(int)

In [None]:
# Encode the target as a number in a new `y_new` column: "yes" -> 1, "no" -> 0.
df['y_new'] = df['y'].map(dict(yes=1, no=0))
df.head()

In [None]:
from scipy import stats

# Two-sample t-test of age between clients who subscribed (y_new == 1)
# and clients who did not (y_new == 0).
# The two samples must be the ages of the two groups; passing `age` and the
# 0/1 `y_new` column as the samples would only compare the mean age with the
# subscription rate, which says nothing about how age relates to y.
age_subscribed = df.loc[df['y_new'] == 1, 'age']
age_not_subscribed = df.loc[df['y_new'] == 0, 'age']

# equal_var=False (Welch's t-test) avoids assuming that both groups have the
# same variance.
stats.ttest_ind(age_subscribed, age_not_subscribed, equal_var=False)

#### Perform t test/ anova on y * balance

In [None]:
# Two-sample t-test of balance between clients who subscribed (y_new == 1)
# and clients who did not (y_new == 0).
# The two samples must be the balances of the two groups; passing `balance` and
# the 0/1 `y_new` column as the samples would only compare the mean balance with
# the subscription rate, which says nothing about how balance relates to y.
balance_subscribed = df.loc[df['y_new'] == 1, 'balance']
balance_not_subscribed = df.loc[df['y_new'] == 0, 'balance']

# equal_var=False (Welch's t-test) avoids assuming that both groups have the
# same variance.
stats.ttest_ind(balance_subscribed, balance_not_subscribed, equal_var=False)

#### Perform chi sq on y * job

In [None]:
# Contingency table: encoded target y_new (rows) against job type (columns).
contigency = pd.crosstab(index=df['y_new'], columns=df['job'])
contigency

In [None]:
#Chi-square test of independence. 
from scipy.stats import chi2_contingency
c, p, dof, expected = chi2_contingency(contigency) 
# Print the p-value
print(p)

#### Perform chi sq on y * education

In [None]:
# Contingency table: encoded target y_new (rows) against education level (columns).
contigency = pd.crosstab(index=df['y_new'], columns=df['education'])
contigency

In [None]:
# Chi-square test of independence on the y_new x education contingency table.
# chi2_contingency is imported in an earlier cell. The result is unpacked into
# four names; only the p-value `p` is used below.
c, p, dof, expected = chi2_contingency(contigency) 
# Print the p-value
print(p)

############################################################################################

### Insights

#### Build assumptions and test using the data
 
* Do younger customers book more term deposits (TDs) than the older generation?
* Do people who already have a loan tend to buy TDs?
* Add some more and test your assumptions/hypotheses...

#### Do younger customers book more term deposits (TDs) than the older generation?

In [None]:
# Cross-tabulate age (rows) against the encoded target y_new (columns).
pd.crosstab(index=df['age'], columns=df['y_new'])

#### Do people who already have a loan tend to buy TDs?

In [None]:
# Stacked bar chart of the personal loan (rows) x y_new (columns) cross-tabulation.
pd.crosstab(index=df['loan'], columns=df['y_new']).plot.bar(stacked=True)

#### Add some more and test your assumptions/hypotheses...