In [32]:
import pandas as pd # Dataframe
import numpy as np # maths

################ Machine Learning #######################
from sklearn import preprocessing # Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


################ Data visualization #######################
import matplotlib.pyplot as plt # Plot 
plt.rc("font", size=14)
import seaborn as sns # Plot advanced version of matplotlib
import plotly
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

ModuleNotFoundError: No module named 'plotly'

# Importing dataset

In [None]:
# Read the data in
data = pd.read_csv(r"C:\Users\GENIUS\Desktop\Machine Learning\Logicals Regression\Credit_Card.csv")

# Column name and Dimension & Dropping Missing Value

In [None]:
data

In [None]:
data=data.dropna()  ## Row delete
data.shape # dimension

# Data Structure or type 

In [None]:
data.info()

# Data Pre-Processing / EDA

#### Assign Lables to  Categorical data 

In [None]:
data.Gender.value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder
data.Gender=(LabelEncoder().fit_transform(data.Gender))  # F= 0 & M= 1

In [None]:
data.Gender.value_counts()

In [None]:
data.info()

In [None]:
data.head()

# EDA

* Bad Rate Analysis == 0 : Good & 1 : Bad
* Bivariate Analysis 

In [None]:
data.target.value_counts()

In [None]:
(333/10000)*100  # 3.3% customer are bad

In [None]:
sns.countplot(x='target',data=data,palette='hls')

In [None]:
data.groupby('target').mean()

* Customer with more balance have higher chance to become bad customer

# Visualizations

In [None]:
data.income.hist()
plt.title('Income of customer')
plt.xlabel('Income')
plt.ylabel('Frequency')
#plt.savefig('Income_age')

In [None]:
data.balance.hist()
plt.title('Balance of Customer')
plt.xlabel('Balance')
plt.ylabel('Frequency')
#plt.savefig('Balance_age')

# Identify & Treatment of Outlier 

In [None]:
sns.boxplot(y='income',data=data)  # Single Variable

In [None]:
sns.boxplot(y='balance',data=data)  # Single Variable

In [None]:
q1 = data['balance'].quantile(0.25)
q3 = data['balance'].quantile(0.75)
iqr = q3-q1 #Interquartile range
upper_limit = q3+1.5*iqr #acceptable range
upper_limit

In [None]:
data['balance']=np.where(data['balance'] > upper_limit,upper_limit,data['balance']) 
# upper limit

In [None]:
sns.boxplot(y='balance',data=data)  # Single Variable

# Step 3 : Splitting the data in Training and Test set
* Using sklearn we split 70% of our data into training set and rest in test set.
* Setting random_state will give the same training and test set everytime on running the code

In [None]:
data_final=data

In [None]:
data_final.head()

In [None]:
# define our input variable (X) & output variable
X = data_final.drop('target', axis = 1)
Y = data_final[['target']]
# Split X and y into X_
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.30, random_state=1)

# Step 4 : Performing Logistic Regression

In [None]:
train=pd.concat([X_train,y_train],axis=1)
train.head()

# Hypothesis

* H0 :- There is no log linear relationship between Target and Gender , Income & Balance

Vs 

* H1 :- There is log linear relationship between Target and Gender , Income & Balance

Alpha = 0.05

In [None]:
# Needed to run the logistic regression
import statsmodels.formula.api as smf
result=smf.logit("target~C(Gender)+balance+income",data=train).fit()
result.summary2()

In [None]:
model=smf.logit("target~C(Gender)+balance",data=train).fit()  .
model.summary2()

* We Reject Ho 

### Conclusion :- 

There is log linear relationship between Target and Gender & Balance


# Odds Ratio

In [None]:
model.params   # coefficient 

In [None]:
Model_Odds=pd.DataFrame(np.exp(model.params),columns=["Odds_Ratio"])
Model_Odds

* Male have 2.13 times higher odds of not paying Credit Card Bill as compared to Female  

# Step 6 : Predictions on Train Dataset

In [None]:
train.head()

In [None]:
train['Probability']=model.predict(train)
train.head()

* 2.6% chances the customer will not pay the credit card bill

In [None]:
train['Predicted']=np.where(train['Probability'] >= 0.7,1,0)  #thershold  >0.7
train.sort_values(by='target',ascending=False).head()

# Step 7 : Model Performance Metrics

In [None]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(train['Predicted'],train['target'])
print(matrix)

In [None]:
Accuracy_Train=((6745+49)/(7000)*100) (#tp+tn/total)
print(Accuracy_Train) 

* Accuracy of Model is 97% , So we say Good Model 

In [None]:
from sklearn.metrics import classification_report
print(classification_report(train['target'],train['Predicted']))

* Accuracy of Bad Customer Capture by Model is 20% ( Sensitivity )
* Accuracy of Good Customer Capture by Model is 100% ( specificity) 
* Accuracy of Predicted Bad Customer And often Correct is 79% 
* Accuracy of Predicted Good Customer And often Correct is 97% 


# Step 8 : Predictions on Test Dataset

In [None]:
test=pd.concat([X_test,y_test],axis=1)
test.head()

In [None]:
test['Probability']=model.predict(test)
test.head()

In [None]:
test['Predicted']=np.where(test['Probability'] >= 0.7,1,0) 
test.head()

# Step 9 : Model Performance Metrics on Test data 

In [None]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(test['Predicted'],test['target'])
print(matrix)

In [None]:
Accuracy_test=((2904+21)/(3000)*100)  ##(tp+tn/total)
Accuracy_test

* Accuracy of Model Performance on Test data is 97% , So we say Good Model Performance 

In [None]:
from sklearn.metrics import classification_report
print(classification_report(test['target'],test['Predicted']))

* Accuracy of Bad Customer Capture by Model is 23% 
* Accuracy of Good Customer Capture by Model is 100% 
* Accuracy of Predicted Bad Customer And often Correct is 81% 
* Accuracy of Predicted Good Customer And often Correct is 98% 


# Finish 

## Manual Prediction

In [None]:
# sort the pandas dataframe by descending value of single column
train.sort_values(by='Predicted',ascending=False,inplace=True)

train.head()

#### Model Coefficients

In [30]:
model.params

NameError: name 'model' is not defined

# Manual by Function

In [None]:
import math
def Manual_Prediction(gender,balance):
    z=-11.910534+0.756655*gender+0.006046*balance  
    a=math.exp(-z)
    y=round(1/(1+a),4)
    return y

#### For Male Customer

In [None]:
Manual_Prediction(1,2005.575128)

* 72.5% chance that customer who is male and have balance = 2005 ,he will not pay the credit card bill

#### For Female Customer

In [None]:
train.head(5)

In [None]:
Manual_Prediction(0,2193.174309)

* 79% chance that customer is female and have balance = 2193 Indicate will not pay the credit card bill

# Another Method Logistics 

In [None]:
X_train_1=X_train[['Gender', 'balance']]

In [None]:
X_test_1=X_test[['Gender', 'balance']]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
model_2=logreg.fit(X_train_1, y_train)

# Step 10 :- ROCR & KS PLOT

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, model_2.predict(X_test_1))
fpr, tpr, thresholds= roc_curve(y_test, model_2.predict_proba(X_test_1)
                                 [:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)'
         % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
#plt.savefig('Log_ROC')
plt.show()

### Need to install first
pip install scikit-plot

import scikitplot as skplt
y_probas = model_2.predict_proba(X_test_1)
skplt.metrics.plot_ks_statistic(y_test, y_probas)
plt.show()

#### For Male Customer

z=-11.602065+0.704533*1+0.005836*1823.751426
z

import math
a=math.exp(-z)
a

b=1+a
c=1/b
round(c,4)

#### For Female Customer

# sort the pandas dataframe by descending value of single column
train.sort_values(by='Gender',inplace=True)

train.head()

z=-11.910534+0.756655*0+0.006046*1409.989102
z

import math
a=math.exp(-z)
a

b=1+a
c=1/b
round(c,4)

# Deployment 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
output=logreg.fit(X_train, y_train)

import os
os.chdir(r"C:\Users\User\Desktop\Python Code\End_to_End Project\Credit_Card_logistics")
import pickle
#pickle.dump(output, open('build.pkl','wb'))
model=pickle.load(open('build.pkl','rb'))

In [None]:
model.predict([[1,2063.571934,37372.75849]])

In [None]:
model.predict([[0,2193.174309,25706.64777]])

In [31]:
train['out']=model.predict(X_train)
train

NameError: name 'model' is not defined