In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.model_selection import cross_val_score 

In [None]:
# Load the `fair` dataset bundled with statsmodels as a pandas DataFrame
# and display it as the cell output.
dta = sm.datasets.fair.load_pandas().data
dta

In [None]:
# Derive the binary target column `affair`:
# 1 where the `affairs` value is above zero, 0 otherwise.
dta['affair'] = dta['affairs'].gt(0).astype(int)

In [None]:
# Summary statistics for every column of dta (including the derived `affair` column
# when the cells are run in order).
dta.describe()

In [None]:
# Build the response (y) and design matrix (X) from a patsy formula.
# C(...) marks the two occupation columns as categorical so they are dummy-encoded.
y, X = dmatrices(
    'affair ~ rate_marriage + age + yrs_married + children +  religious + educ +C(occupation) + C(occupation_husb)',
    data=dta,
    return_type="dataframe",
)


In [None]:
# Replace the patsy-generated dummy column names with short, readable ones.
X = X.rename(
    {
        'C(occupation)[T.2.0]': 'occ_2',
        'C(occupation)[T.3.0]': 'occ_3',
        'C(occupation)[T.4.0]': 'occ_4',
        'C(occupation)[T.5.0]': 'occ_5',
        'C(occupation)[T.6.0]': 'occ_6',
        'C(occupation_husb)[T.2.0]': 'occ_husb_2',
        'C(occupation_husb)[T.3.0]': 'occ_husb_3',
        'C(occupation_husb)[T.4.0]': 'occ_husb_4',
        'C(occupation_husb)[T.5.0]': 'occ_husb_5',
        'C(occupation_husb)[T.6.0]': 'occ_husb_6',
    },
    axis="columns",
)
# Flatten the response into a 1-D array.
y = np.asarray(y).ravel()

In [None]:
# Display the design matrix to check the renamed columns.
X

In [None]:
# Let's see how the data is distributed for every column.
import seaborn as sns
sns.set()
plt.figure(figsize=(20,25), facecolor='white')

# Size the grid from the actual number of columns instead of hard-coding 9:
# dta gains an extra `affair` column earlier in the notebook, and a fixed
# 3x3 grid silently dropped whichever column came tenth.
n_plot_cols = 3
n_plot_rows = -(-len(dta.columns) // n_plot_cols)  # ceiling division

for plotnumber, column in enumerate(dta.columns, start=1):
    ax = plt.subplot(n_plot_rows, n_plot_cols, plotnumber)
    # histplot(kde=True, stat="density") is the documented replacement for the
    # deprecated sns.distplot (histogram normalised to a density plus a KDE curve).
    sns.histplot(dta[column], kde=True, stat="density", ax=ax)
    plt.xlabel(column, fontsize=20)
plt.show()

In [None]:
# One box per column of dta, drawn on a single wide axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=dta, ax=ax, width=0.5, fliersize=3)

In [None]:
# Variance inflation factor for every column of the design matrix.
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Standardise X locally: the notebook-level `X_scaled` is only created in a
# later cell, so referencing it here fails on a fresh Restart & Run All.
# A separate name is used so that later variable is not shadowed.
vif_inputs = StandardScaler().fit_transform(X)

vif = pd.DataFrame()
vif["vif"] = [variance_inflation_factor(vif_inputs, i) for i in range(vif_inputs.shape[1])]
vif["Features"] = X.columns

#let's check the values
vif

In [None]:
import seaborn as sns

# Pairwise correlations between the columns of dta, rounded to two decimals
# so the annotations stay readable.
correlation_matrix = dta.corr().round(decimals=2)
sns.heatmap(correlation_matrix, annot=True)

In [None]:
# Class balance of the target.
plt.figure(figsize=(8,6))
sns.set_style("darkgrid")
# Pass the labels explicitly as `x`: recent seaborn releases treat the first
# positional argument as `data`, so a bare array is no longer counted per class.
sns.countplot(x=np.ravel(y))
plt.yticks(range(0,5000,250))
plt.show()

In [None]:
# Balance the classes by randomly duplicating minority-class rows.
from imblearn.over_sampling  import RandomOverSampler
rdm=RandomOverSampler(random_state=42)
# NOTE(review): X and y are overwritten here, and the train/test split happens
# in a later cell on the resampled data. Duplicated rows can therefore land in
# both the training and the test set, which makes the test metrics optimistic.
# Consider splitting first and resampling only the training portion.
X, y = rdm.fit_resample(X, y)


In [None]:
# Class balance of the target after oversampling.
plt.figure(figsize=(8,6))
sns.set_style("darkgrid")
# Pass the labels explicitly as `x`: recent seaborn releases treat the first
# positional argument as `data`, so a bare array is no longer counted per class.
sns.countplot(x=np.ravel(y))
plt.yticks(range(0,5000,250))
plt.show()

In [None]:
# Standardise every feature column; the fitted scaler is kept in `scalar`
# because later cells pickle it and reuse it for new inputs.
from sklearn.preprocessing import StandardScaler 
scalar = StandardScaler()
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in the next cell, so test rows contribute to the scaling statistics.
X_scaled = scalar.fit_transform(X)

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y, random_state=1, test_size=0.2
)

In [None]:
# Fit a logistic regression with default settings (fit returns the estimator itself).
clf = LogisticRegression().fit(x_train, y_train)

# Hard class predictions for both splits, used by the metric cells below.
y_predicted = clf.predict(x_test)
y_train_predicted = clf.predict(x_train)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_curve,
    classification_report,
)

# Compare accuracy on the training and held-out rows.
print("train set accuracy score: ", accuracy_score(y_true=y_train, y_pred=y_train_predicted))
print("test set accuracy score :", accuracy_score(y_true=y_test, y_pred=y_predicted))

In [None]:
import pickle

# Persist the fitted classifier so it can be loaded for prediction later.
with open('logRegModelForPrediction.sav', mode='wb') as f:
    pickle.dump(clf, f)

# Persist the fitted scaler alongside it; new inputs must be scaled the same way.
with open('sandardScalar.sav', mode='wb') as f:
    pickle.dump(scalar, f)

In [None]:
# Confusion matrix of the test-set predictions, shown as the cell output.
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_predicted)
conf_mat

In [None]:
# scikit-learn's confusion matrix has true labels on the rows and predicted
# labels on the columns, with labels in sorted order (0 first, then 1).
# For the positive class 1 the flattened layout is therefore
# [[TN, FP], [FN, TP]]; the previous indexing had TP and TN swapped.
true_negative, false_positive, false_negative, true_positive = conf_mat.ravel()

In [None]:
# Share of all test rows that were classified correctly.
Accuracy = (true_positive + true_negative) / (
    true_positive + true_negative + false_positive + false_negative
)
Accuracy

In [None]:
# Precision: true positives over everything counted as a positive prediction.
Precision = true_positive / (false_positive + true_positive)
Precision

In [None]:
# Recall: true positives over true positives plus false negatives.
Recall = true_positive / (false_negative + true_positive)
Recall

In [None]:
# F1: harmonic mean of precision and recall.
F1_Score = 2 * (Precision * Recall) / (Precision + Recall)
F1_Score

In [None]:
# Area Under Curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
# Score with the predicted probability of class 1 rather than the hard 0/1
# predictions: with hard labels the ROC collapses to a single operating point
# and the reported AUC does not reflect how well the model ranks the rows.
auc = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
auc

In [None]:
# Build the ROC curve from the predicted probability of class 1 so that every
# decision threshold contributes a point; hard 0/1 predictions give only a
# single intermediate point between (0, 0) and (1, 1).
fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(x_test)[:, 1])

In [None]:
# The AUC belongs to the model's curve, not to the diagonal reference line;
# the legend labels were previously attached the wrong way round.
plt.plot(fpr, tpr, color='orange', label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

In [None]:
# Scale a single new observation with the statistics learned from the training
# data. `transform` must be used here: `fit_transform` would refit the scaler on
# this one row, turning every feature into 0 and overwriting the fitted scaler,
# so the model would return the same prediction for any input.
# NOTE(review): the 17 values are assumed to follow the column order of X
# (intercept, occupation dummies, then the numeric features) - confirm.
x1_scaled = scalar.transform(np.array([[0,0,0,0,0,1,1,0,0,0,0,4,32,5,2,3,17]]))
print("your prediction is  :",clf.predict(x1_scaled))