In [122]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
df = pd.read_csv("divorce.csv", sep = ';')
X,Y=df.iloc[0:,0:54],df.iloc[0:,-1]
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=42)

In [123]:
#Using ANOVA for feature selection
bestatt=SelectKBest(score_func=f_classif,k=10)
afit=bestatt.fit(X_train,y_train)
scrs=pd.DataFrame(afit.scores_)
cols=pd.DataFrame(X.columns)
chart=pd.concat([cols,scrs],axis=1)
chart.columns=['attribute','score']
print(chart.nlargest(10,'score'))

   attribute        score
39     Atr40  1012.701450
37     Atr38   724.894770
18     Atr19   674.636222
16     Atr17   669.497861
17     Atr18   559.220516
8       Atr9   549.381773
19     Atr20   538.206177
14     Atr15   533.096916
35     Atr36   509.430982
10     Atr11   508.259151


In [124]:
X_train=bestatt.fit_transform(X_train,y_train)
X_test=bestatt.fit_transform(X_test,y_test)

In [125]:
X_train.shape

(119, 10)

In [126]:
X_test.shape

(51, 10)

In [127]:
models={} 
# Accuracy, F1-Score, Precision, Recall
# 1. Implementing SVM
from sklearn.svm import SVC
kernels=['rbf','linear','poly']
models['svm']={}
for kernel in kernels:
    t=[]
    svmclf=SVC(kernel=kernel,gamma=2)
    svmclf.fit(X_train,y_train)
    print("Kernel-"+kernel)
    pred=svmclf.predict(X_test)
    print("Accuracy:",end='')
    print(accuracy_score(pred,y_test))
    t.append(accuracy_score(pred,y_test))
    t.append(f1_score(pred,y_test))
    t.append(precision_score(pred,y_test))
    t.append(recall_score(pred,y_test))    
    models['svm'][kernel]=t
    print()


Kernel-rbf
Accuracy:0.8431372549019608

Kernel-linear
Accuracy:1.0

Kernel-poly
Accuracy:1.0



In [128]:
# 2. Naive Bayes algorithm

In [129]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
nbs=[GaussianNB(),MultinomialNB(),BernoulliNB()]
models['nb']={}
for nb in nbs:
    t=[]
    nbclf=nb
    nbclf.fit(X_train,y_train)
    pred=nbclf.predict(X_test)
    print(nb)
    print("Accuracy:",end='')
    print(accuracy_score(pred,y_test))
    t.append(accuracy_score(pred,y_test))
    t.append(f1_score(pred,y_test))
    t.append(precision_score(pred,y_test))
    t.append(recall_score(pred,y_test))    
    models['nb'][nb]=t
    print()  
print(models)

GaussianNB()
Accuracy:1.0

MultinomialNB()
Accuracy:0.9411764705882353

BernoulliNB()
Accuracy:1.0

{'svm': {'rbf': [0.8431372549019608, 0.8666666666666666, 1.0, 0.7647058823529411], 'linear': [1.0, 1.0, 1.0, 1.0], 'poly': [1.0, 1.0, 1.0, 1.0]}, 'nb': {GaussianNB(): [1.0, 1.0, 1.0, 1.0], MultinomialNB(): [0.9411764705882353, 0.9433962264150944, 0.9615384615384616, 0.9259259259259259], BernoulliNB(): [1.0, 1.0, 1.0, 1.0]}}


In [130]:
# 3. Random Forest

In [131]:
from sklearn.ensemble import RandomForestClassifier
rfclf=RandomForestClassifier()
rfclf.fit(X_train,y_train)
pred=rfclf.predict(X_test)
t=[]
models['rf']={}
print(rfclf)
print("Accuracy:",end='')
print(accuracy_score(pred,y_test))
t.append(accuracy_score(pred,y_test))
t.append(f1_score(pred,y_test))
t.append(precision_score(pred,y_test))
t.append(recall_score(pred,y_test))    
models['rf'][kernel]=t


RandomForestClassifier()
Accuracy:1.0


In [132]:
# 4. AdaBoost

In [133]:
from sklearn.ensemble import AdaBoostClassifier
abclf=AdaBoostClassifier()
abclf.fit(X_train,y_train)
pred=abclf.predict(X_test)
t=[]
models['ab']={}
print(abclf)
print("Accuracy:",end='')
print(accuracy_score(pred,y_test))
t.append(accuracy_score(pred,y_test))
t.append(f1_score(pred,y_test))
t.append(precision_score(pred,y_test))
t.append(recall_score(pred,y_test))    
models['ab'][kernel]=t


AdaBoostClassifier()
Accuracy:1.0


In [134]:
print("Format: Accuracy Score, F1 Score, Precision Score and Recall\n")
for k,v in models.items():
    print(k)
    print(v)
    print()

Format: Accuracy Score, F1 Score, Precision Score and Recall

svm
{'rbf': [0.8431372549019608, 0.8666666666666666, 1.0, 0.7647058823529411], 'linear': [1.0, 1.0, 1.0, 1.0], 'poly': [1.0, 1.0, 1.0, 1.0]}

nb
{GaussianNB(): [1.0, 1.0, 1.0, 1.0], MultinomialNB(): [0.9411764705882353, 0.9433962264150944, 0.9615384615384616, 0.9259259259259259], BernoulliNB(): [1.0, 1.0, 1.0, 1.0]}

rf
{'poly': [1.0, 1.0, 1.0, 1.0]}

ab
{'poly': [1.0, 1.0, 1.0, 1.0]}



In [135]:
import pickle
with open('rfclf', 'wb') as files:
    pickle.dump(rfclf, files)

In [146]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
def train_model(train, test, fold_no,mx):
    X_train = train.iloc[0:,0:54]
    y_train = train.iloc[0:,-1]
    X_test = test.iloc[0:,0:54]
    y_test = test.iloc[0:,-1]
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)
    macc=accuracy_score(y_test,predictions)
    print('Fold',str(fold_no),'Accuracy:',macc)
    if(macc>=mx):
        mx=macc
        with open('cross_rf', 'wb') as files:
            pickle.dump(rfclf, files)
        

In [147]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)
fold_no = 1
mx=0
for train_index, test_index in skf.split(X,Y):
    train = df.loc[train_index,:]
    test = df.loc[test_index,:]
    #print('Fold',str(fold_no),'Class Ratio:',sum(test['Returned_Units'])/len(test['Returned_Units']))
    train_model(train,test,fold_no,mx)
    fold_no += 1

Fold 1 Accuracy: 0.8823529411764706
Fold 2 Accuracy: 1.0
Fold 3 Accuracy: 1.0
Fold 4 Accuracy: 1.0
Fold 5 Accuracy: 1.0
