In [1]:
import pandas as pd

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score
)

In [2]:
df = pd.read_csv("../data/churn_modelling.csv")
df = df.drop(["RowNumber", "CustomerId", "Surname"], axis=1)
df = df.replace(
    {"Geography" : {"Germany" : 0,
                    "France" : 1,
                    "Spain" : 2 },
     "Gender" : {"Female" : 0,
                 "Male" : 1}
    }
)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,1,0,42,2,0.0,1,1,1,101348.88,1
1,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,502,1,0,42,8,159660.8,3,1,0,113931.57,1
3,699,1,0,39,1,0.0,2,0,0,93826.63,0
4,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [3]:
X = df.drop(["Exited"], axis=1)
y = df["Exited"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

X_test.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2610,682,1,1,38,4,107192.38,1,1,1,15669.17
5244,672,2,1,43,5,0.0,2,1,1,64515.5
1113,731,0,0,21,8,132312.06,1,1,0,106663.46
637,682,1,1,48,1,138778.15,1,0,1,168840.23
1830,506,1,1,37,5,0.0,2,1,1,127543.81


In [4]:
## Support Vector Classifier

In [5]:
svc_clf = SVC(kernel="sigmoid")
svc_clf.fit(X_train,y_train)
y_pred_clf= svc_clf.predict(X_test)

In [6]:
print(confusion_matrix(y_test, y_pred_clf))
print("F1="+str(f1_score(y_test, y_pred_clf)))
print("Precision="+str(precision_score(y_test, y_pred_clf)))
print("Recall="+str(recall_score(y_test, y_pred_clf)))

[[1882  490]
 [ 487  141]]
F1=0.22398729150119143
Precision=0.22345483359746435
Recall=0.22452229299363058


In [7]:
## Decision Tree Classifier

In [8]:
dtc = DecisionTreeClassifier(max_depth=2)
dtc.fit(X_train,y_train)
y_pred_dtc = dtc.predict(X_test)

In [9]:
print(confusion_matrix(y_test, y_pred_dtc))
print("F1="+str(f1_score(y_test, y_pred_dtc)))
print("Precision="+str(precision_score(y_test, y_pred_dtc)))
print("Recall="+str(recall_score(y_test, y_pred_dtc)))

[[2195  177]
 [ 342  286]]
F1=0.5242896425297892
Precision=0.6177105831533477
Recall=0.4554140127388535


In [10]:
## KNeighbors Classifier

In [11]:
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train,y_train)
y_pred_neigh = neigh.predict(X_test)

In [12]:
print(confusion_matrix(y_test, y_pred_neigh))
print("F1="+str(f1_score(y_test, y_pred_neigh)))
print("Precision="+str(precision_score(y_test, y_pred_neigh)))
print("Recall="+str(recall_score(y_test, y_pred_neigh)))

[[2239  133]
 [ 579   49]]
F1=0.12098765432098765
Precision=0.2692307692307692
Recall=0.07802547770700637


In [13]:
## Logistic Regression

In [14]:
lgc = LogisticRegression()
lgc.fit(X_train,y_train)
y_pred_lgc = lgc.predict(X_test)

In [15]:
print(confusion_matrix(y_test, y_pred_lgc))
print("F1="+str(f1_score(y_test, y_pred_lgc)))
print("Precision="+str(precision_score(y_test, y_pred_lgc)))
print("Recall="+str(recall_score(y_test, y_pred_lgc)))

[[2323   49]
 [ 592   36]]
F1=0.10098176718092566
Precision=0.4235294117647059
Recall=0.05732484076433121


In [16]:
voting_clf = VotingClassifier(estimators=[("svc", svc_clf), ("dtc", dtc), ("lgc", lgc)])
voting_clf.fit(X_train,y_train)
y_pred_voting = voting_clf.predict(X_test)

In [17]:
print(confusion_matrix(y_test, y_pred_voting))
print("F1="+str(f1_score(y_test, y_pred_voting)))
print("Precision="+str(precision_score(y_test, y_pred_voting)))
print("Recall="+str(recall_score(y_test, y_pred_voting)))

[[2317   55]
 [ 541   87]]
F1=0.22597402597402597
Precision=0.6126760563380281
Recall=0.13853503184713375


In [18]:
## Random Forest Classifier

In [62]:
rfc = RandomForestClassifier(random_state=10)
rfc.fit(X_train,y_train)
y_pred_rfc = rfc.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["y_pred"] = y_pred_rfc


In [58]:
print(confusion_matrix(y_test, y_pred_rfc))
print("F1="+str(f1_score(y_test, y_pred_rfc)))
print("Precision="+str(precision_score(y_test, y_pred_rfc)))
print("Recall="+str(recall_score(y_test, y_pred_rfc)))

[[2270  102]
 [ 327  301]]
F1=0.5838991270611058
Precision=0.7468982630272953
Recall=0.47929936305732485


In [21]:
## Multilayer Perceptron

In [22]:
sc=StandardScaler()
scaler = sc.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
mlp_tuning = MLPClassifier()
parameter_space = { 
    "hidden_layer_sizes": [(150,100,50), (120,80,40), (100,50,30)],
    "max_iter": [100, 150, 200, 250, 300],
    "activation": ["tanh", "relu"], 
    "solver": ["sgd", "adam"], 
    "alpha": [0.0001, 0.05], 
    "learning_rate": ["constant", "invscaling", "adaptive"], 
}
#clf = GridSearchCV (mlp_tuning, parameter_space, n_jobs=-1, cv=5) 
#clf.fit(X_train_scaled,y_train)
#print(clf.best_params_)

In [24]:
mlp = MLPClassifier(max_iter=300, activation="tanh", alpha=0.05, hidden_layer_sizes=(20,), learning_rate="invscaling", solver="adam")
mlp.fit(X_train,y_train)
y_pred_mlp = mlp.predict(X_test_scaled)

In [25]:
print(confusion_matrix(y_test, y_pred_mlp))
print("F1="+str(f1_score(y_test, y_pred_mlp)))
print("Precision="+str(precision_score(y_test, y_pred_mlp)))
print("Recall="+str(recall_score(y_test, y_pred_mlp)))

[[2372    0]
 [ 628    0]]
F1=0.0
Precision=0.0
Recall=0.0


  _warn_prf(average, modifier, msg_start, len(result))
