In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the training file. Column 0 is skipped, the last column becomes the
# dependent variable (y) and every column in between is an independent
# variable (X).
dataset = pd.read_csv('Credit_train.csv')
X = dataset.iloc[:, 1:-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [3]:
# Preview the raw feature matrix (NumPy abbreviates large arrays with '...').
print(X)

[[13 40 1 ... 9 ' White' ' Male']
 [13 13 0 ... 9 ' White' ' Male']
 [9 40 1 ... 11 ' White' ' Male']
 ...
 [9 40 4 ... 11 ' White' ' Female']
 [9 20 3 ... 11 ' White' ' Male']
 [9 40 5 ... 11 ' White' ' Female']]


In [4]:
# Preview the target vector.
print(y)

[0 0 0 ... 0 0 1]


In [5]:
from sklearn.impute import SimpleImputer

# Replace missing values (NaN) in the first nine feature columns with the
# mean of each column. The fitted imputer is kept in `imputer` so the same
# column means can be applied to other data later.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 0:9] = imputer.fit_transform(X[:, 0:9])

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode feature column 9 and pass every other column through
# unchanged. In the result the encoded columns come first, followed by the
# passthrough columns.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [9])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [7]:
# Inspect row 4 after one-hot encoding; the last column still holds a string.
print(X[4])

[0.0 0.0 1.0 0.0 0.0 13.0 40.0 5.0 10.0 0.0 0.0 2.0 4.0 9.0 ' Female']


In [8]:
from sklearn.preprocessing import LabelEncoder

# Replace the string values in the last feature column with integer codes.
# The fitted encoder is kept in `le` so the same mapping can be reused.
le = LabelEncoder()
le.fit(X[:, -1])
X[:, -1] = le.transform(X[:, -1])

In [9]:
# Inspect row 4 again: the last column is now an integer code.
print(X[4])

[0.0 0.0 1.0 0.0 0.0 13.0 40.0 5.0 10.0 0.0 0.0 2.0 4.0 9.0 0]


In [10]:
# Preview the fully encoded feature matrix.
print(X)

[[0.0 0.0 0.0 ... 7.0 9.0 1]
 [0.0 0.0 0.0 ... 6.0 9.0 1]
 [0.0 0.0 0.0 ... 4.0 11.0 1]
 ...
 [0.0 0.0 0.0 ... 4.0 11.0 0]
 [0.0 0.0 0.0 ... 4.0 11.0 1]
 [0.0 0.0 0.0 ... 5.0 11.0 0]]


In [11]:
from imblearn.over_sampling import SMOTE     #dealing with unbalanced dataset
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [12]:
from sklearn.model_selection import train_test_split             #spliting of training data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardize columns 5-13 (the columns that follow the one-hot block; the
# final column is left untouched). The scaler is fitted on the training split
# only and then applied to both splits.
sc = StandardScaler()
sc.fit(X_train[:, 5:14])
X_train[:, 5:14] = sc.transform(X_train[:, 5:14])
X_test[:, 5:14] = sc.transform(X_test[:, 5:14])

In [14]:
# Inspect one training row after scaling.
print(X_train[1])

[ 0.          0.          0.          0.          1.         -0.61895719
 -0.18202478  2.36225025 -1.39221003 -0.1987216  -0.25931309 -0.32880938
 -1.88300294  0.14738277  0.        ]


In [15]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree classifier: entropy split criterion, fixed seed.
classifier_dt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
classifier_dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [16]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, fixed seed.
classifier_svm = SVC(
    kernel='rbf',
    random_state=0,
)
classifier_svm.fit(X_train, y_train)

SVC(random_state=0)

In [17]:
from xgboost import XGBClassifier

# XGBoost classifier with library-default hyperparameters (the fitted
# estimator's repr is displayed as the cell output).
classifier = XGBClassifier()
classifier.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
# Decision tree predictions on the test split, printed side by side with the
# true labels: left column = predicted, right column = actual.
y_pred_dt = classifier_dt.predict(X_test)
print(np.concatenate((y_pred_dt.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[1 1]
 [1 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [1 1]]


In [19]:
# XGBoost predictions on the test split, printed side by side with the true
# labels: left column = predicted, right column = actual.
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[1 1]
 [1 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [1 1]]


In [20]:
# SVM predictions on the test split, printed side by side with the true
# labels: left column = predicted, right column = actual.
y_pred_svm = classifier_svm.predict(X_test)
print(np.concatenate((y_pred_svm.reshape(-1, 1), y_test.reshape(-1, 1)), axis=1))

[[1 1]
 [1 0]
 [0 0]
 ...
 [1 0]
 [0 0]
 [1 1]]


In [21]:
from sklearn.metrics import f1_score
# Micro-averaged F1 of the decision tree predictions against the test labels.
f1_score(y_test, y_pred_dt, average='micro')

0.8574029126213593

In [22]:
# Micro-averaged F1 of the XGBoost predictions against the test labels.
# NOTE(review): for single-label predictions micro-averaged F1 coincides with
# plain accuracy, so this reports the same number as accuracy_score would.
f1_score(y_test, y_pred, average='micro')

0.8781014023732471

In [23]:
# Micro-averaged F1 of the SVM predictions against the test labels.
f1_score(y_test, y_pred_svm, average='micro')

0.8189050701186623

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix (rows = true labels, columns = predicted labels) and
# overall accuracy for the XGBoost predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[6354 1100]
 [ 708 6670]]


0.8781014023732471

In [25]:
# Display the XGBoost predictions for the test split.
y_pred

array([1, 1, 0, ..., 0, 0, 1], dtype=int64)

In [26]:
# Load the file to be scored. Column 0 is skipped, as for the training file,
# and every remaining column is used as a feature.
dataset1 = pd.read_csv('credit_test.csv')
X_t = dataset1.iloc[:, 1:].to_numpy()

In [27]:
# Preview the raw features of the file to be scored.
print(X_t)

[[7 40 3 ... 1 ' Black' ' Male']
 [12 40 0 ... 7 ' White' ' Male']
 [10 40 0 ... 15 ' Black' ' Male']
 ...
 [9 40 2 ... 11 ' Black' ' Male']
 [13 50 0 ... 9 ' White' ' Male']
 [13 40 3 ... 9 ' Asian-Pac-Islander' ' Male']]


In [28]:
# Fill missing values in the first nine columns with the imputer that was
# fitted on the training features (transform only, no refit).
X_t[:, 0:9] = imputer.transform(X_t[:, 0:9])

In [29]:
# Preview the features after imputation.
print(X_t)

[[7.0 40.0 3.0 ... 1.0 ' Black' ' Male']
 [12.0 40.0 0.0 ... 7.0 ' White' ' Male']
 [10.0 40.0 0.0 ... 15.0 ' Black' ' Male']
 ...
 [9.0 40.0 2.0 ... 11.0 ' Black' ' Male']
 [13.0 50.0 0.0 ... 9.0 ' White' ' Male']
 [13.0 40.0 3.0 ... 9.0 ' Asian-Pac-Islander' ' Male']]


In [30]:
# Encode with the transformer already fitted on the training features.
# Refitting here (fit_transform) would rebuild the one-hot categories from
# this file, so the number and order of encoded columns could differ from the
# layout the models were trained on.
X_t = np.array(ct.transform(X_t))

In [31]:
# Display the features after one-hot encoding (the last column is still a string).
X_t

array([[0.0, 0.0, 1.0, ..., 4.0, 1.0, ' Male'],
       [0.0, 0.0, 0.0, ..., 2.0, 7.0, ' Male'],
       [0.0, 0.0, 1.0, ..., 4.0, 15.0, ' Male'],
       ...,
       [0.0, 0.0, 1.0, ..., 0.0, 11.0, ' Male'],
       [0.0, 0.0, 0.0, ..., 4.0, 9.0, ' Male'],
       [0.0, 1.0, 0.0, ..., 4.0, 9.0, ' Male']], dtype=object)

In [32]:
# Reuse the label mapping learned from the training data. Refitting the
# encoder on this file could assign different integer codes to the same
# category than the models saw during training.
X_t[:,-1] = le.transform(X_t[:,-1])

In [33]:
# Inspect row 4 after label-encoding the last column.
print(X_t[4])

[0.0 0.0 0.0 0.0 1.0 6.0 30.0 1.0 8.0 0.0 0.0 4.0 4.0 0.0 1]


In [34]:
# Scale columns 5-13 with the scaler fitted on the training split
# (transform only, so the training mean/variance are reused).
X_t[:, 5:14] = sc.transform(X_t[:, 5:14])

In [35]:
# Inspect row 4 again after scaling.
print(X_t[4])

[0.0 0.0 0.0 0.0 1.0 -1.7734231139832677 -1.0133613084094968
 -0.1132927747377066 0.2903527143993289 -0.1987216030260907
 -0.2593130879205311 1.1919549106384708 0.05665521287949842
 -2.9768862937887857 1]


In [36]:
# Predict labels for the file to be scored with the XGBoost model.
y_pred1 = classifier.predict(X_t)

In [37]:
# Number of predictions produced.
len(y_pred1)

13305

In [38]:
# Write the predictions to a plain-text file, each value formatted with '%s'.
np.savetxt('credit_result2.dat', y_pred1, fmt='%s')