In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import make_scorer
from xgboost import XGBClassifier
import featuretools as ft
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

In [3]:
dataframe_org = pd.read_csv('train.csv', delimiter='|')

# uncomment the following to exclude records w/ trustlevel >=3 (done by Haritha).
# Jonas: I think this is not the right way to simply exlcude them. The right way would be to tell a model it should
# always predict no fraud when trustLevel >= 3. But I actually have no idea how to do this.

#dataframe_org.drop(dataframe_org.loc[dataframe_org['trustLevel']>=3].index, inplace=True)

dataframe = dataframe_org.copy()


########### manual feature generation ##########

# totalScanned:
dataframe['totalScanned'] = dataframe['scannedLineItemsPerSecond'] * dataframe['totalScanTimeInSeconds']
# avgValuePerScan:
dataframe['avgTimePerScan'] = 1/ dataframe['scannedLineItemsPerSecond']
dataframe['avgValuePerScan'] = dataframe['avgTimePerScan'] * dataframe['valuePerSecond']
# manual feature generation - "totalScanned" ratios
# withoutRegisPerPosition
dataframe['withoutRegisPerPosition'] = dataframe['scansWithoutRegistration'] / dataframe['totalScanned']
# ratio of scansWithoutRegis in totalScan
# equivalent to lineItemVoidsPerPosition
# Might indicate how new or ambivalent a customer is. Expected to be higher for low "trustLevel"
# quantiModPerPosition
dataframe['quantiModPerPosition'] = dataframe['quantityModifications'] / dataframe['totalScanned']
# ratio of quanityMods in totalScan
# manual feature generation - "grandTotal" ratios
# lineItemVoidsPerTotal
dataframe['lineItemVoidsPerTotal'] = dataframe['lineItemVoids'] / dataframe['grandTotal']
# withoutRegisPerTotal
dataframe['withoutRegisPerTotal'] = dataframe['scansWithoutRegistration'] / dataframe['grandTotal']
# quantiModPerTotal
dataframe['quantiModPerTotal'] = dataframe['quantityModifications'] / dataframe['grandTotal']
# manual feature generation - "totalScanTimeInSeconds" ratios
# lineItemVoidsPerTime
dataframe['lineItemVoidsPerTime'] = dataframe['lineItemVoids'] / dataframe['totalScanTimeInSeconds']
# withoutRegisPerTime
dataframe['withoutRegisPerTime'] = dataframe['scansWithoutRegistration'] / dataframe['totalScanTimeInSeconds']
# quantiModPerTime
dataframe['quantiModPerTime'] = dataframe['quantityModifications'] / dataframe['totalScanTimeInSeconds']

########### end manual feature generation ###########

In [4]:
X_base=dataframe.drop('fraud',axis=1)
y_base=dataframe['fraud']

In [5]:
# from sklearn.preprocessing import StandardScaler
# sc=StandardScaler()
# X_base=sc.fit_transform(X_base)

In [6]:
dataframe

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud,...,avgTimePerScan,avgValuePerScan,withoutRegisPerPosition,quantiModPerPosition,lineItemVoidsPerTotal,withoutRegisPerTotal,quantiModPerTotal,lineItemVoidsPerTime,withoutRegisPerTime,quantiModPerTime
0,5,1054,54.70,7,0,3,0.027514,0.051898,0.241379,0,...,36.344828,1.886207,0.000000,0.103448,0.127971,0.000000,0.054845,0.006641,0.000000,0.002846
1,3,108,27.36,5,2,4,0.129630,0.253333,0.357143,0,...,7.714286,1.954286,0.142857,0.285714,0.182749,0.073099,0.146199,0.046296,0.018519,0.037037
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0,...,116.615385,4.781538,0.769231,0.384615,0.048263,0.160875,0.080438,0.001979,0.006596,0.003298
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0,...,61.758621,3.183103,0.137931,0.137931,0.086665,0.043332,0.043332,0.004467,0.002233,0.002233
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0,...,15.925926,3.019630,0.259259,0.074074,0.036796,0.085858,0.024531,0.006977,0.016279,0.004651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874,1,321,76.03,8,7,2,0.071651,0.236854,0.347826,0,...,13.956522,3.305652,0.304348,0.086957,0.105222,0.092069,0.026305,0.024922,0.021807,0.006231
1875,1,397,41.89,5,5,0,0.065491,0.105516,0.192308,1,...,15.269231,1.611154,0.192308,0.000000,0.119360,0.119360,0.000000,0.012594,0.012594,0.000000
1876,4,316,41.83,5,8,1,0.094937,0.132373,0.166667,0,...,10.533333,1.394333,0.266667,0.033333,0.119531,0.191250,0.023906,0.015823,0.025316,0.003165
1877,2,685,62.68,1,6,2,0.035036,0.091504,0.041667,0,...,28.541667,2.611667,0.250000,0.083333,0.015954,0.095724,0.031908,0.001460,0.008759,0.002920


In [7]:
dataframe_test = pd.read_csv('test.csv', delimiter='|')
dataframe_test['totalScanned'] = dataframe_test['scannedLineItemsPerSecond'] * dataframe_test['totalScanTimeInSeconds']
# avgValuePerScan:
dataframe_test['avgTimePerScan'] = 1/ dataframe_test['scannedLineItemsPerSecond']
dataframe_test['avgValuePerScan'] = dataframe_test['avgTimePerScan'] * dataframe_test['valuePerSecond']
# manual feature generation - "totalScanned" ratios
# withoutRegisPerPosition
dataframe_test['withoutRegisPerPosition'] = dataframe_test['scansWithoutRegistration'] / dataframe_test['totalScanned']
# ratio of scansWithoutRegis in totalScan
# equivalent to lineItemVoidsPerPosition
# Might indicate how new or ambivalent a customer is. Expected to be higher for low "trustLevel"
# quantiModPerPosition
dataframe_test['quantiModPerPosition'] = dataframe_test['quantityModifications'] / dataframe_test['totalScanned']
# ratio of quanityMods in totalScan
# manual feature generation - "grandTotal" ratios
# lineItemVoidsPerTotal
dataframe_test['lineItemVoidsPerTotal'] = dataframe_test['lineItemVoids'] / dataframe_test['grandTotal']
# withoutRegisPerTotal
dataframe_test['withoutRegisPerTotal'] = dataframe_test['scansWithoutRegistration'] / dataframe_test['grandTotal']
# quantiModPerTotal
dataframe_test['quantiModPerTotal'] = dataframe_test['quantityModifications'] / dataframe_test['grandTotal']
# manual feature generation - "totalScanTimeInSeconds" ratios
# lineItemVoidsPerTime
dataframe_test['lineItemVoidsPerTime'] = dataframe_test['lineItemVoids'] / dataframe_test['totalScanTimeInSeconds']
# withoutRegisPerTime
dataframe_test['withoutRegisPerTime'] = dataframe_test['scansWithoutRegistration'] / dataframe_test['totalScanTimeInSeconds']
# quantiModPerTime
dataframe_test['quantiModPerTime'] = dataframe_test['quantityModifications'] / dataframe_test['totalScanTimeInSeconds']


In [8]:
df_sol=pd.read_csv('sol_dmc.csv')

In [9]:
from sklearn.metrics import confusion_matrix

In [10]:
dataframe_test=dataframe_test.replace([np.nan,np.inf,-np.inf],0)

In [11]:
logistic=LogisticRegression(penalty='none',max_iter=175)
modelm=logistic.fit(X=X_base,y=y_base)
predictions1=modelm.predict(dataframe_test)
confusion_matrix(df_sol['fraud'],predictions1)

array([[470630,   3764],
       [  3657,  20070]], dtype=int64)

In [12]:
xgb=XGBClassifier()
model=xgb.fit(X=X_base,y=y_base)
predictions=model.predict(dataframe_test)
confusion_matrix(df_sol['fraud'],predictions)



array([[471600,   2794],
       [  4640,  19087]], dtype=int64)

In [13]:
ada=AdaBoostClassifier(learning_rate=0.95,n_estimators=100)
model18=ada.fit(X=X_base,y=y_base)
predictions18=model18.predict(dataframe_test)
confusion_matrix(df_sol['fraud'],predictions18)

array([[471734,   2660],
       [  2684,  21043]], dtype=int64)

In [14]:
dataframe_test.isna().sum()

trustLevel                   0
totalScanTimeInSeconds       0
grandTotal                   0
lineItemVoids                0
scansWithoutRegistration     0
quantityModifications        0
scannedLineItemsPerSecond    0
valuePerSecond               0
lineItemVoidsPerPosition     0
totalScanned                 0
avgTimePerScan               0
avgValuePerScan              0
withoutRegisPerPosition      0
quantiModPerPosition         0
lineItemVoidsPerTotal        0
withoutRegisPerTotal         0
quantiModPerTotal            0
lineItemVoidsPerTime         0
withoutRegisPerTime          0
quantiModPerTime             0
dtype: int64

In [15]:
dataframe_test.columns

Index(['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition', 'totalScanned', 'avgTimePerScan',
       'avgValuePerScan', 'withoutRegisPerPosition', 'quantiModPerPosition',
       'lineItemVoidsPerTotal', 'withoutRegisPerTotal', 'quantiModPerTotal',
       'lineItemVoidsPerTime', 'withoutRegisPerTime', 'quantiModPerTime'],
      dtype='object')

In [16]:
np.isfinite(dataframe_test).sum()

trustLevel                   498121
totalScanTimeInSeconds       498121
grandTotal                   498121
lineItemVoids                498121
scansWithoutRegistration     498121
quantityModifications        498121
scannedLineItemsPerSecond    498121
valuePerSecond               498121
lineItemVoidsPerPosition     498121
totalScanned                 498121
avgTimePerScan               498121
avgValuePerScan              498121
withoutRegisPerPosition      498121
quantiModPerPosition         498121
lineItemVoidsPerTotal        498121
withoutRegisPerTotal         498121
quantiModPerTotal            498121
lineItemVoidsPerTime         498121
withoutRegisPerTime          498121
quantiModPerTime             498121
dtype: int64

In [17]:
random=RandomForestClassifier(n_estimators= 75,
 min_samples_split= 2,
 min_samples_leaf= 1,
 max_features= 'auto')
clf=random.fit(X_base,y_base)
predict=clf.predict(dataframe_test)
confusion_matrix(df_sol['fraud'],predict)

array([[472255,   2139],
       [  9162,  14565]], dtype=int64)

In [18]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_base1=sc.fit_transform(X_base)
dataframe_test1=sc.fit_transform(dataframe_test)

In [19]:
sv=SVC(C=100, gamma=0.001, kernel='rbf')
svc=sv.fit(X_base1,y_base)
pred=svc.predict(dataframe_test1)
confusion_matrix(df_sol['fraud'],pred)

array([[471312,   3082],
       [  2116,  21611]], dtype=int64)

In [20]:
sv1=SVC(C=800, gamma=0.0009, kernel='rbf')
svc1=sv1.fit(X_base1,y_base)
pred11=svc1.predict(dataframe_test1)
confusion_matrix(df_sol['fraud'],pred11)

array([[470649,   3745],
       [  1406,  22321]], dtype=int64)

In [21]:
df_voter=pd.DataFrame()
df_voter['svc']=pred11

In [22]:
from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
 
# fitting the model for grid search
grid.fit(X_base, y_base)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.944 total time=   0.2s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.947 total time=   0.2s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.944 total time=   0.2s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.944 total time=   0.2s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.947 total time=   0.2s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.944 total time=   0.2s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.944 total time=   0.2s
[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.947 total time=   0.2s
[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.944 total time=   0.2s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.944 total time=   0.3s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.947 total time=   0.3s
[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.939 total time=   0.2s
[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.941 total time=   0.2s
[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.939 total time=   0.2s
[CV 4/5] END ....C=1000, gam

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [23]:
grid.best_params_

{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}

In [24]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,10,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(X_base,y_base)
gsearch1.best_params_, gsearch1.best_score_



({'max_depth': 5, 'min_child_weight': 3}, 0.9985352112676056)

In [25]:
param_test1 = {
 'learning_rate':[i/100.0 for i in range(1,100)]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(X_base,y_base)
gsearch1.best_params_, gsearch1.best_score_



({'learning_rate': 0.16}, 0.9986934942991281)

In [26]:
param_test1 = {
 'n_estimators':[i for i in range(100,500,50)]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.16, n_estimators=250, max_depth=5,reg_alpha=1e-05,
 min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(X_base,y_base)
gsearch1.best_params_, gsearch1.best_score_



({'n_estimators': 250}, 0.9987256874580819)

In [27]:
param_test1 = {
 'scale_pos_weight':[i for i in range(1,25)]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.16, n_estimators=250, max_depth=5,reg_alpha=1e-05,
 min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(X_base,y_base)
gsearch1.best_params_, gsearch1.best_score_



({'scale_pos_weight': 1}, 0.9987256874580819)

In [28]:
#trying XGB after hyper-parameter tuning
xgbnew=XGBClassifier(learning_rate =0.16, n_estimators=250, max_depth=5,reg_alpha=1e-05,
 min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=5, scale_pos_weight=20, seed=27)
modelnew=xgbnew.fit(X=X_base,y=y_base)
predictionsnew=modelnew.predict(dataframe_test)
confusion_matrix(df_sol['fraud'],predictionsnew)



array([[469575,   4819],
       [  2561,  21166]], dtype=int64)

In [29]:
df_voter['logistic']=predictions1
df_voter['ada']=predictions18
df_voter['xgb']=predictionsnew

In [30]:
df_voter.shape

(498121, 4)

In [31]:
len(dataframe_test)

498121

In [32]:
arr=[]
naya_list=[]
imp_list=[]
for row in range(len(dataframe_test)):
    for column in range(4):
        arr.append(df_voter.loc[row][column])
    imp_list.append(1.2*arr[0]+arr[1]+1.1*arr[2]+1.1*arr[3])    
    if imp_list[0]>2.4:
        naya_list.append(1)
    else:
        naya_list.append(0)
    imp_list=[]
    arr=[]  

In [33]:
naya_list,dataframe_test

([0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [34]:
confusion_matrix(df_sol['fraud'],pd.Series(naya_list))

array([[471853,   2541],
       [  2851,  20876]], dtype=int64)

In [35]:
from sklearn.metrics import classification_report

In [36]:
print(classification_report(df_sol['fraud'],predictionsnew))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    474394
           1       0.81      0.89      0.85     23727

    accuracy                           0.99    498121
   macro avg       0.90      0.94      0.92    498121
weighted avg       0.99      0.99      0.99    498121



In [37]:
cv = StratifiedKFold(n_splits=10)

features = []
features.append(('pca', PCA(n_components=8)))
features.append(('select_best', SelectKBest(k=4)))
feature_union = FeatureUnion(features)
# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))
estimators.append(('xgb', XGBClassifier(learning_rate =0.16, n_estimators=250, max_depth=5,reg_alpha=1e-05,
 min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=5, scale_pos_weight=20, seed=27)))

xgb_af = Pipeline(estimators)

print('XGB w/ PCA & featuretools & SelectKBest on data w/ additional manual features generated: {}'.format(np.mean(cross_validate(xgb_af, X_base, y=y_base, cv=cv)['test_score'])))

XGB w/ PCA & featuretools & SelectKBest on data w/ additional manual features generated: 0.9920212765957446


In [38]:
xgc_af=xgb_af.fit(X_base,y_base)
pf=xgc_af.predict(dataframe_test)



In [39]:
dataframe_test.shape

(498121, 20)

In [40]:
confusion_matrix(df_sol['fraud'],pf)

array([[469610,   4784],
       [  2666,  21061]], dtype=int64)

In [41]:
from imblearn.over_sampling import SMOTE
def upsample_SMOTE(X, y, ratio=0.08):
    sm = SMOTE(random_state=23, sampling_strategy=ratio)
    X_train_sm, y_train_sm = sm.fit_resample(X, y)
    print(len(X_train_sm), len(y_train_sm))
    return X_train_sm, y_train_sm

In [42]:
X_train_sm,y_train_sm=upsample_SMOTE(X_base, y_base, ratio=0.25)

2218 2218


In [43]:
X_train_sm=pd.DataFrame(X_train_sm)
y_train_sm=pd.DataFrame(y_train_sm)

In [44]:
cols=X_train_sm.columns

In [45]:
!pip install pca
from pca import pca
model = pca(n_components=15)
# Fit transform
out = model.fit_transform(X_train_sm)

# Print the top features. The results show that f1 is best, followed by f2 etc
feture=list(out['topfeat']['feature'][:15])
feture=list(set(feture))

[pca] >Processing dataframe..
[pca] >The PCA reduction is performed on the [20] columns of the input dataframe.
[pca] >Fitting using PCA..
[pca] >Computing loadings and PCs..
[pca] >Computing explained variance..
[pca] >Outlier detection using Hotelling T2 test with alpha=[0.05] and n_components=[15]
[pca] >Outlier detection using SPE/DmodX with n_std=[2]


In [46]:
print(my_model.explained_variance_ratio_.cumsum())

NameError: name 'my_model' is not defined

In [None]:
df_new=X_train_sm[feture]

In [None]:
df_new.head()

In [None]:
df2=dataframe_test[feture]
xgbnew1=XGBClassifier(learning_rate =0.16, n_estimators=250, max_depth=5,reg_alpha=1e-05,
 min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=5, scale_pos_weight=20, seed=27)
modelnew1=xgbnew1.fit(X=X_train_sm,y=y_train_sm)
predictionsnew1=modelnew1.predict(dataframe_test)
confusion_matrix(df_sol['fraud'],predictionsnew1)

In [None]:
clf8 = MLPClassifier(random_state=1, max_iter=100,hidden_layer_sizes=(90,), activation='relu',solver='adam').fit(X_base, y_base)
cf=clf8.predict(dataframe_test)
confusion_matrix(df_sol['fraud'],cf)

In [None]:
df_voter.columns

In [None]:
#finally lets see the best model
print(confusion_matrix(df_sol['fraud'],df_voter['svc']))
print('-----------------------------------------------------------------------------------------------------')
print(confusion_matrix(df_sol['fraud'],df_voter['logistic']))
print('-----------------------------------------------------------------------------------------------------')
print(confusion_matrix(df_sol['fraud'],df_voter['ada']))
print('-----------------------------------------------------------------------------------------------------')
print(confusion_matrix(df_sol['fraud'],df_voter['xgb']))
print('-----------------------------------------------------------------------------------------------------')

In [None]:
import pickle
with open('model_pkl', 'wb') as files:
    pickle.dump(svc1, files)