In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
import numpy as  np

In [2]:
data = pd.read_csv('creditcard.csv')

In [3]:
# Only use the 'Amount' and 'V1', ..., 'V28' features
features = ['Amount'] + ['V%d' % number for number in range(1, 29)]

# The target variable which we would like to predict, is the 'Class' variable
target = 'Class'

# Now create an X variable (containing the features) and an y variable (containing only the target variable)
X = data[features]
y = data[target]

print(len(X))
print(len(y))

284807
284807


In [4]:
def normalize(X):
    """
    Make the distribution of the values of each variable similar by subtracting the mean and by dividing by the standard deviation.
    """
    for feature in X.columns:
        X[feature] -= X[feature].mean()
        X[feature] /= X[feature].std()
    return X

In [5]:
# Define the model
model = LogisticRegression()

# Define the splitter for splitting the data in a train set and a test set
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)

# Loop through the splits (only one)
for train_indices, test_indices in splitter.split(X, y):
    # Select the train and test data
    X_train, y_train = X.loc[train_indices], y.loc[train_indices]
    X_test, y_test = X.loc[test_indices], y.loc[test_indices]
    
    # Normalize the data
    X_train = normalize(X_train)
    X_test = normalize(X_test)
    
    # Fit and predict!
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # And finally: the results
    print(classification_report(y_test, y_pred))

    
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))
print(len(y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    142158
          1       0.88      0.61      0.72       246

avg / total       1.00      1.00      1.00    142404

142403
142404
142403
142404
142404


In [6]:
from sklearn.metrics import classification_report
from  sklearn.metrics import precision_recall_fscore_support


def pandas_classification_report(y_true, y_pred):
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred)

    avg = list(precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index)

    support = class_report_df.loc['support']
    total = support.sum() 
    avg[-1] = total

    class_report_df['avg / total'] = avg

    return class_report_df.T

In [7]:
print(classification_report(y_true=y_test, y_pred=y_pred, digits=6))

             precision    recall  f1-score   support

          0   0.999332  0.999859  0.999596    142158
          1   0.883041  0.613821  0.724221       246

avg / total   0.999131  0.999192  0.999120    142404



In [8]:
model.score(X_test, y_test)

0.99919243841465122

In [9]:
model.predict_proba(X_test)
#in each array there are two eelements which signify the
#probability that this point belongs to zero or one
#sum of these two is one

array([[  9.99764246e-01,   2.35753742e-04],
       [  9.99510346e-01,   4.89654158e-04],
       [  9.99519204e-01,   4.80795830e-04],
       ..., 
       [  9.99310342e-01,   6.89657872e-04],
       [  9.99941065e-01,   5.89345898e-05],
       [  9.99808546e-01,   1.91453799e-04]])

In [10]:
y_pred - y_test

248027    0
159764    0
259935    0
111386    0
119485    0
273783    0
262261    0
115267    0
203426    0
145513    0
7758      0
259806    0
11613     0
258880    0
71795     0
217295    0
128730    0
260806    0
10986     0
38906     0
147971    0
116505    0
186818    0
241951    0
114156    0
262986    0
121317    0
12199     0
187910    0
113478    0
         ..
200843    0
78852     0
50673     0
220206    0
249360    0
169538    0
149552    0
115672    0
222605    0
223698    0
133073    0
42538     0
57784     0
155363    0
57356     0
188860    0
237444    0
256105    0
269098    0
154100    0
28646     0
266499    0
160565    0
270504    0
64398     0
30600     0
260037    0
46285     0
257086    0
29329     0
Name: Class, Length: 142404, dtype: int64

In [11]:
from sklearn.metrics import confusion_matrix

In [12]:
y_train_pred = model.predict(X_train)

In [13]:
y_test_pred = model.predict(X_test)

In [14]:
confusion_matrix(y_train, y_train_pred)

array([[142137,     20],
       [    87,    159]], dtype=int64)

In [15]:
confusion_matrix(y_test, y_test_pred)

array([[142138,     20],
       [    95,    151]], dtype=int64)

In [16]:
df_class_report = pandas_classification_report(y_true=y_test, y_pred=y_pred)
print(df_class_report)

             precision    recall  f1-score   support
0             0.999332  0.999859  0.999596  142158.0
1             0.883041  0.613821  0.724221     246.0
avg / total   0.999131  0.999192  0.999120  142404.0


In [17]:
df_class_report.to_csv('my_csv_file.csv',  sep=',')

In [18]:
len(y_pred)

142404

In [24]:
df = pd.DataFrame({'Class':y_pred, 'Feature' : X_test['V1'] })
df.head()

Unnamed: 0,Class,Feature
248027,0,1.057872
159764,0,-0.9507
259935,0,0.927323
111386,0,-0.395844
119485,0,0.577045


In [20]:
df.to_csv("Predict1.csv")

In [21]:
np.savetxt("ans1.csv", y_pred , fmt="%1.5f", delimiter=",")