In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import *
from ast import literal_eval
from tensorflow.random import set_seed
import os
import pickle




In [2]:
RANDOM_SEED = 0
np.random.seed(seed=RANDOM_SEED)
set_seed(RANDOM_SEED)

In [3]:
df = pd.read_pickle("Fraud Detection with Natural Language Processing.pkl")
print("dataset shape: ", df.shape)
action_vocab = pd.read_csv("vocab.csv")


dataset shape:  (105303, 9)


Checking Datasets

In [4]:
df.head(3).T

Unnamed: 0,0,1,2
actions,"[[2], [23], [6], [7], [14], [25], [28], [6], [...","[[22], [27], [24], [1], [1268], [1269], [1267]...","[[22], [24], [27], [1], [1268], [1269], [1267]..."
times,"[0, 47000.0, 640000.0, 6000.0, 54000.0, 10000....","[0, 33204.0, 215636.0, 443415.0, 72586.0, 3424...","[0, 25459.0, 46236.0, 428626.0, 42785.0, 74158..."
execution_time,203,203,359
Amount,15,13,310
device_freq,1.0,1.0,1.0
ip_freq,1.0,1.0,0.333333
beneficiary_freq,1.0,0.5,0.333333
application_freq,1.0,1.0,1.0
is_fraud,0,0,0


In [5]:
action_vocab.head(5)

Unnamed: 0,Name
0,/ACCOUNTS/ACCOUNTS_FULL_EXCHANGE
1,/PROFILE/USERPROFILE
2,/P2PREGISTRATION/ASKMEMBER
3,/CARDS/FETCHCARDS
4,/ACCOUNTS/ACCOUNTS_FULL


In [6]:
# one line is broken, throw it away
broken_times = df[df.times.apply(lambda x: x[-1]!="]")]
assert broken_times.shape[0] == 1
assert broken_times.iloc[0].is_fraud==0


In [7]:
# ignore the single broken line
df = df[df.times.apply(lambda x: x[-1]=="]")]

In [8]:
#build the raw text, using the names and the (index-inverted) tokens 
action_names = action_vocab.Name.to_list()
id_to_action = {str(i):a for i,a in enumerate(action_names)}
action_to_id = {a:str(i) for i,a in enumerate(action_names)}

In [9]:
# Recall to cast the strings into lists
df.actions = df.actions.apply(literal_eval)

In [10]:
df["times"] = df.times.apply(literal_eval).apply(lambda x: [i/1000 for i in x])
df["Action time mean"] = df.times.apply(np.mean)
df["Action time std"] = df.times.apply(np.std)
df["log(amount)"] = df.Amount.apply(np.log)
df["Transaction Type"] = df.is_fraud.apply(lambda x: "Fraud" if x else "Non Fraud")
df["time_to_first_action"] = df.times.apply(lambda x: x[1] if len(x)>1 else 0)
df["actions_str"] = df.actions.apply(lambda x: " ".join([id_to_action[str(i[0])] for i in x if len(i)>0]))
df["total_time_to_transaction"] = df.times.apply(sum)/1000
df['actions'] = df['actions'].apply(lambda x: [item for sublist in x for item in sublist])
df.head(2)

Unnamed: 0,actions,times,execution_time,Amount,device_freq,ip_freq,beneficiary_freq,application_freq,is_fraud,Action time mean,Action time std,log(amount),Transaction Type,time_to_first_action,actions_str,total_time_to_transaction
1,"[22, 27, 24, 1, 1268, 1269, 1267, 22, 29, 1, 2...","[0.0, 33.204, 215.636, 443.415, 72.586, 34.241...",203,13,1.0,1.0,0.5,1.0,0,177.859292,218.36858,2.564949,Non Fraud,33.204,/PROFILE/GETCUSTOMERRESPONSE /TAXFREE/GETTAXGO...,4.268623
2,"[22, 24, 27, 1, 1268, 1269, 1267, 2, 23, 6, 25...","[0.0, 25.459, 46.236, 428.626, 42.785, 74.158,...",359,310,1.0,0.333333,0.333333,1.0,0,196.875569,252.496316,5.736572,Non Fraud,25.459,/PROFILE/GETCUSTOMERRESPONSE /CAMPAIGN/GETBALA...,587.279823


In [11]:
df.isnull().sum()

actions                      0
times                        0
execution_time               0
Amount                       0
device_freq                  0
ip_freq                      0
beneficiary_freq             0
application_freq             0
is_fraud                     0
Action time mean             0
Action time std              0
log(amount)                  0
Transaction Type             0
time_to_first_action         0
actions_str                  0
total_time_to_transaction    0
dtype: int64

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re


In [13]:
def preprocess_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Convert text to lowercase
        text = text.lower()
        
        # Remove special characters, punctuation, and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Tokenization
        tokens = word_tokenize(text)
        
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        
        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        
        # Join tokens back into text
        processed_text = ' '.join(tokens)
        
        return processed_text
    else:
        return ''  # Return empty string for non-string values

# Apply preprocessing to the 'actions_str' column
df['preprocessed_actions'] = df['actions_str'].apply(preprocess_text)


In [14]:
# Display the processed DataFrame
print(df[['actions_str', 'preprocessed_actions']].head())

                                         actions_str  \
1  /PROFILE/GETCUSTOMERRESPONSE /TAXFREE/GETTAXGO...   
2  /PROFILE/GETCUSTOMERRESPONSE /CAMPAIGN/GETBALA...   
3  /PROFILE/GETCUSTOMERRESPONSE /PROFILE/USERPROF...   
4  /PROFILE/GETCUSTOMERRESPONSE /CAMPAIGN/GETBALA...   
5  /PROFILE/GETCUSTOMERRESPONSE /CAMPAIGN/GETBALA...   

                                preprocessed_actions  
1  profilegetcustomerresponse taxfreegettaxgoal c...  
2  profilegetcustomerresponse campaigngetbalance ...  
3  profilegetcustomerresponse profileuserprofile ...  
4  profilegetcustomerresponse campaigngetbalance ...  
5  profilegetcustomerresponse campaigngetbalance ...  


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
tfidf_matrix = tfidf_vectorizer.fit_transform(df['preprocessed_actions'])


In [16]:

# Convert TF-IDF matrix to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


In [17]:
# Concatenate TF-IDF features with the original DataFrame
df_with_features = pd.concat([df, tfidf_df], axis=1)
# Display the DataFrame with features
df_with_features.head()


Unnamed: 0,actions,times,execution_time,Amount,device_freq,ip_freq,beneficiary_freq,application_freq,is_fraud,Action time mean,...,wealthgetexistingportfolio,wealthgetfeed,wealthgetglobalindexfeed,wealthgetinstrument,wealthgetmifid,wealthgetonlinebookingtransaction,wealthgetorder,wealthgetportfolioslist,wealthgetposition,wealthpartyid
1,"[22, 27, 24, 1, 1268, 1269, 1267, 22, 29, 1, 2...","[0.0, 33.204, 215.636, 443.415, 72.586, 34.241...",203.0,13.0,1.0,1.0,0.5,1.0,0.0,177.859292,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003871,0.011988,0.214356
2,"[22, 24, 27, 1, 1268, 1269, 1267, 2, 23, 6, 25...","[0.0, 25.459, 46.236, 428.626, 42.785, 74.158,...",359.0,310.0,1.0,0.333333,0.333333,1.0,0.0,196.875569,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020461,0.0,0.13329
3,"[22, 1, 29, 22, 26, 2, 23, 25, 6, 28, 14, 7, 6...","[0.0, 440.927, 5.785, 46.875, 968.65, 311.757,...",250.0,350.0,1.0,0.5,0.5,1.0,0.0,231.663108,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088614
4,"[22, 24, 27, 72, 1269, 1268, 1267, 4, 70, 46, ...","[0.0, 93.894, 46.81, 548.388, 132.548, 37.844,...",203.0,350.0,1.0,0.6,0.4,1.0,0.0,201.258838,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096876
5,"[22, 24, 27, 23, 2, 1269, 1269, 6, 25, 7, 28, ...","[0.0, 921.997, 47.386, 963.97, 9.522, 443.596,...",593.0,2000.0,1.0,0.666667,0.166667,1.0,1.0,196.272,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.212969


In [18]:
from gensim.models import Word2Vec
import multiprocessing

# Tokenize the preprocessed text data
tokenized_text = df['preprocessed_actions'].apply(word_tokenize)

# Train Word2Vec model
cores = multiprocessing.cpu_count()  # Number of CPU cores
word2vec_model = Word2Vec(sentences=tokenized_text, 
                          vector_size=100,      # Dimensionality of the word vectors
                          window=5,             # Context window size
                          min_count=1,          # Minimum frequency of words to consider
                          workers=cores)        # Number of CPU cores to use for training



In [19]:
# Save or load the trained Word2Vec model if needed
# word2vec_model.save("word2vec_model.bin")
# word2vec_model = Word2Vec.load("word2vec_model.bin")

# Extract word embeddings for each action
word_embeddings = []
for tokens in tokenized_text:
    embeddings = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    if embeddings:
        word_embeddings.append(sum(embeddings) / len(embeddings))  # Average of word embeddings for each action
    else:
        word_embeddings.append([0] * word2vec_model.vector_size)   # Use zero vector for actions with no embeddings



In [20]:
# Convert word embeddings to DataFrame
embedding_df = pd.DataFrame(word_embeddings, columns=[f"embedding_{i+1}" for i in range(word2vec_model.vector_size)])

# Concatenate word embeddings with the original DataFrame
df_with_embeddings = pd.concat([df, embedding_df], axis=1)

# Displaying the DataFrame with embeddings
df_with_embeddings.head()


Unnamed: 0,actions,times,execution_time,Amount,device_freq,ip_freq,beneficiary_freq,application_freq,is_fraud,Action time mean,...,embedding_91,embedding_92,embedding_93,embedding_94,embedding_95,embedding_96,embedding_97,embedding_98,embedding_99,embedding_100
1,"[22, 27, 24, 1, 1268, 1269, 1267, 22, 29, 1, 2...","[0.0, 33.204, 215.636, 443.415, 72.586, 34.241...",203.0,13.0,1.0,1.0,0.5,1.0,0.0,177.859292,...,0.39037,-0.259035,-0.818634,0.526734,-0.675468,0.911069,-0.475308,0.655829,0.984734,-1.828189
2,"[22, 24, 27, 1, 1268, 1269, 1267, 2, 23, 6, 25...","[0.0, 25.459, 46.236, 428.626, 42.785, 74.158,...",359.0,310.0,1.0,0.333333,0.333333,1.0,0.0,196.875569,...,0.370563,-0.170274,-0.850609,0.435764,-0.746376,0.892807,-0.16066,0.271043,1.041365,-1.642899
3,"[22, 1, 29, 22, 26, 2, 23, 25, 6, 28, 14, 7, 6...","[0.0, 440.927, 5.785, 46.875, 968.65, 311.757,...",250.0,350.0,1.0,0.5,0.5,1.0,0.0,231.663108,...,0.282979,-0.551494,-0.871284,0.165223,-0.645264,0.52067,0.301725,0.106478,1.12097,-1.542511
4,"[22, 24, 27, 72, 1269, 1268, 1267, 4, 70, 46, ...","[0.0, 93.894, 46.81, 548.388, 132.548, 37.844,...",203.0,350.0,1.0,0.6,0.4,1.0,0.0,201.258838,...,0.201138,-0.865989,-0.758778,-0.272655,-0.81392,0.295349,0.388804,0.648794,0.542747,-1.152348
5,"[22, 24, 27, 23, 2, 1269, 1269, 6, 25, 7, 28, ...","[0.0, 921.997, 47.386, 963.97, 9.522, 443.596,...",593.0,2000.0,1.0,0.666667,0.166667,1.0,1.0,196.272,...,0.398779,-0.263677,-0.834379,0.552381,-0.701737,0.939087,-0.504878,0.672113,0.956458,-1.847037


In [21]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from sklearn.model_selection import train_test_split

# Define maximum sequence length (you can adjust this based on your data)
max_sequence_length = 100

# Convert action sequences to padded sequences
padded_sequences = pad_sequences(df['actions'], maxlen=max_sequence_length)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['is_fraud'], test_size=0.2, random_state=42)


In [22]:
# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(action_vocab), output_dim=100, input_length=max_sequence_length))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))





In [23]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])






In [None]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/10


  68/1317 [>.............................] - ETA: 2:49 - loss: 0.1007 - accuracy: 0.9903

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Find the maximum sequence length in the dataset
#max_sequence_length = max(len(seq) for seq in df['actions'])

# Print the problematic sample at position 304 and its shape
#sample_304 = df['actions'].iloc[304]
#print("Sample at position 304:", sample_304)
#print("Shape of sample:", np.array(sample_304).shape)


In [None]:
# Find the maximum sequence length in the dataset
max_sequence_length = max(len(seq) for seq in df['actions'])

# Identify sequences causing the issue
problematic_sequences = [seq for seq in df['actions'] if len(seq) != max_sequence_length]

# Print the problematic sequences and their lengths
for i, seq in enumerate(problematic_sequences):
    print(f"Sequence {i+1}: Length={len(seq)}, Elements={seq}")

# Print the shape of the first problematic sequence if it exists
if problematic_sequences:
    try:
        print("Shape of the first problematic sequence:", np.array(problematic_sequences[0]).shape)
    except Exception as e:
        print("Error occurred while getting the shape of the first problematic sequence:", str(e))
else:
    print("No problematic sequences found.")


In [None]:

# Print the shape of the first problematic sequence
print("Shape of the first problematic sequence:", np.array(problematic_sequences[0]).shape)

# Print the shape of the first problematic sequence if it exists
if problematic_sequences:
    try:
        print("Shape of the first problematic sequence:", np.array(problematic_sequences[0]).shape)
    except Exception as e:
        print("Error occurred while getting the shape of the first problematic sequence:", str(e))
else:
    print("No problematic sequences found.")


In [None]:
from keras.layers import Input, Embedding, LSTM

# Define input sequence shape
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')

# Define embedding layer
embedding_layer = Embedding(input_dim=len(action_vocab), output_dim=100, input_length=max_sequence_length)(sequence_input)

# Define LSTM layer
lstm_layer = LSTM(100, return_sequences=True)(embedding_layer)


In [None]:
from keras.layers import Activation, Dense, Dot

# Attention mechanism (Bahdanau attention)
attention_weights = Dense(1, activation='tanh')(lstm_layer)
attention_weights = Activation('softmax')(attention_weights)
attention_output = Dot(axes=1)([attention_weights, lstm_layer])


In [None]:
from keras.layers import Concatenate, Reshape
from keras.layers import ZeroPadding1D

# Pad attention_output to match the shape of lstm_layer along the second dimension
attention_output_padded = ZeroPadding1D(padding=(0, 3084))(attention_output_reshaped)

# Concatenate LSTM output and attention output
final_output = Concatenate(axis=-1)([attention_output_padded, lstm_layer])


In [None]:
# Dense layer for classification
output = Dense(1, activation='sigmoid')(final_output)


In [None]:
from keras.models import Model

# Define model
model = Model(inputs=sequence_input, outputs=output)


In [None]:
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

# Define the model architecture
def create_model(input_shape):
    inputs = tf.keras.Input(shape=input_shape)
    # Define the rest of your model layers
    # Example:
    # x = tf.keras.layers.Dense(64, activation='relu')(inputs)
    # outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    # model = tf.keras.Model(inputs, outputs)
    # return model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

# Create the model
model = create_model((3085,))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Assuming X_train, y_train, X_test, y_test are defined
# Pad input data to match expected shape
X_train_padded = pad_sequences(X_train, maxlen=3085)
X_test_padded = pad_sequences(X_test, maxlen=3085)

# Train the model
model.fit(X_train_padded, y_train, epochs=10, batch_size=64, validation_data=(X_test_padded, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Assuming 'df' is your DataFrame containing the dataset
#num_features = df.shape[1]

#print("Number of features:", num_features)

In [None]:
# Assuming df is your DataFrame

# Select numerical columns
numerical_columns = df.select_dtypes(include=['number']).columns.tolist()

print("Numerical Columns:", numerical_columns)


In [None]:
# Check for infinite values
if np.any(np.isinf(df.select_dtypes(include=[np.number]).values)):
    print("DataFrame contains infinite values.")

# Check for large values
max_val = np.max(df.select_dtypes(include=[np.number]).values)
if max_val > 1e10:  # Adjust the threshold as needed
    print("DataFrame contains large values.")

# Check for missing values
if df.isnull().values.any():
    print("DataFrame contains missing values.")

# Handle missing values (if any)
df.dropna(inplace=True)

# Proceed with preprocessing and anomaly detection
# (Use the code you provided after ensuring data integrity)


In [None]:
# Select only numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Check for infinite values in each numeric column
for column in numeric_columns:
    if np.any(np.isinf(df[column])):
        print(f"Column '{column}' contains infinite values.")


In [None]:
# Check unique values in the 'log(amount)' column
unique_values = df['log(amount)'].unique()
print(unique_values)


In [None]:
# Replace infinite values with NaN
df['log(amount)'].replace([np.inf, -np.inf], np.nan, inplace=True)


In [None]:
# Check if there are any infinite values after handling
if np.any(np.isinf(df['log(amount)'])):
    print("Column 'log(amount)' still contains infinite values after handling.")
else:
    print("All infinite values in column 'log(amount)' have been handled.")


ANOMALY DETECTION

In [None]:
import numpy as np

# Assuming 'df' is your DataFrame containing the data
# Extract the 'log(amount)' column from the DataFrame
log_amount_column = df['log(amount)']

# Convert the column to a NumPy array
your_array = np.array(log_amount_column)

# Check the data type of elements in your NumPy array
print("Data type of array elements:", your_array.dtype)

# Print the elements of your array to inspect them
print("Array elements:", your_array)


# Example: Reshape sequences into a uniform shape
reshaped_array = np.array([np.reshape(seq, (-1,)) for seq in your_array])

# Example: Print specific elements causing the error
for i, element in enumerate(reshaped_array):
    if isinstance(element, list):
        print(f"Element {i}: {element}")


In [None]:
Ensemble Methods and Evaluation and Iteration