In [None]:
import os
import subprocess
import sys

# Install the project's dependencies into the interpreter that is running this
# notebook. A bare `pip3` found on PATH may belong to a different Python
# installation than the active kernel, so pip is invoked as a module of
# `sys.executable` instead.
print("Installing requirements.")
# check=False keeps the original best-effort behaviour: a failed install is
# visible in pip's own output but does not raise and stop the notebook.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
    check=False,
)

In [None]:
import pandas as pd

def load_data(source="https://media.githubusercontent.com/media/SHREYASNAIR129/Fraud-Detection/master/Final_Transactions.csv"):
    """Read the transactions CSV into a DataFrame.

    Parameters
    ----------
    source : str or path-like, optional
        Location of the CSV file; it is handed unchanged to
        ``pandas.read_csv``, so a URL or a local path both work. Defaults to
        the hosted ``Final_Transactions.csv`` used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    print("Loading data.")
    return pd.read_csv(source)

In [None]:
import pandas as pd

def data_Analysis():
    """Print an exploratory summary of the raw transactions data.

    Loads the dataset through ``load_data()`` and prints, in order: the first
    five rows, the ``info()`` report, descriptive statistics, per-column null
    counts, the number of duplicated rows and the value counts of the
    ``TX_FRAUD`` column.

    Side effects
    ------------
    Sets the global pandas option ``display.float_format`` to two fixed
    decimals; the option stays in effect after this function returns.

    Returns
    -------
    None
    """
    data=pd.DataFrame(load_data())

    print("Analysing Data")

    #Changing values from scientific notation to a more readable notation
    print("Improving data readability")
    pd.set_option('display.float_format', '{:.2f}'.format)

    #First look at the data
    print("Thease are the first 5 rows of data set.")
    print(data.head())

    #Data types
    print("Observing the data types in the dataset.")
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() only added a stray "None" line to the output.
    data.info()

    #Data set description
    print("Description of the dataset.")
    print(data.describe())

    #Data set null values observation
    print("Sum of null values in the dataset.")
    print(data.isnull().sum())

    #Duplicates in the dataset
    print("Duplicates in the data are: ",data.duplicated().sum())

    #Checking imbalance in data
    tx_fraud = data['TX_FRAUD']
    print("Imbalanced data: \n",tx_fraud.value_counts())

In [None]:
from dataLoading import load_data
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import seaborn
import matplotlib.pyplot as plt


def data_preprocessing(random_state=None):
    """Clean the transactions data and balance the ``TX_FRAUD`` classes.

    Steps, in order: print the exploratory summary (``data_Analysis()``), load
    the data, drop rows containing nulls, drop duplicated rows, then randomly
    oversample the minority ``TX_FRAUD`` class.

    Parameters
    ----------
    random_state : int or None, optional
        Seed forwarded to ``RandomOverSampler``. The default ``None`` keeps the
        previous unseeded behaviour; pass an integer for a reproducible
        resample.

    Returns
    -------
    pandas.DataFrame
        The resampled frame. The full frame (including ``TX_FRAUD``) is what
        gets resampled, so the target column is still present in the result.
    """
    #Analysing data before processing it.
    data_Analysis()

    #Loading data
    data = pd.DataFrame(load_data())

    print("Preprocessing the data.")
    #Dropping null values
    print("Droping null values")
    # dropna() returns a new frame; without the assignment the call had no
    # effect and the nulls stayed in the data.
    data = data.dropna()
    print("Null values dropped")
    print(data.isnull().sum())

    #Dropping duplicate values
    print("Droping Duplicate values")
    # Same as above: drop_duplicates() is not in-place.
    data = data.drop_duplicates()
    print("Duplicates dropped")

    #Random oversampling of the minority class to balance data
    print("Oversampling to balance data")
    tx_fraud = data['TX_FRAUD']
    oversampler = RandomOverSampler(sampling_strategy="minority", random_state=random_state)
    new_balanced_data, new_tx_fraud = oversampler.fit_resample(data, tx_fraud)

    #Check the new balanced data
    print("Balanced data: \n",new_tx_fraud.value_counts())

    return new_balanced_data

In [None]:
import seaborn
import matplotlib.pyplot as plt
import pandas as pd

def data_Visualization():
    """Show a seaborn pair plot of the preprocessed data.

    Runs ``data_preprocessing()`` to obtain the data, draws the pair plot over
    all of its columns and displays the figure. Returns ``None``.
    """
    preprocessed = data_preprocessing()
    frame = pd.DataFrame(preprocessed)

    print("Visualizing the data.")
    # The pairwise plots are used to spot outliers by eye.
    seaborn.pairplot(data=frame)
    plt.show()

In [None]:
import pandas as pd

def feature_Engineering():
    """Return the preprocessed data without helper columns and large amounts.

    Shows the pair plot first, then preprocesses the data again, removes the
    ``Unnamed: 0`` and ``TX_DATETIME`` columns and keeps only the rows whose
    ``TX_AMOUNT`` is below 75000.

    Returns
    -------
    pandas.DataFrame
    """
    # Plot first; data_Visualization() runs its own preprocessing pass.
    data_Visualization()

    preprocessed = pd.DataFrame(data_preprocessing())

    print("Feature Engineering.")
    print("Removing outliers and unwanted columns")
    without_index = preprocessed.drop(columns=['Unnamed: 0'])
    without_timestamp = without_index.drop(columns=['TX_DATETIME'])

    # Amounts of 75000 or more are treated as outliers and filtered out.
    amount_below_cutoff = without_timestamp["TX_AMOUNT"] < 75000
    return without_timestamp[amount_below_cutoff]

In [None]:
# Checking the correlation matrix to identify the essential features.
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

def feature_selection():
    """Plot the correlation heatmap and drop the identifier columns.

    Takes the frame produced by ``feature_Engineering()``, shows an annotated
    heatmap of its correlation matrix, then removes ``TRANSACTION_ID``,
    ``CUSTOMER_ID`` and ``TERMINAL_ID``. The three columns are dropped
    unconditionally; the heatmap is for inspection only and does not drive the
    selection.

    Returns
    -------
    pandas.DataFrame
    """
    engineered = pd.DataFrame(feature_Engineering())

    print("Feature selection based on corelation matrix")
    # Annotated heatmap of the pairwise correlations.
    sn.heatmap(engineered.corr(), annot=True)
    plt.show()

    # Remove the columns the author judged to have very little correlation.
    selected = engineered
    for id_column in ('TRANSACTION_ID', 'CUSTOMER_ID', 'TERMINAL_ID'):
        selected = selected.drop(columns=[id_column])

    return selected

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

def data_Spliting():
    """Split the selected features into train and test parts.

    ``TX_FRAUD`` is the target and every other column of the frame returned by
    ``feature_selection()`` is a feature. A quarter of the rows go to the test
    part and the split is seeded with ``random_state=7``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    selected = pd.DataFrame(feature_selection())

    print("Splitting the data.")
    # Target column on one side, all remaining columns on the other.
    target = selected["TX_FRAUD"]
    features = selected.drop(columns=["TX_FRAUD"])

    return tuple(train_test_split(features, target, test_size=0.25, random_state=7))

In [None]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import pickle

# Candidate classifiers compared by modelSelection().
# Every estimator that accepts a seed gets the same one (the value this file
# already uses for train_test_split) so repeated runs fit identical models and
# pick the same winner. KNeighborsClassifier takes no random_state.
MODEL_RANDOM_STATE = 7
dtc = tree.DecisionTreeClassifier(random_state=MODEL_RANDOM_STATE)
rfc = RandomForestClassifier(random_state=MODEL_RANDOM_STATE)
gbc = GradientBoostingClassifier(random_state=MODEL_RANDOM_STATE)
lr = LogisticRegression(random_state=MODEL_RANDOM_STATE)
knc = KNeighborsClassifier()
svc = LinearSVC(random_state=MODEL_RANDOM_STATE)

def modelSelection():
    """Fit every candidate classifier, report the best one and pickle it.

    Obtains the train/test split from ``data_Spliting()``, fits each of the
    module-level estimators on the training part and prints its accuracy on
    the test part. The estimator with the highest accuracy (the last one
    listed in case of a tie) is then evaluated with precision, recall, F1 and
    accuracy, and written to ``model.pkl`` in the working directory.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): the winner is chosen and evaluated on the same test part, so
    the printed metrics are likely optimistic.
    """
    x_train, x_test, y_train, y_test = data_Spliting()

    #Name of algorithms used.
    algo_names = ["Decision Tree","Logistic Regression","Random Forest Classification","Gradient Boosting Classification","Linear Support Vector Classification","K Neighbours Classification"]
    algo = [dtc, lr, rfc, gbc, svc, knc]

    print("Selecting the best model")
    #Finding accuracy scores for each classifier.
    Al_Scores = []
    for name, model in zip(algo_names, algo):
        score = model.fit(x_train, y_train).score(x_test, y_test)
        Al_Scores.append(score)
        print(name,":",score*100)

    #Deciding final model
    # The index is part of the key so that, among equal scores, the model
    # listed last wins, which is what the previous scan-and-overwrite loop did.
    best_index = max(range(len(Al_Scores)), key=lambda idx: (Al_Scores[idx], idx))
    best_model = algo[best_index]
    best_model_name = algo_names[best_index]
    print("The best model is", best_model_name)

    #Finding metrics for the best model and printing them
    y_predict = best_model.predict(x_test)
    # average='micro' on a two-class target makes precision, recall and F1 all
    # equal to accuracy, and pos_label is ignored unless average='binary'.
    # Report the scores of the positive class instead.
    # Assumes TX_FRAUD marks fraud with the label 1 (scikit-learn's default
    # pos_label) -- TODO confirm against the dataset.
    precision = precision_score(y_test, y_predict, average='binary')
    recall = recall_score(y_test, y_predict, average='binary')
    accuracy = accuracy_score(y_test, y_predict)
    f1score = f1_score(y_test, y_predict, average='binary')

    print("Precision = ", precision)
    print("f1score = ", f1score)
    print("recall = ", recall)
    print("accuracy = ", accuracy)

    #Creating pickle file for best model
    print("Creating pickel file.")
    # The context manager closes (and flushes) the file even if dump() raises.
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(best_model, model_file)
    
# Run the whole pipeline end to end: every stage function calls the stage
# before it, so this single call loads, analyses, preprocesses, plots and
# splits the data, trains the classifiers and writes model.pkl.
modelSelection()
