# Import Libraries

In [None]:
import matplotlib.pyplot as Pt ##Matplotlib is a comprehensive library for creating static, animated, and interactive visualizations in Python. Matplotlib makes easy things easy and hard things possible.
import seaborn as Ss ##a widely popular data visualization library that is commonly used for data science and machine learning tasks
import matplotlib.ticker as mtick
import plotly.express as px #Plotly express is a high-level data visualization package that allows you to create interactive plots with very little code. It is built on top of Plotly Graph Objects, which provides a lower-level interface for developing custom visualizations.
import pandas as F #Pandas is one of the tools in Machine Learning which is used for data cleaning and analysis. It has features which are used for exploring, cleaning, transforming and visualizing from data.
import numpy as N #In Python we have lists that serve the purpose of arrays, but they are slow to process.NumPy aims to provide an array object that is up to 50x faster than traditional Python lists.The array object in NumPy is called ndarray, it provides a lot of supporting functions that make working with ndarray very easy.

# Apply seaborn's default theme and make every figure 17x10 inches.
# set_theme() is the preferred entry point; set() is only an alias for it
# that the seaborn docs mark as a candidate for removal.
Ss.set_theme(rc={'figure.figsize': (17, 10)})


# Load Data

In [None]:
# Read the training split into the working DataFrame `d`.
d = F.read_csv(filepath_or_buffer='../input/fraud-detection/fraudTrain.csv')
# Number of missing values per column (isnull is the pandas alias of isna).
d.isnull().sum()

# Exploratory data analysis starts here


In [None]:
# Summary statistics to understand the transaction amount (amt)
#d.describe()

**Visualization starts here**

In [None]:
# Transaction amount vs. fraud: percentage histogram of amounts up to 1000,
# normalised separately for each is_fraud value (common_norm=False).
Axis = Ss.histplot(
    data=d.loc[d['amt'] <= 1000],
    x='amt',
    hue='is_fraud',
    bins=25,
    stat='percent',
    multiple='dodge',
    common_norm=False,
)
Axis.set(
    xlabel='transaction Amt in USD',
    ylabel='percentage of every  type',
)
# NOTE(review): the labels are attached to the plotted artists by position,
# not by hue value -- confirm the order matches what is drawn.
Pt.legend(title='type', labels=['fraud', 'no fraud'])

In [None]:
# Gender vs. fraud: percentage histogram of the card holder's gender,
# normalised separately for each is_fraud value (common_norm=False).
Axis = Ss.histplot(
    data=d,
    x='gender',
    hue='is_fraud',
    stat='percent',
    multiple='dodge',
    common_norm=False,
)
Axis.set(
    xlabel='credit card holder gender',
    ylabel='percentage',
)
# NOTE(review): the labels are attached to the plotted artists by position,
# not by hue value -- confirm the order matches what is drawn.
Pt.legend(title='Type', labels=['fraud', 'no fraud'])

In [None]:
# Age vs. fraud: derive an `age` column and compare its density per class.
import datetime as DA

# Age is the difference between the current calendar year and the year of
# birth parsed from `dob`.
# NOTE(review): the value therefore depends on the day the notebook is run.
birth_year = F.to_datetime(d['dob']).dt.year
d['age'] = DA.date.today().year - birth_year

# One density curve per is_fraud value, each normalised on its own.
Axis = Ss.kdeplot(data=d, x='age', hue='is_fraud', common_norm=False)
Axis.set(
    xlabel='credit card holder age',
    ylabel='density',
    title='Age Distribution in Fraudulent vs Non-Fraudulent Transactions',
)
Pt.xticks(N.arange(0, 110, 5))  # a tick every 5 years from 0 to 105
# NOTE(review): the labels are attached to the plotted artists by position,
# not by hue value -- confirm the order matches what is drawn.
Pt.legend(title='Type', labels=['fraud', 'no fraud'])

In [None]:
# Remove fully duplicated rows so the model is not trained on repeated records.
# Rebinding `d` to the de-duplicated frame is equivalent to inplace=True here.
d = d.drop_duplicates()

In [None]:
# Categorical variable encoding: many learning algorithms cannot consume
# string-valued inputs, so each categorical column is expanded into indicator
# (one-hot) columns. drop_first=True omits the indicator of the first level.

# Merchant category.
Cate_Onehot = F.get_dummies(data=d['category'], prefix='category', drop_first=True)
print(Cate_Onehot)

# Card holder gender.
Gend_Onehot = F.get_dummies(data=d['gender'], prefix='gender', drop_first=True)
print(Gend_Onehot)

# Card holder age: one indicator column per distinct age value.
Age_Onehot = F.get_dummies(data=d['age'], prefix='age', drop_first=True)
print(Age_Onehot)

# Data modeling and prediction

In [None]:
# Append the three one-hot frames to the original columns, side by side.
# NOTE(review): d1 is not referenced by any of the visible cells.
d1 = F.concat(objs=[d, Cate_Onehot, Gend_Onehot, Age_Onehot], axis='columns')



In [None]:
# Keep only the columns used for modelling: four features plus the target.
train = d.loc[:, ['category', 'amt', 'gender', 'age', 'is_fraud']]
train

In [None]:
# One-hot encode the string-valued columns of the training subset;
# drop_first=True omits the indicator of each variable's first level.
train = F.get_dummies(data=train, drop_first=True)
print(train)


In [None]:
# Split the encoded training frame into the target vector and feature matrix.
# `train` was already one-hot encoded in the previous cell; calling
# get_dummies a second time finds no string columns left and merely copies
# the frame, so the redundant call is omitted.
y_train = train['is_fraud'].to_numpy()
# Request float explicitly: recent pandas versions emit bool dummy columns,
# and mixing bool with numeric columns would otherwise produce an
# object-dtype array instead of a numeric one.
X_train = train.drop(columns='is_fraud').to_numpy(dtype=float)

In [None]:
# Load the test split and derive `age` the same way as for the training data
# (current calendar year minus the year of birth parsed from `dob`).
# NOTE(review): the value therefore depends on the day the notebook is run.
test = F.read_csv(filepath_or_buffer='../input/fraud-detection/fraudTest.csv')
test_birth_year = F.to_datetime(test['dob']).dt.year
test['age'] = DA.date.today().year - test_birth_year
# Keep the same four features plus the target as in the training subset.
test = test.loc[:, ['category', 'amt', 'gender', 'age', 'is_fraud']]
test

In [None]:
# One-hot encode the string-valued columns of the test subset;
# drop_first=True omits the indicator of each variable's first level.
test = F.get_dummies(data=test, drop_first=True)
test

In [None]:
# Split the encoded test frame into the target vector and feature matrix.
# `test` was already one-hot encoded in the previous cell, so get_dummies is
# not repeated here.
# train and test were encoded independently of each other. Reindexing onto
# the training columns guarantees the same column set and order: an indicator
# missing from the test split is filled with 0, and a column the training
# frame does not have is dropped.
test = test.reindex(columns=train.columns, fill_value=0)
y_test = test['is_fraud'].to_numpy()
# Request float explicitly: recent pandas versions emit bool dummy columns,
# and mixing bool with numeric columns would otherwise produce an
# object-dtype array instead of a numeric one.
X_test = test.drop(columns='is_fraud').to_numpy(dtype=float)

In [None]:
# Logistic regression baseline.
# Logistic regression is a supervised classifier: it feeds a linear
# combination of the features through a sigmoid to estimate the probability
# that a sample belongs to the positive class.
from imblearn.over_sampling import SMOTE  # synthetic minority oversampling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Balance the training set by synthesising minority-class samples.
# SMOTE draws random neighbours, so a fixed seed is needed for the resampled
# data (and every model fitted on it) to be reproducible between runs.
method = SMOTE(random_state=5)
X_resampled, y_resampled = method.fit_resample(X_train, y_train)

# Fit on the balanced data, evaluate on the untouched test split.
Model = LogisticRegression()
Model.fit(X_resampled, y_resampled)
predicted = Model.predict(X_test)

# Precision, recall, F1 and support per class.
print('Classification report:\n', classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)
# Round after subtracting, so the printed share carries no floating-point
# noise from computing 1 - <rounded value>.
print('Share of Non-Fraud in Test Data:', round(1 - y_test.mean(), 4))

In [None]:
# Random forest: combines the output of multiple decision trees into a single
# prediction. It is fitted on the SMOTE-resampled training data and evaluated
# on the untouched test split.
from sklearn.ensemble import RandomForestClassifier

Model2 = RandomForestClassifier(random_state=5)  # fixed seed for repeatable trees
Model2.fit(X_resampled, y_resampled)
predicted = Model2.predict(X_test)

print('Classification report:\n', classification_report(y_test, predicted))
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted)
print('Confusion matrix:\n', conf_mat)
# Round after subtracting, so the printed share carries no floating-point
# noise from computing 1 - <rounded value>.
print('Share of Non-Fraud in Test Data:', round(1 - y_test.mean(), 4))
# Reading the report:
# - macro-averaged precision is the plain average of the per-class precisions;
# - weighted-averaged precision also averages the per-class precisions, but
#   weights each class by its number of samples.

### Serializing the trained model

In [None]:
import pickle

# Serialize the fitted random forest to the binary file 'finalModel'.
# The context manager closes the file even if dump() raises, so the pickle is
# fully flushed to disk and the handle is not leaked.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open('finalModel', 'wb') as model_file:
    pickle.dump(Model2, model_file)