# importing requried libraries

In [40]:
import pandas as pd

In [41]:
file_path='sri_bhavana.csv'
df=pd.read_csv(file_path)
print("Size of dataset:",df.shape)
df.head()

Size of dataset: (1048575, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# checking for the null values

In [42]:
# check for null data and if present remove them.
null = df.isnull().values.any()

if null:
  df = df.dropna()
  print("Removed null rows.")
  print("New dataset size: ", df.shape)
else:
  print("No null values found in the DataFrame.")

No null values found in the DataFrame.


# checking for duplicates

In [43]:
# check for duplicate rows and if present remove them.
duplicates = df.duplicated().any()

if duplicates:
  df = df.drop_duplicates()
  print("Removed duplicate rows.")
  print("New dataset size: ", df.shape)
else:
  print("No duplicate rows found in the DataFrame.")

No duplicate rows found in the DataFrame.


# missing values

In [44]:
#check for the missing values
print("\nmissing values before handling:")
print(df.isnull().sum())


missing values before handling:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


# Types of payment

In [45]:
# check for the types of payments 
distinct_types = df['type'].unique()
print("Types of payment: ", distinct_types)

Types of payment:  ['PAYMENT' 'TRANSFER' 'CASH_OUT' 'DEBIT' 'CASH_IN']


In [46]:
# convert the types to numerical form and update the dataset
types_dict = {v: i for i, v in enumerate(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'])}
print("Types dictionary: ", types_dict)
df['type'] = df['type'].replace(types_dict)
df.head()

Types dictionary:  {'PAYMENT': 0, 'TRANSFER': 1, 'CASH_OUT': 2, 'DEBIT': 3, 'CASH_IN': 4}


  df['type'] = df['type'].replace(types_dict)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,0,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,0,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,2,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,0,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# droping the unwanted coumns

In [47]:
# List of columns to drop (adjust this list based on actual columns)
columns_to_drop = ['step', 'nameOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']
# Drop the columns
df = df.drop(columns=columns_to_drop, errors='ignore')

# weight of fraud and not fraud data in the dataset

In [48]:
# check the weight of fraud and not fraud data in the dataset
df['isFraud'].value_counts()

isFraud
0    1047433
1       1142
Name: count, dtype: int64

# reduce the weightage of 'not fraud' data

In [49]:
# reduce the weightage of 'not fraud' data
from sklearn.utils import resample
majority = df[(df['isFraud']==0)] # not fraud data
minority = df[(df['isFraud']==1)] # fraud data

majority_downsampled = resample(majority, 
                                replace=False, 
                                n_samples=len(minority), 
                                random_state=71) #downsample the majority data

new_df = pd.concat([majority_downsampled, minority]) #concatenate the downsampled majority and minority
new_df = new_df.sample(frac=1).reset_index(drop=True) #shuffle the new dataframe

print("New dataframe size: ",new_df.shape)

print("\nNew Distribution")
print(new_df['isFraud'].value_counts())

new_df.head()

New dataframe size:  (2284, 5)

New Distribution
isFraud
0    1142
1    1142
Name: count, dtype: int64


Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,isFraud
0,2,492239.95,870700.78,378460.83,0
1,2,372149.98,204553.0,0.0,0
2,2,77452.6,0.0,0.0,0
3,1,5021186.54,5021186.54,0.0,1
4,2,91087.35,0.0,0.0,0


# saving the cleaned file 

In [50]:
# save the cleaned file
new_df.to_csv("cleaned_dataset.csv", index=False)
print("Cleaned dataset saved.")

Cleaned dataset saved.
