# Data Processing

### Load Dataset in Jupyter Notebook

In [1]:
import pandas as pd

In [2]:
# Read the raw transactions export into a DataFrame.
raw_csv_path = 'Credit card transactions_India.csv'
df = pd.read_csv(raw_csv_path)

### Basic Data Inspection

In [3]:
# Preview the first five rows to see the column layout and sample values.

df.head(n=5)

Unnamed: 0,index,City,Date,Card Type,Exp Type,Gender,Amount
0,0,"Delhi, India",29-Oct-14,Gold,Bills,F,82475
1,1,"Greater Mumbai, India",22-Aug-14,Platinum,Bills,F,32555
2,2,"Bengaluru, India",27-Aug-14,Silver,Bills,F,101738
3,3,"Greater Mumbai, India",12-Apr-14,Signature,Bills,F,123424
4,4,"Bengaluru, India",5-May-15,Gold,Bills,F,171574


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.

df.describe()

Unnamed: 0,index,Amount
count,26052.0,26052.0
mean,13025.5,156411.537425
std,7520.708943,103063.254287
min,0.0,1005.0
25%,6512.75,77120.25
50%,13025.5,153106.5
75%,19538.25,228050.0
max,26051.0,998077.0


### Data Cleaning

In [5]:
# Normalise headers to snake_case: lower-case each name and swap spaces for underscores.
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

In [6]:
# Re-check the preview to confirm the renamed headers.
df.head(n=5)

Unnamed: 0,index,city,date,card_type,exp_type,gender,amount
0,0,"Delhi, India",29-Oct-14,Gold,Bills,F,82475
1,1,"Greater Mumbai, India",22-Aug-14,Platinum,Bills,F,32555
2,2,"Bengaluru, India",27-Aug-14,Silver,Bills,F,101738
3,3,"Greater Mumbai, India",12-Apr-14,Signature,Bills,F,123424
4,4,"Bengaluru, India",5-May-15,Gold,Bills,F,171574


### Convert Data Types

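- Ensure numerical columns are of appropriate types (float, int).
- Dates need to be in datetime format.
- Strip the redundant `, India` suffix from city names.
- Rename the `index` column to `transaction_id` and make it 1-based.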

In [7]:
# Cast amount to float, parse the date strings into datetimes, and strip the
# ", India" suffix from the city names. Existing columns keep their positions.
df = df.assign(
    amount=df['amount'].astype(float),
    date=pd.to_datetime(df['date'], format='mixed'),
    city=df['city'].str.replace(', India', ''),
)

# reset_index() inserts the old row index as a new column. Because a column
# named 'index' already exists, pandas calls the inserted one 'level_0'
# (it is dropped in a later cell). The pre-existing 'index' column is the one
# that becomes 'transaction_id'.
# NOTE(review): this cell is not re-runnable as-is; a second run would hit the
# existing 'level_0' column and bump transaction_id again.
df = df.reset_index()
df = df.rename(columns={'index': 'transaction_id'})

# Shift the ids from 0-based to 1-based.
df['transaction_id'] += 1

In [8]:
# Inspect the converted frame; note the extra level_0 column added by reset_index().
df.head(n=5)

Unnamed: 0,level_0,transaction_id,city,date,card_type,exp_type,gender,amount
0,0,1,Delhi,2014-10-29,Gold,Bills,F,82475.0
1,1,2,Greater Mumbai,2014-08-22,Platinum,Bills,F,32555.0
2,2,3,Bengaluru,2014-08-27,Silver,Bills,F,101738.0
3,3,4,Greater Mumbai,2014-04-12,Signature,Bills,F,123424.0
4,4,5,Bengaluru,2015-05-05,Gold,Bills,F,171574.0


In [9]:
# Remove the leftover level_0 column created by reset_index().
df = df.drop(columns=['level_0'])

In [10]:
# Drop rows that are duplicated across every column (transaction_id included).
# NOTE(review): transaction_id looks unique per row, so this may never remove
# anything — confirm whether duplicates should be judged without the id.
df = df.drop_duplicates()

In [11]:
# (rows, columns) of the frame after the cleaning steps above.
df.shape

(26052, 7)

### Explore Data

In [12]:
# Show each column's dtype and non-null count, plus the frame's memory usage.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26052 entries, 0 to 26051
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   transaction_id  26052 non-null  int64         
 1   city            26052 non-null  object        
 2   date            26052 non-null  datetime64[ns]
 3   card_type       26052 non-null  object        
 4   exp_type        26052 non-null  object        
 5   gender          26052 non-null  object        
 6   amount          26052 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 1.4+ MB


In [13]:
# Count missing values per column.

df.isna().sum()

transaction_id    0
city              0
date              0
card_type         0
exp_type          0
gender            0
amount            0
dtype: int64

In [15]:
# Persist the cleaned data; index=False keeps the row index out of the file.
cleaned_csv_path = 'Credit_card_transactions.csv'
df.to_csv(cleaned_csv_path, index=False)