In [1]:
# Data Manipulation Libraries
import pandas as pd 
import numpy as np

# Plotting Libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Directory that holds the raw CSV files. It is defined once so the location can be
# changed in a single place when the notebook is run on another machine.
DATA_DIR = 'C:\\Users\\Tsi\\Desktop\\10academy\\Week6\\data'

# Load the dataset
data = pd.read_csv(DATA_DIR + '\\data.csv', low_memory=False)

# Load variable definitions
data_vd = pd.read_csv(DATA_DIR + '\\Xente_Variable_Definitions.csv', low_memory=False)

<h1>Data Overview</h1>

In [4]:
# Preview the first rows of the transactions as plain text, under a heading line
print("First few rows of the dataset:", data.head(), sep="\n")

# Heading for the variable definitions table
print("\nVariable definitions:")
# Bare last expression: the notebook renders it as the cell's output
data_vd

First few rows of the dataset:
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory    ChannelId   

Unnamed: 0,Column Name,Definition
0,TransactionId,Unique �transaction identifier on platform
1,BatchId,Unique number assigned to a batch of transacti...
2,AccountId,Unique number identifying the customer on plat...
3,SubscriptionId,Unique number identifying the customer subscri...
4,CustomerId,Unique identifier attached to Account
5,CurrencyCode,Country currency
6,CountryCode,Numerical geographical code of country
7,ProviderId,Source provider of Item �bought.
8,ProductId,Item name being bought.
9,ProductCategory,ProductIds are organized into these broader pr...


In [None]:
# Clean the data by removing the currency and country code columns.
# NOTE(review): the stated rationale is that these columns do not affect the target
# column FraudResult - presumably because they hold a single value in this dataset; confirm.
# `drop` is a method, so it has to be called (subscripting it raises TypeError).
# `columns=` drops by column label and returns a new frame; `data` itself is left unchanged.
data1 = data.drop(columns=['CurrencyCode', 'CountryCode'])

In [None]:
# Proxy variable: a categorical risk label derived from the fraud flag
def categorize_risk(row):
    """Return the risk label for a single transaction row.

    Parameters
    ----------
    row : mapping-like
        Any object that can be indexed with the key 'FraudResult'.

    Returns
    -------
    str
        'High Risk' when row['FraudResult'] equals 1, otherwise 'Low Risk'.
    """
    flagged_as_fraud = row['FraudResult'] == 1
    return 'High Risk' if flagged_as_fraud else 'Low Risk'

# Create the RiskProfile column: 'High Risk' where FraudResult equals 1, 'Low Risk' otherwise.
# This yields the same labels as applying categorize_risk to every row, but is computed
# in one vectorized pass instead of one Python function call per row.
data['RiskProfile'] = np.where(data['FraudResult'] == 1, 'High Risk', 'Low Risk')

Feature Engineering: Feature engineering is the process of creating new features (or variables) from the raw data that better represent the underlying patterns and relationships in the data. It can involve tasks such as:
1. Selecting the most relevant features from the raw data
2. Transforming or combining existing features to create new, more informative features
3. Encoding categorical variables into a numerical format that the model can understand
4. Handling missing data, outliers, and other data quality issues

<h1>Create Aggregate Features</h1>

Total Transaction Amount: 

In [5]:
#Sum of all transaction amounts for each customer.


Average Transaction Amount: 

In [6]:
#Average transaction amount per customer.


Transaction Count:

In [7]:
#Number of transactions per customer.


Standard Deviation of Transaction Amounts: 

In [8]:
#Variability of transaction amounts per customer.


<h1>Extract Features</h1>

In [9]:
# Feature extraction / transformation
# Parse the transaction start time into datetimes and store it back on the frame
transaction_times = pd.to_datetime(data['TransactionStartTime'])
data['TransactionStartTime'] = transaction_times

# Derive one calendar/clock feature per datetime component
time_parts = transaction_times.dt
data['Year'] = time_parts.year
data['Month'] = time_parts.month
data['Day'] = time_parts.day
data['Hour'] = time_parts.hour

<h1>Encode Categorical Variables</h1>

In [None]:
#feature transforming...encoding



<h1>Handle Missing Values</h1>

In [10]:

# Count the missing entries in each column of the dataset
data.isna().sum()

TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
Year                    0
Month                   0
Day                     0
Hour                    0
dtype: int64

<h1>Normalize/Standardize Numerical Features</h1>

Min-Max Scaling (Normalization): This feature-scaling method rescales each feature to a common range, typically between 0 and 1. The formula is: $x_{\text{normalized}} = \dfrac{x - \min(x)}{\max(x) - \min(x)}$.

Standardization (Z-score Normalization): This method transforms each feature to have a mean of 0 and a standard deviation of 1. The formula is: $x_{\text{standardized}} = \dfrac{x - \operatorname{mean}(x)}{\operatorname{std}(x)}$.

Z-score Based Outlier Removal

In [11]:
# Feature scaling / outlier removal
# Select the numerical (float64 / int64) columns from the dataframe
numerical_columns = data.select_dtypes(include=['float64', 'int64'])

from scipy import stats
# The former `from scikit-learn import maxmin` line was removed: a hyphen is not valid in a
# module name, so the statement was a SyntaxError that stopped the whole cell from running.

# Normalizing (Min-Max Scaling)
# TODO: not implemented yet.

# Z-score based outlier detection
# Calculate the Z-score of every value relative to its own column
z_scores = stats.zscore(data[numerical_columns.columns])
# NOTE(review): every float64/int64 column takes part in the detection. That presumably
# includes the target FraudResult and code-like columns such as CountryCode, so rows of a
# rare class could be discarded as "outliers" - confirm this is intended.

# Define a threshold for outlier detection (in standard deviations from the column mean)
threshold = 3

# True wherever a value lies more than `threshold` standard deviations from its column mean
outlier_mask = np.abs(np.asarray(z_scores)) > threshold

# (row positions, column positions) of the outlying values
outlier_indices = np.where(outlier_mask)

# Keep only the rows that contain no outlying value. A boolean row mask is used instead of
# data.drop(outlier_indices[0]) because np.where yields row *positions* while drop expects
# index *labels*; the two only coincide when the frame has the default 0..n-1 index.
data_without_outliers = data.loc[~outlier_mask.any(axis=1)].copy()

# Print the updated dataframe without outliers
print(data_without_outliers)

              TransactionId         BatchId       AccountId  \
0       TransactionId_76871   BatchId_36123  AccountId_3957   
1       TransactionId_73770   BatchId_15642  AccountId_4841   
2       TransactionId_26203   BatchId_53941  AccountId_4229   
3         TransactionId_380  BatchId_102363   AccountId_648   
4       TransactionId_28195   BatchId_38780  AccountId_4841   
...                     ...             ...             ...   
95657   TransactionId_89881   BatchId_96668  AccountId_4841   
95658   TransactionId_91597    BatchId_3503  AccountId_3439   
95659   TransactionId_82501  BatchId_118602  AccountId_4841   
95660  TransactionId_136354   BatchId_70924  AccountId_1346   
95661   TransactionId_35670   BatchId_29317  AccountId_4841   

            SubscriptionId       CustomerId CurrencyCode  CountryCode  \
0       SubscriptionId_887  CustomerId_4406          UGX          256   
1      SubscriptionId_3829  CustomerId_4406          UGX          256   
2       SubscriptionId_2