In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load the raw transaction data.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
df = pd.read_csv(r"C:\Users\Vaahnitha\OneDrive\Desktop\programming\hsbc hackathon\train_hsbc_df.csv")

# Preview the first rows and the schema. A bare `df.head` / `df.info` only
# references the bound method and shows nothing, so the methods are called
# explicitly (head() is printed because it is not the cell's last expression).
print(df.head())
df.info()

# Null count per column; as the last expression this is the cell's output.
df.isnull().sum()

step           0
customer       0
age            0
gender         0
zipcodeOri     0
merchant       0
zipMerchant    0
category       0
amount         0
fraud          0
dtype: int64

In [3]:
# Count nulls per column, then keep only the columns that have at least one
null_counts = df.isna().sum()
missing_values = null_counts.loc[null_counts > 0]
print("Columns with missing values:\n", missing_values)

# Express the remaining counts as a percentage of the total number of rows
row_total = len(df)
missing_percentage = (missing_values / row_total) * 100
print("Percentage of missing data in each column:\n", missing_percentage)


Columns with missing values:
 Series([], dtype: int64)
Percentage of missing data in each column:
 Series([], dtype: float64)


In [4]:
# Per-column null counts for the whole frame
missing_values = df.isna().sum()

# Report the counts for every column, including those with zero nulls
print("Missing values in each column:\n", missing_values)

# Counts are never negative, so "any non-zero count" is the same test as
# "the total is not zero"
has_missing = bool(missing_values.any())
print(
    "Missing values detected. Please handle them as needed."
    if has_missing
    else "No missing values found in the dataset."
)


Missing values in each column:
 step           0
customer       0
age            0
gender         0
zipcodeOri     0
merchant       0
zipMerchant    0
category       0
amount         0
fraud          0
dtype: int64
No missing values found in the dataset.


In [5]:
# Show every column label currently present in the DataFrame
column_labels = df.columns
print("Columns in the DataFrame:", column_labels)

Columns in the DataFrame: Index(['step', 'customer', 'age', 'gender', 'zipcodeOri', 'merchant',
       'zipMerchant', 'category', 'amount', 'fraud'],
      dtype='object')


In [6]:
# Feature engineering for 'customer': the raw value is only an identifier, so
# it is replaced by an aggregate - the number of rows (transactions) that
# share the same customer id.
rows_per_customer = df.groupby('customer')['customer']
df['customer_transaction_count'] = rows_per_customer.transform('count')

# The identifier itself is no longer needed once the count has been derived
df = df.drop(columns=['customer'])

In [7]:
# Re-list the columns after the 'customer' -> count replacement
current_columns = df.columns
print("Columns in the DataFrame:", current_columns)

Columns in the DataFrame: Index(['step', 'age', 'gender', 'zipcodeOri', 'merchant', 'zipMerchant',
       'category', 'amount', 'fraud', 'customer_transaction_count'],
      dtype='object')


In [8]:
# Inspect the distinct 'gender' codes and how often each one occurs
gender_column = df['gender']
print(gender_column.unique())
print(gender_column.value_counts())

["'M'" "'F'" "'E'" "'U'"]
gender
'F'    280199
'M'    231998
'E'      1009
'U'       437
Name: count, dtype: int64


In [9]:
# State of the column before any recoding
gender_before = df['gender']
print("Unique values in 'gender':", gender_before.unique())
print("Frequency distribution of 'gender':")
print(gender_before.value_counts())

# Codes that are kept as-is; note that the raw values carry literal single
# quotes. Everything else is folded into one catch-all code.
expected_categories = ["'M'", "'F'"]
new_category = "'O'"

# Keep a value where it is one of the expected codes, otherwise substitute
# the catch-all code
is_expected = df['gender'].isin(expected_categories)
df['gender'] = df['gender'].where(is_expected, new_category)

# State of the column after recoding
gender_after = df['gender']
print("Updated unique values in 'gender':", gender_after.unique())
print("Updated frequency distribution of 'gender':")
print(gender_after.value_counts())


Unique values in 'gender': ["'M'" "'F'" "'E'" "'U'"]
Frequency distribution of 'gender':
gender
'F'    280199
'M'    231998
'E'      1009
'U'       437
Name: count, dtype: int64
Updated unique values in 'gender': ["'M'" "'F'" "'O'"]
Updated frequency distribution of 'gender':
gender
'F'    280199
'M'    231998
'O'      1446
Name: count, dtype: int64


In [10]:
# Columns to be one-hot encoded
categorical_features = ['gender', 'zipcodeOri', 'merchant', 'zipMerchant', 'category']

# Categories not seen during fit are encoded as all-zero rows at transform time
encoder = OneHotEncoder(handle_unknown='ignore')

# Fit on the categorical columns and encode them in one pass
encoded_data = encoder.fit_transform(df[categorical_features])

# Materialise the encoded matrix as a dense frame with one named column per
# (feature, category) pair
indicator_names = encoder.get_feature_names_out(categorical_features)
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=indicator_names)

# Swap the raw categorical columns for their indicator columns. The index is
# reset so both frames line up positionally when concatenated side by side.
non_categorical = df.drop(columns=categorical_features).reset_index(drop=True)
df = pd.concat([non_categorical, encoded_df], axis=1)

In [11]:
# Inspect the dtype of every column after encoding
column_dtypes = df.dtypes
print("Data types of columns:\n", column_dtypes)

Data types of columns:
 step                                 int64
age                                 object
amount                             float64
fraud                                int64
customer_transaction_count           int64
                                    ...   
category_'es_sportsandtoys'        float64
category_'es_tech'                 float64
category_'es_transportation'       float64
category_'es_travel'               float64
category_'es_wellnessandbeauty'    float64
Length: 75, dtype: object


In [12]:
# 'age' is still an object column; list its distinct raw values
age_values = df['age'].unique()
print("Unique values in 'age' column:\n", age_values)


Unique values in 'age' column:
 ["'3'" "'4'" "'2'" "'5'" "'1'" "'6'" "'0'" "'U'"]


In [13]:
# Treat the quoted 'U' code as missing
age_raw = df['age'].replace({'\'U\'': np.nan})

# Strip the literal single quotes and parse what is left as numbers; anything
# unparseable becomes NaN
age_numeric = pd.to_numeric(age_raw.str.strip("'"), errors='coerce')

# Impute missing ages with the median of the parsed values, then store the
# column as integers
age_median = age_numeric.median()
df['age'] = age_numeric.fillna(age_median).astype(int)

# Verify the result
print(df['age'].unique())
print(df.dtypes)

[3 4 2 5 1 6 0]
step                                 int64
age                                  int32
amount                             float64
fraud                                int64
customer_transaction_count           int64
                                    ...   
category_'es_sportsandtoys'        float64
category_'es_tech'                 float64
category_'es_transportation'       float64
category_'es_travel'               float64
category_'es_wellnessandbeauty'    float64
Length: 75, dtype: object


In [14]:
# List the columns once more now that encoding and 'age' cleanup are done
final_columns = df.columns
print("Columns in the DataFrame:", final_columns)

Columns in the DataFrame: Index(['step', 'age', 'amount', 'fraud', 'customer_transaction_count',
       'gender_'F'', 'gender_'M'', 'gender_'O'', 'zipcodeOri_'28007'',
       'merchant_'M1053599405'', 'merchant_'M117188757'',
       'merchant_'M1198415165'', 'merchant_'M1294758098'',
       'merchant_'M1313686961'', 'merchant_'M1352454843'',
       'merchant_'M1353266412'', 'merchant_'M1400236507'',
       'merchant_'M1416436880'', 'merchant_'M151143676'',
       'merchant_'M1535107174'', 'merchant_'M1600850729'',
       'merchant_'M1649169323'', 'merchant_'M1726401631'',
       'merchant_'M17379832'', 'merchant_'M1741626453'',
       'merchant_'M1748431652'', 'merchant_'M1788569036'',
       'merchant_'M1823072687'', 'merchant_'M1842530320'',
       'merchant_'M1872033263'', 'merchant_'M1873032707'',
       'merchant_'M1888755466'', 'merchant_'M1913465890'',
       'merchant_'M1946091778'', 'merchant_'M2011752106'',
       'merchant_'M2080407379'', 'merchant_'M209847108'',
       'mer

In [15]:
from sklearn.preprocessing import StandardScaler

# Define numerical features
numerical_features = ['age', 'amount', 'customer_transaction_count']

# Learn the scaling statistics from the training rows only. Fitting on every
# row would let the held-out rows influence the mean/std used to scale the
# training data (data leakage).
#
# train_test_split's shuffle depends only on the number of rows and on
# random_state, so using the same test_size / random_state as the X / y split
# performed later in this notebook selects the same training positions.
# Keep the two calls in sync if either parameter changes.
train_positions, _ = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Initialize StandardScaler and fit it on the training rows
scaler = StandardScaler()
scaler.fit(df[numerical_features].iloc[train_positions])

# Apply the train-derived scaling to every row
df[numerical_features] = scaler.transform(df[numerical_features])

In [16]:
from sklearn.model_selection import train_test_split

# Separate the label from the predictors
target_column = 'fraud'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# End-to-end preprocessing: reload the raw CSV, rebuild every engineered
# feature, and export the result (target column included) to a new CSV.

# Load the dataset
# NOTE(review): absolute, machine-specific path; the notebook will only run
# where this exact file location exists.
df = pd.read_csv(r"C:\Users\Vaahnitha\OneDrive\Desktop\programming\hsbc hackathon\train_hsbc_df.csv")

# Replace the customer identifier with the number of rows per customer
df['customer_transaction_count'] = df.groupby('customer')['customer'].transform('count')
df = df.drop('customer', axis=1)

# Fold every 'gender' code other than 'M' / 'F' into a single 'O' code
# (the raw values carry literal single quotes)
expected_categories = ["'M'", "'F'"]
new_category = "'O'"
df['gender'] = df['gender'].apply(lambda x: x if x in expected_categories else new_category)

# One-hot encode the categorical features and swap them for indicator columns
categorical_features = ['gender', 'zipcodeOri', 'merchant', 'zipMerchant', 'category']
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_data = encoder.fit_transform(df[categorical_features])
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out(categorical_features))
df = pd.concat([df.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)
df = df.drop(categorical_features, axis=1)

# Handle 'age': treat the 'U' code as missing, parse the quoted digits,
# impute missing values with the median, and store the column as integers
df['age'] = df['age'].replace({'\'U\'': np.nan})
df['age'] = pd.to_numeric(df['age'].str.strip("'"), errors='coerce')
df['age'] = df['age'].fillna(df['age'].median())
df['age'] = df['age'].astype(int)

# Standardize numerical features
# NOTE(review): the scaler is fitted on every row. If the exported file is
# later split into train / test sets, the held-out rows will already have
# influenced the scaling statistics - confirm this is acceptable downstream.
numerical_features = ['age', 'amount', 'customer_transaction_count']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Define features and target. These are not used by the export below;
# presumably they are kept for later cells - verify before removing.
X = df.drop('fraud', axis=1)
y = df['fraud']

# `df.drop` above returned a new frame, so `df` itself still holds 'fraud';
# re-assigning it from `y` was a no-op. Check the invariant explicitly instead.
assert 'fraud' in df.columns, "target column 'fraud' is missing from the preprocessed frame"

# Save the preprocessed dataset including 'fraud'
df.to_csv('preprocessed_data_with_fraud.csv', index=False)
