In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset/credit-debit dataset.csv
/kaggle/input/dataset/TransactionDataset1.csv


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
import joblib

# Load datasets
data1 = pd.read_csv('/kaggle/input/dataset/TransactionDataset1.csv').drop(['user_id', 'name', 'addresses', 'email_address', 'transaction_id', 'transaction_date'], axis=1)
data2 = pd.read_csv('/kaggle/input/dataset/credit-debit dataset.csv').dropna()

# Label encoding for categorical variables in data1
label_encoder = LabelEncoder()
data1[data1.select_dtypes(include=['object']).columns] = data1.select_dtypes(include=['object']).apply(label_encoder.fit_transform)

# One-hot encoding for categorical variables in data2
data2_one_hot = pd.get_dummies(data2, columns=['Employment Status', 'Education Level', 'Marital Status'], drop_first=True)

# One-hot encoding for accounts in data2
mlb = MultiLabelBinarizer()
data2_one_hot = pd.concat([data2_one_hot, pd.DataFrame(mlb.fit_transform(data2_one_hot['Money Sources']), columns=mlb.classes_, index=data2_one_hot.index),
                   pd.DataFrame(mlb.fit_transform(data2_one_hot['Transfer Accounts']), columns=mlb.classes_, index=data2_one_hot.index)], axis=1)
data2_one_hot = data2_one_hot.drop(['Money Sources', 'Transfer Accounts'], axis=1)

# Combine datasets
X_combined = pd.concat([data1.drop('fraud_indicator', axis=1), data2_one_hot[['Total Credit Amount', 'Transaction Amount']] + data2_one_hot.drop(['Total Credit Amount', 'Transaction Amount', 'Fraud Indicator'], axis=1)], axis=1)
y_combined = pd.concat([data1['fraud_indicator'], data2_one_hot['Fraud Indicator']], ignore_index=True)[:X_combined.shape[0]]

# Save column names before SimpleImputer transformation
column_names = X_combined.columns.tolist()

# Split the combined data into training and testing sets
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42
)

# Use SimpleImputer to handle missing values by filling NaNs with the mean
imputer = SimpleImputer(strategy='mean')
X_train_combined = imputer.fit_transform(X_train_combined)
X_test_combined = imputer.transform(X_test_combined)

# Standardize the features
scaler_combined = StandardScaler()
X_train_combined = scaler_combined.fit_transform(X_train_combined)
X_test_combined = scaler_combined.transform(X_test_combined)

# Choose a model (Random Forest)
model_combined = RandomForestClassifier(n_estimators=2000, random_state=42, verbose=1)

# Train the combined model
model_combined.fit(X_train_combined, y_train_combined)

# Make predictions on the test set
y_pred_combined = model_combined.predict(X_test_combined)

# Display the parameters of the trained RandomForestClassifier
print("Trained RandomForestClassifier Parameters:")
print(model_combined.get_params())

# Display confusion matrix and classification report
conf_matrix = confusion_matrix(y_test_combined, y_pred_combined)
classification_rep = classification_report(y_test_combined, y_pred_combined)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)



[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    5.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:   12.2s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:   21.8s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:   33.9s
[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:   49.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    0.7s


Trained RandomForestClassifier Parameters:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 2000, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 1, 'warm_start': False}
Confusion Matrix:
[[2562    0]
 [ 118 1320]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2562
           1       1.00      0.92      0.96      1438

    accuracy                           0.97      4000
   macro avg       0.98      0.96      0.97      4000
weighted avg       0.97      0.97      0.97      4000



[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:    1.1s


In [8]:

# Save the combined model and features
combined_model_filename = '/kaggle/working/random_forest_fraud_pred.pkl'
joblib.dump({
    'label_encoder': label_encoder,
    'scaler': scaler_combined,
    'model': model_combined,
    'features': column_names  # Save the features used for training
}, combined_model_filename)
print(f'Combined model and features saved as {combined_model_filename}')

# Load the saved model
loaded_model = joblib.load('/kaggle/working/random_forest_fraud_pred.pkl')

# Display the features on which the model has been trained
print("Features used for training:")
print(loaded_model['features'])

Combined model and features saved as /kaggle/working/random_forest_fraud_pred.pkl
Features used for training:
['age', 'kyc_status', 'days_since_kyc_incomplete', 'transaction_amount', 'home_branch', 'transaction_location', 'transaction_method', 'transaction_category', 'transaction_merchant', 'transaction_time', 'average_expenditure', 'comparison_with_avg_expenditure', 'transaction_count_7_days', 'suspicion_indicator', ' ', ' ', "'", "'", ',', ',', '0', '0', '1', '1', '2', '2', '3', '3', '4', '4', '5', '5', '6', '6', '7', '7', '8', '8', '9', '9', 'Education Level_High School', 'Education Level_Master', 'Education Level_PhD', 'Employment Status_Student', 'Employment Status_Unemployed', 'Linked Accounts', 'Marital Status_Married', 'Marital Status_Single', 'Name', 'Total Credit Amount', 'Transaction Amount', 'User ID', '[', '[', ']', ']']


'/kaggle/working'

Internet connection is available.


In [11]:
# Install Streamlit
!pip install streamlit

# Load necessary libraries
import streamlit as st
import joblib

# Load your machine learning model
model = joblib.load('/kaggle/working/random_forest_fraud_pred.pkl')

# Create the Streamlit app
st.title('Rajasthan Hackhathon')

# Add user input components
user_input = st.text_input('Enter some text:')
prediction_button = st.button('Get Prediction')

# Make predictions
if prediction_button:
    prediction = model.predict([user_input])[0]
    st.write('Prediction:', prediction)





