<a href="https://colab.research.google.com/github/Subashganesan00/INSAID-Assigment/blob/main/INSAID_TASK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import multiprocessing

# Step 1: Data Cleaning
data = pd.read_csv('fraud_dataset.csv')

# Missing values: discard every row that contains at least one NaN.
data = data.dropna()

# Outliers: keep only rows whose 'amount' lies inside the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; both fence values are kept (inclusive).
# NOTE(review): in a fraud dataset the largest amounts may be exactly the
# fraudulent rows, so this trim could remove positives -- confirm on the data.
amount_col = data['amount']
Q1, Q3 = amount_col.quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
data = data[amount_col.between(lower_fence, upper_fence)]

# Multicollinearity (optional, not implemented here):
# a correlation matrix or variance inflation factor (VIF) analysis could be used to find and drop highly correlated features.

# Step 2: Fraud Detection Model
selected_features = ['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
target = 'isFraud'

# Keep only the modelling columns and one-hot encode the categorical ones
# (presumably just 'type'; numeric columns pass through get_dummies unchanged).
data = pd.get_dummies(data[selected_features + [target]])

# Split the data into train and test sets.
# Stratifying on the target keeps the fraud / non-fraud ratio the same in both
# partitions, so the test metrics are not distorted by an unlucky draw of the
# (presumably rare) positive class.
X = data.drop(target, axis=1)
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create and train the model.
# n_estimators is the number of trees in the forest and is independent of the
# hardware; parallelism is requested separately with n_jobs=-1 (all cores).
# A fixed random_state makes the fitted forest, and therefore the reported
# metrics and feature importances, reproducible between runs.
n_estimators = 100
model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

# Step 3: Variable Selection
# Rank the columns the model was trained on by their impurity-based importance
# and keep the k highest-scoring ones.
feature_importances = model.feature_importances_
k = 5  # Number of top features to select
# np.argsort sorts ascending, so reverse it before slicing: index 0 is then the
# most important feature and the list reads from strongest to weakest.
top_features = np.argsort(feature_importances)[::-1][:k]
# X_train holds exactly the columns passed to model.fit, in the same order as
# feature_importances.
top_feature_names = X_train.columns[top_features]

# Step 4: Model Performance
# Score the held-out test set with each metric, in the order they are reported.
y_pred = model.predict(X_test)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Model Performance:")
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
):
    print(label, value)

# Step 5: Key Factors for Fraud Prediction
# One selected feature name per line, in the order they were selected.
print("Key Factors for Fraud Prediction:")
for feature_name in top_feature_names:
    print(feature_name)

# Step 6: Interpretation of Factors
# Perform additional analysis and interpretation of the selected features and their relationship with fraud detection.
# For example, inspect the random forest's feature importances (a tree ensemble exposes no coefficients), explore each feature's relationship with the target variable, and draw insights.

# Step 7: Prevention Strategies during Infrastructure Update
# Implement the necessary prevention strategies as mentioned in the solution description
# This could involve enhancing security measures, implementing real-time monitoring systems, improving authentication protocols, and conducting regular security audits.

# Step 8: Evaluating the Effectiveness of Prevention Actions
# Continuously monitor relevant metrics, such as the number of detected fraudulent transactions and false positive/negative rates, to assess the effectiveness of the prevention actions
# Analyze the changes in these metrics over time and compare them with the baseline to determine if the implemented prevention strategies are effective.

