In [1]:
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json


## Created a producer script to send transaction data.

In [5]:
producer = KafkaProducer(bootstrap_servers='localhost:9092', 
                         value_serializer=lambda v: json.dumps(v).encode('utf-8'))

# Example transaction data
transaction_data = {
    "transaction_id": "12345",
    "user_id": "67890",
    "transaction_amount": 250.75,
    "timestamp": "2024-09-10 14:32:55",
    "location": "New York, USA",
    "device_id": "abc123"
}

producer.send('fraud_detection', value=transaction_data)
producer.flush()

## Create a consumer to read the transaction data in real-time:

In [10]:
from kafka import KafkaConsumer
import json

consumer = KafkaConsumer('fraud_detection', 
                         bootstrap_servers=['localhost:9092'],
                         value_deserializer=lambda x: json.loads(x.decode('utf-8')))

## Create features like time-based and location-based features in real-time.

In [18]:
import pandas as pd

# Define your data dictionary
data_dict = {
    'timestamp': ['2024-09-01 10:00:00', '2024-09-01 10:05:00'],
    'transaction_amount': [100, 150],
    'location_change': [0, 1],
    'is_fraud': [0, 1]
}

# Create a DataFrame from the data_dict
df = pd.DataFrame(data_dict)

# Convert 'timestamp' column to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Compute the time difference
df['time_since_last'] = df['timestamp'].diff().dt.total_seconds()

print(df)

            timestamp  transaction_amount  location_change  is_fraud  \
0 2024-09-01 10:00:00                 100                0         0   
1 2024-09-01 10:05:00                 150                1         1   

   time_since_last  
0              NaN  
1            300.0  


In [22]:
# Check the type and content of df
print(type(df))  # Should print: <class 'pandas.core.frame.DataFrame'>
print(df.head())  # Prints the first few rows of the DataFrame

<class 'pandas.core.frame.DataFrame'>
            timestamp  transaction_amount  location_change  is_fraud  \
0 2024-09-01 10:00:00                 100                0         0   
1 2024-09-01 10:05:00                 150                1         1   

   time_since_last  
0              NaN  
1            300.0  


In [25]:
print(df.columns)

Index(['timestamp', 'transaction_amount', 'location_change', 'is_fraud',
       'time_since_last'],
      dtype='object')


In [27]:
X = df.drop(columns='is_fraud')  # Features
y = df['is_fraud']  # Target variable

In [29]:
print(df.head())  # Check the first few rows of df

            timestamp  transaction_amount  location_change  is_fraud  \
0 2024-09-01 10:00:00                 100                0         0   
1 2024-09-01 10:05:00                 150                1         1   

   time_since_last  
0              NaN  
1            300.0  


In [7]:
import pandas as pd

# Define your data dictionary or load from a file
data_dict = {
    'timestamp': ['2024-09-01 10:00:00', '2024-09-01 10:05:00'],
    'transaction_amount': [100, 150],
    'time_since_last': [0, 300],  # Ensure this column exists
    'location_change': [0, 1],    # Ensure this column exists
    'is_fraud': [0, 1]
}

# Create the DataFrame
df = pd.DataFrame(data_dict)

# Convert 'timestamp' column to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [9]:
from sklearn.model_selection import train_test_split

# Define features and target
X = df.drop(columns='is_fraud')  # Features
y = df['is_fraud']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (1, 4)
X_test shape: (1, 4)
y_train shape: (1,)
y_test shape: (1,)


In [11]:
print("Training set class distribution:")
print(y_train.value_counts())

print("Test set class distribution:")
print(y_test.value_counts())

Training set class distribution:
is_fraud
0    1
Name: count, dtype: int64
Test set class distribution:
is_fraud
1    1
Name: count, dtype: int64


## Develop a machine learning model to classify transactions as fraudulent or legitimate.

In [13]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(class_weight='balanced')  # Automatically adjusts weights inversely proportional to class frequencies

In [15]:
print("Class distribution in target variable:")
print(y.value_counts())

Class distribution in target variable:
is_fraud
0    1
1    1
Name: count, dtype: int64


In [17]:
# Remove classes with only 1 sample
min_class_count = y.value_counts().min()
if min_class_count < 2:
    y = y[y.map(y.value_counts()) > 1]
    X = X[y.index]

In [19]:
print("Shape of X:", X.shape)
print("Length of y:", len(y))

Shape of X: (2, 0)
Length of y: 0


In [21]:
print("Data shape:", df.shape)
print("Columns in data:", df.columns)

Data shape: (2, 5)
Columns in data: Index(['timestamp', 'transaction_amount', 'time_since_last', 'location_change',
       'is_fraud'],
      dtype='object')


In [23]:
print(df.head())

            timestamp  transaction_amount  time_since_last  location_change  \
0 2024-09-01 10:00:00                 100                0                0   
1 2024-09-01 10:05:00                 150              300                1   

   is_fraud  
0         0  
1         1  


In [25]:
print("Missing values in X:", X.isnull().sum())
print("Missing values in y:", y.isnull().sum())

Missing values in X: Series([], dtype: float64)
Missing values in y: 0


In [27]:
# Example of ensuring alignment
valid_indices = X.index.intersection(y.index)
X = X.loc[valid_indices]
y = y.loc[valid_indices]

In [30]:
print(df.columns)

Index(['timestamp', 'transaction_amount', 'time_since_last', 'location_change',
       'is_fraud'],
      dtype='object')


## Implement real-time classification in the Kafka pipeline.

In [32]:
data_dict = {
    'timestamp': ['2024-09-01 10:00:00', '2024-09-01 10:05:00'],
    'transaction_amount': [100, 150],
    'location_change': [0, 1],  # Make sure this column exists
    'is_fraud': [0, 1]
}

df = pd.DataFrame(data_dict)

In [34]:
print(df.head())

             timestamp  transaction_amount  location_change  is_fraud
0  2024-09-01 10:00:00                 100                0         0
1  2024-09-01 10:05:00                 150                1         1


In [None]:
from kafka import KafkaConsumer

# Replace with your Kafka broker address and topic
broker_address = 'localhost:9092'
topic_name = 'your_topic'

try:
    consumer = KafkaConsumer(
        topic_name,
        bootstrap_servers=broker_address,
        auto_offset_reset='earliest',
        group_id='your_group_id'
    )
    
    print("Kafka consumer connected successfully.")
    
    for message in consumer:
        print(f"Received message: {message.value}")
        
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    if 'consumer' in locals():
        consumer.close()

Kafka consumer connected successfully.


In [5]:
import logging
logging.basicConfig(level=logging.DEBUG)

## Regularly retrain the model with new data to address evolving fraud patterns.

In [1]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Create a sample dataset (replace this with your actual dataset)
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, weights=[0.9, 0.1], random_state=42)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print(f'Model accuracy: {accuracy}')

Model accuracy: 0.9637883008356546


## Precision,Recall,F1 Score

In [3]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Make predictions using the test set
y_pred = model.predict(X_test)

# Calculate precision, recall, and f1 scores
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the scores
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Precision: 0.9887640449438202
Recall: 0.9411764705882353
F1 Score: 0.9643835616438357


In [3]:
from kafka import KafkaProducer
import json
from datetime import datetime
import random

producer = KafkaProducer(bootstrap_servers='localhost:9092',
                         value_serializer=lambda v: json.dumps(v).encode('utf-8'))

# Simulate transactions
for _ in range(10):  # Send 10 transactions
    transaction = {
        "transaction_id": str(random.randint(10000, 99999)),
        "user_id": str(random.randint(1000, 9999)),
        "transaction_amount": round(random.uniform(10.0, 500.0), 2),
        "timestamp": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        "location": "New York, USA"  # Example location
    }
    producer.send('transactions', transaction)