In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import lightgbm as lgb
import cv2
import pytesseract


# Load the raw policy records; the path is resolved against the working directory.
File_path = 'Fraud_data.csv'
data = pd.read_csv(File_path)

# Discard every row that has at least one missing value before parsing numbers.
data.dropna(inplace=True)


def _parse_amount(column):
    """Turn a text column of amounts (thousands separated by commas) into floats."""
    return column.str.replace(',', '').str.strip().astype(float)


data['Premium'] = _parse_amount(data['Premium'])
data['POLICY SUMASSURED'] = _parse_amount(data['POLICY SUMASSURED'])

data['Premium_to_SumAssured_Ratio'] = data['Premium'] / data['POLICY SUMASSURED']

# A zero sum assured makes the ratio +/-inf (or NaN for 0/0). Map the
# infinities to NaN so that a single dropna removes all such rows.
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

# No fillna is needed at this point: the dropna directly above has already
# removed every row that contained a NaN, so there is nothing left to fill.

print(data['Premium_to_SumAssured_Ratio'])

def check_risk(ratio, high_ratio_threshold=0.2, low_ratio_threshold=0.01):
    """Classify a premium-to-sum-assured ratio into a risk label.

    Args:
        ratio: The ratio to classify.
        high_ratio_threshold: Ratios strictly above this are 'High Risk'.
        low_ratio_threshold: Ratios strictly below this are 'Low Risk'.

    Returns:
        'High Risk', 'Low Risk' or 'Normal Risk'. A ratio equal to either
        threshold is 'Normal Risk'. The high check is evaluated first.
    """
    if ratio > high_ratio_threshold:
        return 'High Risk'
    if ratio < low_ratio_threshold:
        return 'Low Risk'
    return 'Normal Risk'

# Label every policy by its premium-to-sum-assured ratio and keep the label
# on the frame as well as printing it.
risk_categories = data['Premium_to_SumAssured_Ratio'].apply(check_risk)
data['Risk Category'] = risk_categories
print(risk_categories)

# Encode the 'Fraud Category' column as integer class ids for the classifiers.
label_encoder = LabelEncoder()
data['Fraud Category Encoded'] = label_encoder.fit_transform(data['Fraud Category'])

features = ['Premium', 'POLICY SUMASSURED', 'Premium_to_SumAssured_Ratio']
X, y = data[features], data['Fraud Category Encoded']

# Hold out 20% of the rows with a fixed seed.
# NOTE: X_val / y_val are not used anywhere in the visible code.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Build the three classifiers first, then fit them on the same training split
# in the order: random forest, XGBoost, LightGBM.
rf = RandomForestClassifier(class_weight='balanced', random_state=0, n_estimators=100)
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss')
lgb_model = lgb.LGBMClassifier()

rf.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)

def predict_and_warn(model, new_data, *, threshold=0.8):
    """Predict classes for new_data and build one message per row.

    Args:
        model: Object exposing predict_proba(new_data) and predict(new_data).
        new_data: Rows to score; passed unchanged to both model methods.
        threshold: A row gets the warning message when its largest class
            probability is strictly greater than this value. Defaults to
            0.8, the cut-off that was previously hard-coded.

    Returns:
        A tuple (predictions, warnings): predictions is whatever
        model.predict returns; warnings is a list of strings, one per row
        of predict_proba output, numbered from 1 within this call.

    NOTE(review): the check uses the maximum probability over all classes,
    i.e. the model's confidence in whichever class it favours, not the
    probability of one specific "fraud" class - confirm this is intended.
    """
    probas = model.predict_proba(new_data)
    predictions = model.predict(new_data)

    warnings = [
        f"Warning: High probability of fraud detected for customer {position}!"
        if max(proba) > threshold
        else f"Customer {position} is safe."
        for position, proba in enumerate(probas, start=1)
    ]
    return predictions, warnings


# Two hand-written sample policies, one record per customer, with the same
# three feature columns the models were trained on.
new_data = pd.DataFrame([
    {'Premium': 5000, 'POLICY SUMASSURED': 50000, 'Premium_to_SumAssured_Ratio': 0.1},
    {'Premium': 10000, 'POLICY SUMASSURED': 100000, 'Premium_to_SumAssured_Ratio': 0.1},
])

# Score the samples with each fitted model and print its messages under a
# heading (heading and list go on separate lines).
rf_predictions, rf_warnings = predict_and_warn(rf, new_data)
print("Random Forest Warnings:", rf_warnings, sep="\n")

xgb_predictions, xgb_warnings = predict_and_warn(xgb_model, new_data)
print("XGBoost Warnings:", xgb_warnings, sep="\n")

lgb_predictions, lgb_warnings = predict_and_warn(lgb_model, new_data)
print("LightGBM Warnings:", lgb_warnings, sep="\n")

def authenticate_document(image_path):
    """Report whether OCR finds the word "AUTHENTIC" in an image.

    The image is read with OpenCV, converted to grayscale and passed to
    Tesseract. The match is a case-sensitive substring test on the OCR text.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        A status string: an error message when the image cannot be loaded,
        otherwise "Document is authentic" or "Document is not authentic".
    """
    image = cv2.imread(image_path)
    # cv2.imread signals a missing or unreadable file by returning None.
    if image is None:
        return "Error: Image file not found or could not be loaded."

    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    extracted_text = pytesseract.image_to_string(grayscale)

    is_authentic = "AUTHENTIC" in extracted_text
    return "Document is authentic" if is_authentic else "Document is not authentic"

# Run the OCR-based check on one image and print the returned status string.
# NOTE(review): 'path_to_document_image.jpg' looks like a placeholder path - confirm.
document_status = authenticate_document('path_to_document_image.jpg')
print(document_status)

def real_time_fraud_detection(model, stream_data):
    """Score the items of stream_data one at a time and print the messages.

    Each item is wrapped in a one-row DataFrame and passed to
    predict_and_warn; the resulting list of messages is printed. Because
    every call sees exactly one row, the customer number in the message is
    always 1.

    Args:
        model: Fitted model forwarded to predict_and_warn.
        stream_data: Iterable of records (e.g. dicts keyed by feature name).
    """
    for record in stream_data:
        single_row = pd.DataFrame([record])
        _, messages = predict_and_warn(model, single_row)
        print(messages)

# Two sample records fed through the per-record detection loop using the
# random forest model; both have a premium-to-sum-assured ratio of 0.1.
stream_data = [
    {
        'Premium': premium,
        'POLICY SUMASSURED': sum_assured,
        'Premium_to_SumAssured_Ratio': 0.1,
    }
    for premium, sum_assured in ((5000, 50000), (10000, 100000))
]

real_time_fraud_detection(rf, stream_data)


7       0.100000
8       0.100000
9       0.100000
10      0.100000
11      0.100000
          ...   
1314    0.142857
1315    0.142857
1316    0.142857
1317    0.050000
1318    0.050000
Name: Premium_to_SumAssured_Ratio, Length: 742, dtype: float64
7       Normal Risk
8       Normal Risk
9       Normal Risk
10      Normal Risk
11      Normal Risk
           ...     
1314    Normal Risk
1315    Normal Risk
1316    Normal Risk
1317    Normal Risk
1318    Normal Risk
Name: Premium_to_SumAssured_Ratio, Length: 742, dtype: object
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 224
[LightGBM] [Info] Number of data points in the train set: 593, number of used features: 3
[LightGBM] [Info] Start training from score -5.692047
[LightGBM] [Info] Start training from score -1.966354
[LightGBM] [Info] Start training from score -6.385194
[LightGBM] [Info] Sta