# Risk Analysis Project: Predictive Modeling for Online Purchase Order Classification

**Author:** Ryhan Sunny

## Data Preprocessing and Classification 

In [2]:
# All imported libraries here:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from datetime import datetime
import joblib

### Function to Calculate Age from Birthdate
This function calculates age from a birthdate string and handles exceptions for invalid or missing values.

In [3]:
def calculate_age(birthdate):
    try:
        birth_date = datetime.strptime(birthdate, '%m/%d/%Y')
        current_date = datetime.now()
        age = current_date.year - birth_date.year - ((current_date.month, current_date.day) < (birth_date.month, birth_date.day))
        return age
    except ValueError:
        return np.nan  # Return NaN for invalid or missing values

###  Function to Preprocess Data
This function performs data preprocessing, including handling missing values, one-hot encoding for categorical variables, label encoding, age calculation, and other data transformations

In [4]:
def preprocess_data(df, is_train=True):
    # Handle missing values
    df.fillna(method="ffill", inplace=True)
    
    # One-hot encoding for categorical variables
    categorical_columns = ["Z_METHODE", "Z_CARD_ART", "Z_LAST_NAME", "WEEKDAY_ORDER"]
    df = pd.get_dummies(df, columns=categorical_columns)
    
    # Replace '?' with NaN and convert 'TIME_ORDER' to minutes past midnight
    df['TIME_ORDER'] = df['TIME_ORDER'].replace('?', np.nan)
    df['TIME_ORDER'] = pd.to_datetime(df['TIME_ORDER'], errors='coerce').dt.hour * 60 + pd.to_datetime(df['TIME_ORDER'], errors='coerce').dt.minute
    mean_time = df['TIME_ORDER'].mean()
    df['TIME_ORDER'].fillna(mean_time, inplace=True)
    
    # Label encoding
    label_encoder = LabelEncoder()
    encoded_columns = ['B_EMAIL', 'B_TELEFON', 'FLAG_NEWSLETTER', 'CHK_LADR', 'CHK_RADR', 'CHK_KTO',
                       'CHK_CARD', 'CHK_COOKIE', 'CHK_IP', 'FAIL_LPLZ', 'FAIL_LORT', 'FAIL_LPLZORTMATCH',
                       'FAIL_RPLZ', 'FAIL_RORT', 'FAIL_RPLZORTMATCH', 'NEUKUNDE', 'FLAG_LRIDENTISCH']
    for column in encoded_columns:
        df[column] = label_encoder.fit_transform(df[column].astype(str))  # Cast to string to avoid issues with mixed types
    
    # Convert 'B_BIRTHDATE' to 'AGE' and drop 'B_BIRTHDATE'
    df['AGE'] = df['B_BIRTHDATE'].apply(calculate_age)
    df.drop(columns=['B_BIRTHDATE'], inplace=True)
    
    # Replace "?" with 0 in specific columns
    columns_to_replace_question_mark = [
        'ANUMMER_02', 'ANUMMER_03', 'ANUMMER_04', 'ANUMMER_05',
        'ANUMMER_06', 'ANUMMER_07', 'ANUMMER_08', 'ANUMMER_09',
        'ANUMMER_10', 'DATE_LORDER', 'MAHN_AKT', 'MAHN_HOECHST'
    ]
    df[columns_to_replace_question_mark] = df[columns_to_replace_question_mark].replace('?', 0)
    
    # Convert 'DATE_LORDER' to epoch time
    df['DATE_LORDER'] = pd.to_datetime(df['DATE_LORDER']).values.astype(np.int64) // 10 ** 9

    # Split the data into features and target variable if it's the training data
    if is_train:
        X = df.drop("CLASS", axis=1)
        y = df["CLASS"]
        return X, y
    else:
        X = df.drop("CLASS", axis=1, errors='ignore')  # 'errors' param to ignore if 'CLASS' is not present
        return X


### Preprocess the Training Data
Reads the training data from a file, preprocesses it using the preprocess_data function, and separates it into features (X) and the target variable (y).

In [5]:
data = pd.read_csv("risk-train.txt", sep='\t')
X, y = preprocess_data(data)


### Split the Data into Training and Validation Sets
Splits the data into training and validation sets for model training and evaluation.

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


 ### Create a Data Processing Pipeline
 Defines a data processing pipeline that includes scaling and imputation.

In [7]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy="mean"))
])


### Apply Transformations to Training and Validation Sets
Applies transformations, such as scaling and imputation, to both the training and validation sets.

In [8]:
X_train = pipeline.fit_transform(X_train)
X_val = pipeline.transform(X_val)


## Model Training and Evaluation

### Train the RandomForestClassifier
    Initializes and trains a Random Forest Classifier on the preprocessed training data.

In [9]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


### Validate the Model
Validates the model using the validation set, calculates and prints the confusion matrix and classification report.

In [10]:
y_pred = clf.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))


[[5669    0]
 [ 330    1]]
              precision    recall  f1-score   support

          no       0.94      1.00      0.97      5669
         yes       1.00      0.00      0.01       331

    accuracy                           0.94      6000
   macro avg       0.97      0.50      0.49      6000
weighted avg       0.95      0.94      0.92      6000



### Calculate the Accuracy of the Model
    Calculates and prints the accuracy of the model on the validation data.

In [11]:
accuracy = (y_pred == y_val).sum() / len(y_val)
print(f"Model Accuracy on Validation Data: {accuracy * 100:.2f}%")


Model Accuracy on Validation Data: 94.50%


### Display the Confusion Matrix
    Displays the confusion matrix, which provides insights into the model's performance.

In [12]:
cm = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[5669    0]
 [ 330    1]]


### Print the Classification Report

In [13]:
print("Classification Report:")
print(classification_report(y_val, y_pred))


Classification Report:
              precision    recall  f1-score   support

          no       0.94      1.00      0.97      5669
         yes       1.00      0.00      0.01       331

    accuracy                           0.94      6000
   macro avg       0.97      0.50      0.49      6000
weighted avg       0.95      0.94      0.92      6000



## Cost Calculation and Model Saving

###  Calculate Misclassification Costs

In [14]:
# Calculate misclassification costs
cost_matrix = np.array([[0, 50], [5, 0]])
misclassification_cost = 0

for true_class, pred_class in zip(y_val, y_pred):
    if true_class != pred_class:
        if true_class == 'High risk':
            misclassification_cost += cost_matrix[0][1]
        else:
            misclassification_cost += cost_matrix[1][0]

print(f"Total Misclassification Cost: {misclassification_cost}")

Total Misclassification Cost: 1650


###  Save the Model and Pipeline for Future Use

In [15]:
joblib.dump(clf, "model.joblib")
joblib.dump(pipeline, "pipeline.joblib")


['pipeline.joblib']

## Test Data Prediction

### Load and Preprocess the Test Data

In [16]:
test_data = pd.read_csv("risk-test.txt", sep='\t')
X_test = preprocess_data(test_data, is_train=False)
print(test_data.columns)


Index(['ORDER_ID', 'B_EMAIL', 'B_TELEFON', 'B_BIRTHDATE', 'FLAG_LRIDENTISCH',
       'FLAG_NEWSLETTER', 'Z_METHODE', 'Z_CARD_ART', 'Z_CARD_VALID',
       'Z_LAST_NAME', 'VALUE_ORDER', 'WEEKDAY_ORDER', 'TIME_ORDER',
       'AMOUNT_ORDER', 'ANUMMER_01', 'ANUMMER_02', 'ANUMMER_03', 'ANUMMER_04',
       'ANUMMER_05', 'ANUMMER_06', 'ANUMMER_07', 'ANUMMER_08', 'ANUMMER_09',
       'ANUMMER_10', 'CHK_LADR', 'CHK_RADR', 'CHK_KTO', 'CHK_CARD',
       'CHK_COOKIE', 'CHK_IP', 'FAIL_LPLZ', 'FAIL_LORT', 'FAIL_LPLZORTMATCH',
       'FAIL_RPLZ', 'FAIL_RORT', 'FAIL_RPLZORTMATCH', 'SESSION_TIME',
       'NEUKUNDE', 'AMOUNT_ORDER_PRE', 'VALUE_ORDER_PRE', 'DATE_LORDER',
       'MAHN_AKT', 'MAHN_HOECHST'],
      dtype='object')


### Load the Saved Model and Pipeline

In [17]:
clf_loaded = joblib.load("model.joblib")
pipeline_loaded = joblib.load("pipeline.joblib")


### Apply Transformations to Test Data and Predict

In [18]:
X_test_transformed = pipeline_loaded.transform(X_test)
test_predictions = clf_loaded.predict(X_test_transformed)


### Save

In [19]:
# Create a DataFrame with 'ORDER-ID' and 'CLASS' columns
predictions_df = pd.DataFrame({'ORDER_ID': test_data['ORDER_ID'], 'CLASS': test_predictions})

# Convert 'CLASS' values to 'yes' and 'no' based on your classification
predictions_df['CLASS'] = np.where(predictions_df['CLASS'] == 'High risk', 'no', 'yes')

# Save the predictions to a text file with space separator
predictions_df.to_csv("test_predictions2.txt", sep=' ', index=False)