### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
pip install torch torchvision

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from torch.utils.data import TensorDataset
import os
from sklearn.utils import shuffle

In [1]:
# This part of the code loads the data, preprocesses it, and creates a clean version of the data based on our defined rules.
# The first rule flips labels from 1 to 0 for those data instances where the loan is not paid off and the ratio of number_of_missed_payments to number_of_repayments is greater than or equal to 2.
# The second rule flips labels from 0 to 1 for those data instances where the CreditScore is greater than or equal to 4, DefaultedLoansWithin90Days is 0, and last_defaults_12_months is 0.

columns_to_remove = [
    'disbursedon_date', 'principal_amount', 'Repayment_Period',
    'number_of_repayments', 'principal_outstanding', 'total_expected_repayment',
    'total_outstanding', 'loan_number', 'missed_payments', 'defaults_12_months', 'pay_off'
]
categorical_cols = ['Gender', 'state', 'HasDeliquentCreditInHistory', 'last_HasDeliquentCreditInHistory', 'last_pay_off']

def load_data(csv_file_path):
    return pd.read_csv(csv_file_path)

def preprocess_data(df, categorical_cols):
    full_features = df.drop(['decision'] + columns_to_remove, errors='ignore', axis=1)
    full_features = pd.get_dummies(full_features, columns=categorical_cols, drop_first=True)
    dummy_column_names = full_features.columns.tolist()
    scaler = StandardScaler()
    numeric_cols = full_features.select_dtypes(include=['number']).columns.tolist()
    full_features[numeric_cols] = scaler.fit_transform(full_features[numeric_cols])
    return full_features, df['decision'], dummy_column_names

def save_data(features, labels, filename, changed_indices=None):
    features_tensor = torch.tensor(features.values, dtype=torch.float32)
    labels_tensor = torch.tensor(labels.values, dtype=torch.long)
    dataset = TensorDataset(features_tensor, labels_tensor)
    torch.save(dataset, filename)
    print(f"Data saved to {filename}: Features {features_tensor.shape}, Labels {labels_tensor.shape}")
    if changed_indices is not None:
        np.save(filename.replace('.pth', '_changed_indices.npy'), changed_indices)
        print(f"Changed indices saved to {filename.replace('.pth', '_changed_indices.npy')}")

filename = 'Existing customer data with missing value handling using clustering-Apr18.csv'
df = load_data(filename)
condition1_indices = df[(df['decision'] == 1) & (df['pay_off'] == -1) & ((df['missed_payments'] / df['number_of_repayments']) >= 2)].index
condition2_indices = df[(df['decision'] == 0) & (df['CreditScore'] >= 4) & (df['DefaultedLoansWithin90Days'] == 0) & (df['last_defaults_12_months'] == 0)].index

df_clean = df.copy()
changed_indices = condition1_indices.union(condition2_indices)
df_clean.loc[changed_indices, 'decision'] = 1 - df_clean.loc[changed_indices, 'decision']
changed_labels_count = (df['decision'] != df_clean['decision']).sum()
print(f"Number of changed labels: {changed_labels_count}")
features, labels_noisy, dummy_column_names = preprocess_data(df, categorical_cols)
features_clean, labels_clean, _ = preprocess_data(df_clean, categorical_cols)
save_data(features, labels_noisy, 'D_noisy.pth')
save_data(features_clean, labels_clean, 'D_clean.pth', changed_indices=np.array(list(changed_indices)))

print("Data preprocessing complete. Tensors saved.")

Number of changed labels: 1170
Data saved to D_noisy.pth: Features torch.Size([5516, 70]), Labels torch.Size([5516])
Data saved to D_clean.pth: Features torch.Size([5516, 70]), Labels torch.Size([5516])
Changed indices saved to D_clean_changed_indices.npy
Data preprocessing complete. Tensors saved.


In [2]:
# This section of the code splits the main clean and noisy datasets to create the required subsets of data. It creates D_train (used for the primary NDCC algorithm) and D_pre (used for pretraining a model and selecting thresholds) and D_validation for validating the accuracy of the trained model.

def load_dataset(file_path):
    dataset = torch.load(file_path)
    features = dataset.tensors[0].numpy()
    labels = dataset.tensors[1].numpy()
    df = pd.DataFrame(features, columns=[f'feature_{i}' for i in range(features.shape[1])])
    df['decision'] = labels
    return df

def select_indices(df, label_0_count, label_1_count, name):
    df_shuffled = shuffle(df, random_state=42)
    decision_0 = df_shuffled[df_shuffled['decision'] == 0]
    decision_1 = df_shuffled[df_shuffled['decision'] == 1]
    if len(decision_0) < label_0_count or len(decision_1) < label_1_count:
        raise ValueError(f"Not enough data points in one or both decision categories for {name}.")
    indices_0 = decision_0.sample(n=label_0_count, random_state=42).index
    indices_1 = decision_1.sample(n=label_1_count, random_state=42).index
    selected_indices = indices_0.union(indices_1)
    # Save indices
    selected_indices.to_series().reset_index(drop=True).to_csv(f'{name}_indices.csv', index=False)
    print(f"Indices saved to {name}_indices.csv")
    return selected_indices

def create_subset(df_noisy, df_clean, indices, noisy_dataset_name, clean_dataset_name):
    subset_noisy = df_noisy.loc[indices]
    subset_clean = df_clean.loc[indices]
    save_data(subset_noisy, f'{noisy_dataset_name}.pth')
    save_data(subset_clean, f'{clean_dataset_name}.pth')

def save_data(df, filename):
    features = df.iloc[:, :-1].values  # Assuming last column is 'decision'
    labels = df['decision'].values
    dataset = TensorDataset(torch.tensor(features, dtype=torch.float32), torch.tensor(labels, dtype=torch.long))
    torch.save(dataset, filename)
    df.to_csv(filename.replace('.pth', '.csv'), index=False)
    print(f"Data saved to {filename} and {filename.replace('.pth', '.csv')}")

df_noisy = load_dataset('D_noisy.pth')
df_clean = load_dataset('D_clean.pth')
validation_indices = select_indices(df_noisy, 184, 698, 'D_validation')
create_subset(df_noisy, df_clean, validation_indices, 'D_validation_noisy', 'D_validation_clean')

df_noisy = df_noisy.drop(validation_indices)
df_clean = df_clean.drop(validation_indices)

train_indices = select_indices(df_noisy, 739, 2795, 'D_train')
create_subset(df_noisy, df_clean, train_indices, 'D_train_noisy', 'D_train_clean')

df_noisy = df_noisy.drop(train_indices)
df_clean = df_clean.drop(train_indices)

pre_indices = df_noisy.index
pre_indices.to_series().reset_index(drop=True).to_csv('Dpre_first_indices.csv', index=False)
create_subset(df_noisy, df_clean, pre_indices, 'Dpre_first_noisy', 'Dpre_clean')

print("Data processing and saving complete.")

Indices saved to D_validation_indices.csv
Data saved to D_validation_noisy.pth and D_validation_noisy.csv
Data saved to D_validation_clean.pth and D_validation_clean.csv
Indices saved to D_train_indices.csv
Data saved to D_train_noisy.pth and D_train_noisy.csv
Data saved to D_train_clean.pth and D_train_clean.csv
Data saved to Dpre_first_noisy.pth and Dpre_first_noisy.csv
Data saved to Dpre_clean.pth and Dpre_clean.csv
Data processing and saving complete.


In [4]:
# This section of the code restores 10% of the labels changed during data cleaning to create a noisy version of D_pre, which requires a noise level of 10%.

def load_dataset(filename):
    dataset = torch.load(filename)
    features = dataset.tensors[0].numpy()
    labels = dataset.tensors[1].numpy()
    return pd.DataFrame(features), pd.Series(labels)

def save_data(df, labels, filename):
    dataset = TensorDataset(torch.tensor(df.values, dtype=torch.float32), torch.tensor(labels.values, dtype=torch.long))
    torch.save(dataset, filename)
    combined = df.copy()
    combined['decision'] = labels
    combined = shuffle(combined)  # Shuffle to avoid grouping by label
    combined.to_csv(filename.replace('.pth', '.csv'), index=False)
    print(f"Data saved to {filename} and {filename.replace('.pth', '.csv')}")

df_clean_features, labels_clean = load_dataset('Dpre_clean.pth')
df_noisy_features, labels_noisy = load_dataset('Dpre_first_noisy.pth')
differing_indices = labels_clean[labels_clean != labels_noisy].index

num_to_flip = int(0.1 * len(labels_clean))
indices_to_flip = np.random.choice(differing_indices, size=num_to_flip, replace=False)

final_labels = labels_clean.copy()
final_labels.iloc[indices_to_flip] = 1 - final_labels.iloc[indices_to_flip]

save_data(df_clean_features, final_labels, 'Dpre_noisy.pth')

pd.Series(indices_to_flip).to_csv('Dpre_flipped_indices.csv', index=False)
print(f"Flipped indices saved to Dpre_flipped_indices.csv")

Data saved to Dpre_noisy.pth and Dpre_noisy.csv
Flipped indices saved to Dpre_flipped_indices.csv


In [11]:
#Creates csv version of the main noisy dataset.

dataset = torch.load('D_noisy.pth')
all_features = []
all_labels = [] 

for data_tensor, label_tensor in dataset:
    all_features.append(data_tensor.numpy())
    all_labels.append(label_tensor.numpy())  

features_array = np.vstack(all_features) 
labels_array = np.vstack(all_labels)  
df_features = pd.DataFrame(features_array)
df_features.to_csv('D_noisy.csv', index=False)

In [3]:
# This part of the code creates a validation dataset containing the original features.

df = pd.read_csv('Existing customer data with original credit score.csv')
df.fillna(df.mean(), inplace=True)
indices_df = pd.read_csv('D_validation_indices.csv', header=None)
validation_indices = indices_df[0].tolist()

validation_data = df.iloc[validation_indices]
validation_data.to_csv('D_validation_original.csv', index=False)

  df.fillna(df.mean(), inplace=True)
