# Imports

In [1]:
import os
from io import BytesIO
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler
import pickle
from pymongo import MongoClient
from datetime import datetime

# Data Ingestion

In [3]:
# Location of the APS failure dataset files on the UCI archive; the three
# resources below differ only in their file name.
UCI_APS_BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00421"

DATA_DESC = f"{UCI_APS_BASE_URL}/aps_failure_description.txt"
DATA_TRAIN = f"{UCI_APS_BASE_URL}/aps_failure_training_set.csv"
DATA_TEST = f"{UCI_APS_BASE_URL}/aps_failure_test_set.csv"

In [4]:
def READ_DATA_FROM_FILE(FILE_NAME: str) -> BytesIO:
  """
    Fetch a resource and hand it back as an in-memory binary buffer.

    FILE_NAME: str: URL of the resource; it is passed straight to urlopen
    return: BytesIO holding the complete response body, positioned at the start
  """
  with urlopen(FILE_NAME) as response:
    payload = response.read()
  return BytesIO(payload)

In [5]:
# Skip the first 20 lines of the downloaded file and treat the literal 'na'
# as a missing value.
df_train = pd.read_csv(
  READ_DATA_FROM_FILE(DATA_TRAIN),
  na_values='na',
  skiprows=20,
)

In [6]:
# Skip the first 20 lines of the downloaded file and treat the literal 'na'
# as a missing value.
df_test = pd.read_csv(
  READ_DATA_FROM_FILE(DATA_TEST),
  na_values='na',
  skiprows=20,
)

# Exploratory Data Analysis (EDA)

In [7]:
def get_frame_stats(frame: pd.DataFrame, title = ""):
  """
    Print a short summary of a data frame to stdout.

    frame: pd.DataFrame: frame to summarise
    title: str: label appended to the "Statistics of frame" heading
    return: None
  """
  print(30 * "-")
  print("Statistics of frame "+ title)
  # Each statistic is computed lazily, right before its own line is printed.
  report = (
    ("Shape:", lambda: frame.shape),
    ("Null Values:", lambda: frame.isna().sum().sum()),
    ("Duplicate Values:", lambda: frame.duplicated().sum()),
    ("Columns:", lambda: frame.columns),
    ("Memory usage:", lambda: frame.memory_usage(deep=True).sum()),
  )
  for label, compute in report:
    print(label, compute())
  # print appends its own newline, so this leaves two blank lines after the report.
  print("\n")

In [8]:
get_frame_stats(df_train, "df_train")
get_frame_stats(df_test, title="df_test")

------------------------------
Statistics of frame df_train
Shape: (60000, 171)
Null Values: 850015
Duplicate Values: 0
Columns: Index(['class', 'aa_000', 'ab_000', 'ac_000', 'ad_000', 'ae_000', 'af_000',
       'ag_000', 'ag_001', 'ag_002',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=171)
Memory usage: 85200128


------------------------------
Statistics of frame df_test
Shape: (16000, 171)
Null Values: 228680
Duplicate Values: 0
Columns: Index(['class', 'aa_000', 'ab_000', 'ac_000', 'ad_000', 'ae_000', 'af_000',
       'ag_000', 'ag_001', 'ag_002',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=171)
Memory usage: 22720128




In [9]:
# Drop sparse feature columns: keep only those with fewer than 9000 missing
# values in the training set (15% of its 60000 rows).
# The surviving columns are chosen from the training data alone and then
# applied to the test set, so both frames are guaranteed to share one schema
# and the test data has no influence on which features are kept.
# .isna() replaces isin([np.NaN]); the np.NaN alias no longer exists in NumPy 2.
MAX_TRAIN_NULLS = 9000
kept_columns = df_train.columns[df_train.isna().sum() < MAX_TRAIN_NULLS]
# .copy() gives independent frames, so the in-place column assignments made
# by later cells do not act on a slice of the raw frames.
df_train = df_train.loc[:, kept_columns].copy()
df_test = df_test.loc[:, kept_columns].copy()

In [10]:
get_frame_stats(df_train, "df_train")
get_frame_stats(df_test, title="df_test")

------------------------------
Statistics of frame df_train
Shape: (60000, 143)
Null Values: 193630
Duplicate Values: 9
Columns: Index(['class', 'aa_000', 'ac_000', 'ae_000', 'af_000', 'ag_000', 'ag_001',
       'ag_002', 'ag_003', 'ag_004',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=143)
Memory usage: 71760128


------------------------------
Statistics of frame df_test
Shape: (16000, 143)
Null Values: 53661
Duplicate Values: 2
Columns: Index(['class', 'aa_000', 'ac_000', 'ae_000', 'af_000', 'ag_000', 'ag_001',
       'ag_002', 'ag_003', 'ag_004',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=143)
Memory usage: 19136128




In [11]:
def replace_remaining_null_with_mean(frame: pd.DataFrame) -> None:
  """
    Fill missing values with the column mean, modifying `frame` in place.

    The first column is skipped (the loop starts at frame.columns[1:]); every
    other column that contains at least one null has its nulls replaced by
    the mean of that column.

    frame: pd.DataFrame: frame to impute; mutated in place
    return: None
  """
  for column in frame.columns[1:]:
    values = frame[column]
    if values.isna().any():
      # Assign the filled column back instead of calling
      # frame[column].fillna(..., inplace=True): that form operates on an
      # intermediate Series, which pandas warns about and which leaves
      # `frame` unchanged when copy-on-write is enabled.
      frame[column] = values.fillna(values.mean())

In [12]:
replace_remaining_null_with_mean(df_train)

In [13]:
replace_remaining_null_with_mean(df_test)

In [14]:
df_train.isna().sum().sum()

0

In [15]:
df_test.isna().sum().sum()

0

In [16]:
get_frame_stats(df_train, "df_train")
get_frame_stats(df_test, title="df_test")

------------------------------
Statistics of frame df_train
Shape: (60000, 143)
Null Values: 0
Duplicate Values: 10
Columns: Index(['class', 'aa_000', 'ac_000', 'ae_000', 'af_000', 'ag_000', 'ag_001',
       'ag_002', 'ag_003', 'ag_004',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=143)
Memory usage: 71760128


------------------------------
Statistics of frame df_test
Shape: (16000, 143)
Null Values: 0
Duplicate Values: 2
Columns: Index(['class', 'aa_000', 'ac_000', 'ae_000', 'af_000', 'ag_000', 'ag_001',
       'ag_002', 'ag_003', 'ag_004',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=143)
Memory usage: 19136128






---



In [17]:
df_train['class'].value_counts()

neg    59000
pos     1000
Name: class, dtype: int64

In [18]:
df_test['class'].value_counts()

neg    15625
pos      375
Name: class, dtype: int64

In [19]:
def replace_class_with_indicators(df_train, df_test):
  """
    Encode the 'class' column of both frames as 0/1 indicators, in place.

    A frame is only rewritten while its 'class' column still contains at least
    one 'pos' or 'neg' label; "pos" becomes 1 and every other value becomes 0.

    df_train: training frame with a 'class' column
    df_test: test frame with a 'class' column
    return: None
  """
  def to_indicator(label):
    if label == "pos":
      return 1
    return 0

  targets = (
    (df_train, "Replacing data frame train"),
    (df_test, "Replacing data frame test"),
  )
  for frame, message in targets:
    still_labelled = frame['class'].isin(['pos', 'neg']).any()
    if not still_labelled:
      continue
    print(message)
    frame['class'] = frame['class'].apply(to_indicator)

In [20]:
replace_class_with_indicators(df_train, df_test)

Replacing data frame train
Replacing data frame test


In [21]:
df_train['class'].value_counts()

0    59000
1     1000
Name: class, dtype: int64

In [22]:
df_test['class'].value_counts()

0    15625
1      375
Name: class, dtype: int64

# Preprocessing

## Data sampling

In [23]:
# Work on a 30% random subset of the training rows. random_state pins the
# draw so that a fresh "Restart & Run All" selects the same rows every time.
# NOTE: this cell overwrites df_train, so re-running it on its own samples
# 30% of the already sampled frame.
df_train = df_train.sample(frac=0.3, random_state=42)
# 'class' is the first column; every column after it is a feature.
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0]

In [24]:
# Work on a 30% random subset of the test rows. random_state pins the draw
# so that a fresh "Restart & Run All" selects the same rows every time.
# NOTE: this cell overwrites df_test, so re-running it on its own samples
# 30% of the already sampled frame.
df_test = df_test.sample(frac=0.3, random_state=42)
# 'class' is the first column; every column after it is a feature.
X_test = df_test.iloc[:, 1:]
y_test = df_test.iloc[:, 0]

In [25]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(18000, 142) (4800, 142) (18000,) (4800,)


In [26]:
np.unique(df_train.iloc[:, 1])

array([      0,       2,       4, ..., 1901150, 1956648, 2220448])

## Applying MinMaxScaler

In [27]:
min_max = MinMaxScaler()

In [28]:
X_train_pp = min_max.fit_transform(X_train)

In [29]:
# Reuse the scaler that was fitted on the training data. Calling fit_transform
# here would re-learn the min/max from the test set, which puts train and test
# on different scales and leaks test statistics into preprocessing.
X_test_pp = min_max.transform(X_test)

## Principal Component Analysis

In [30]:
# Keep 10 principal components. random_state makes the decomposition
# repeatable when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=10, random_state=42)

In [31]:
# Learn the components from the scaled training data only, then project the
# scaled test data onto those same components.
# NOTE: both names are overwritten with their reduced versions, so this cell
# is not safe to re-run without re-running the scaling cells first.
X_train_pp = pca.fit_transform(X_train_pp)
X_test_pp = pca.transform(X_test_pp)

## Random Under Sampler

In [32]:
udr = RandomUnderSampler(random_state=42)

In [33]:
X_train_pp_dim_udr, y_train_dim_udr = udr.fit_resample(X_train_pp, y_train)

In [34]:
print(X_train_pp.shape, y_train.shape, X_train_pp_dim_udr.shape, y_train_dim_udr.shape) 

(18000, 10) (18000,) (652, 10) (652,)


# Data Pickling and Storing to Database

In [35]:
# Bundle the prepared arrays and labels into one dictionary and serialise it.
# NOTE(review): "features" is taken from df_train, whose first column is the
# 'class' label - confirm whether consumers expect the label name in this list.
obj = {
  "X_train": X_train_pp_dim_udr,
  "X_test": X_test_pp,
  "y_train": y_train_dim_udr,
  "y_test": y_test,
  "features": df_train.columns,
}

pickle_model = pickle.dumps(obj)

In [36]:
# The connection string (including credentials) is read from the environment
# so that no secret is stored in the notebook, e.g.
#   export MONGODB_URI="mongodb+srv://<user>:<password>@<host>/?retryWrites=true&w=majority"
# A missing variable raises KeyError here, before any database work starts.
# NOTE: the password that used to be hardcoded in this cell should be treated
# as compromised and rotated.
mongodb_uri = os.environ["MONGODB_URI"]

try:
  # The context manager closes the client's connections when the block exits,
  # and building the client inside the try reports connection-string problems
  # through the same handler as write failures.
  with MongoClient(mongodb_uri) as client:
    utc_timestamp = datetime.utcnow()
    database = client['aps_scania']
    collection = database['aps_scania_train_test']
    tag = "APS_SCANIA_DATASET"
    # Only one snapshot per tag is kept: remove earlier ones, then insert.
    collection.delete_many({"tag": tag})
    inserted_response = collection.insert_one({
        "model": pickle_model,
        "tag": tag,
        "created_at": utc_timestamp
        })
    print("Inserted successfully with id: ", inserted_response.inserted_id, "Timestamp: ", utc_timestamp)
except Exception as e:
  # Best-effort store: report the failure instead of stopping the notebook.
  print("Unable process into the store")
  print(e)

Inserted successfully with id:  6381bbbd9f4c10c82e80a297 Timestamp:  2022-11-26 07:09:48.331699


**==========THE END==========**