## Import the libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import math
import umap.umap_ as umap
from sklearn.metrics import davies_bouldin_score
%config InlineBackend.figure_format = 'svg'
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids

In [None]:
pip install imblearn


In [3]:
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from scipy.stats import f_oneway

In [16]:
import pickle

## Import the dataset

In [17]:
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
np.random.seed(42)

# Display settings: show every column, and up to 100 rows, when rendering frames.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# NOTE(review): absolute, user-specific path - the notebook will not run on another machine as-is.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
pickle_path = '/home/suhananujum/folding_yeast6.pickle'
data_with_target = pd.read_pickle(pickle_path)

In [18]:
# Display the loaded object to inspect its structure.
# NOTE(review): this dumps the whole object; the recorded output is a dict with a 'folding' key.
data_with_target

{'folding': [(array([[0.58, 0.61, 0.47, ..., 0.  , 0.48, 0.22],
          [0.43, 0.67, 0.48, ..., 0.  , 0.53, 0.22],
          [0.64, 0.62, 0.49, ..., 0.  , 0.53, 0.22],
          ...,
          [0.67, 0.57, 0.36, ..., 0.  , 0.56, 0.22],
          [0.43, 0.4 , 0.6 , ..., 0.  , 0.53, 0.39],
          [0.65, 0.54, 0.54, ..., 0.  , 0.53, 0.22]]),
   array([0., 0., 0., ..., 0., 0., 0.]),
   array([[0.48, 0.45, 0.59, ..., 0.  , 0.58, 0.34],
          [0.66, 0.55, 0.45, ..., 0.  , 0.46, 0.22],
          [0.58, 0.47, 0.54, ..., 0.  , 0.51, 0.26],
          ...,
          [0.59, 0.67, 0.54, ..., 0.  , 0.48, 0.6 ],
          [0.71, 0.5 , 0.5 , ..., 0.  , 0.46, 0.22],
          [0.38, 0.4 , 0.66, ..., 0.  , 0.43, 0.11]]),
   array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0.

In [19]:
# Name of the target column; used below to separate the label from the features.
# NOTE(review): the object loaded above is displayed as a dict keyed by 'folding', not a
# table with a 'Churn?' column - confirm this name matches the dataset actually loaded.
label_column='Churn?'

In [20]:
# Build the feature table by dropping the label column.
# NOTE(review): `.drop` assumes a pandas DataFrame, but `data_with_target` is a dict here,
# so this cell raises the AttributeError recorded below and `data` is never assigned.
# TODO: reconcile the loading code with the dataset before running the cells that follow.
data=data_with_target.drop([label_column],axis=1)

AttributeError: 'dict' object has no attribute 'drop'

In [None]:
# Display the feature table (label column removed).
# NOTE(review): `data` only exists if the preceding drop cell succeeded.
data

In [None]:
# Count how often each label value occurs, to see the class balance.
# NOTE(review): assumes `data_with_target` is a DataFrame containing `label_column` - confirm.
data_with_target[label_column].value_counts()

## Feature types

In [None]:
# List the feature column names.
data.columns

In [None]:
# Column names grouped by feature type (presumably continuous / ordinal / nominal,
# going by the variable names - confirm against the dataset schema).
# These lists are not referenced by any other cell visible in this notebook.
cont_list = [
    'Account Length',
    'VMail Message',
    'Day Mins',
    'Day Calls',
    'Day Charge',
    'Eve Mins',
    'Eve Calls',
    'Eve Charge',
    'Night Mins',
    'Night Calls',
    'Night Charge',
    'Intl Mins',
    'Intl Charge',
]
ord_list = [
    'Intl Calls',
    'CustServ Calls',
]
nom_list = [
    "Int'l Plan",
    'VMail Plan',
]

In [None]:
# Hold out 33% of the rows as a test split; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data_with_target[label_column],
    test_size=0.33,
    random_state=42,
)

## Baseline

In [None]:
# Baseline: fit UMAP on the untouched training split, passing the labels as `y`
# so the fit is supervised.
embedding = (
    umap.UMAP(random_state=42)
    .fit(X_train, y=y_train)
)

In [None]:
# Project the held-out test features with the baseline UMAP model.
test_embedding=embedding.transform(X_test)

In [None]:
# Shape of the held-out feature set.
X_test.shape

In [None]:
# Shape of the UMAP-transformed test set.
test_embedding.shape

In [None]:
# Shape of the held-out labels.
y_test.shape

In [None]:
# Shallow random forest shared by every evaluation below; the fixed seed keeps scores repeatable.
clf = RandomForestClassifier(
    max_depth=3,
    random_state=0,
)

In [None]:
# Mean 5-fold cross-validated F1 of the classifier on the raw test features.
# NOTE(review): cross-validation runs on the test split only; the training split is
# used solely to fit the UMAP models and resamplers.
cross_val_score(clf, X_test, y_test, scoring="f1", cv=5).mean()

In [None]:
# Mean 5-fold cross-validated F1 of the classifier on the baseline UMAP embedding of the test set.
cross_val_score(clf, test_embedding, y_test, scoring="f1", cv=5).mean()

## Oversampling

In [None]:
# SMOTE over-sampler with a fixed seed.
sm = SMOTE(random_state=42)

In [None]:
# Resample the training split only with SMOTE; the test split is left untouched.
X_resampled_OS, y_resampled_OS=sm.fit_resample(X_train, y_train)

In [None]:
# Shape of the training features after SMOTE resampling.
X_resampled_OS.shape

In [None]:
# Fit a supervised UMAP model on the SMOTE-resampled training data.
embedding_OS = (
    umap.UMAP(random_state=42)
    .fit(X_resampled_OS, y=y_resampled_OS)
)

In [None]:
# Project the test set with the UMAP model fitted on the SMOTE-resampled data.
# This previously called the baseline `embedding`, so the oversampling experiment
# silently re-evaluated the baseline projection.
test_embedding_OS = embedding_OS.transform(X_test)

In [None]:
# Mean 5-fold cross-validated F1 of the classifier on the oversampling-variant test embedding.
cross_val_score(clf, test_embedding_OS, y_test, scoring="f1", cv=5).mean()

## Undersampling

In [None]:
# ClusterCentroids under-sampler with a fixed seed.
cc = ClusterCentroids(random_state=0)

In [None]:
# Resample the training split only with ClusterCentroids; the test split is left untouched.
X_resampled_US, y_resampled_US=cc.fit_resample(X_train, y_train)

In [None]:
# Shape of the training features after ClusterCentroids resampling.
X_resampled_US.shape

In [None]:
embedding_OS = umap.UMAP(random_state=42).fit(X_resampled_US, y=y_resampled_US)

In [None]:
test_embedding_US=embedding.transform(X_test)

In [None]:
# Mean 5-fold cross-validated F1 of the classifier on the undersampling-variant test embedding.
cross_val_score(clf, test_embedding_US, y_test, scoring="f1", cv=5).mean()