# 1. Importing Modules

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import tree
from sklearn.metrics import accuracy_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# 2. Importing Data

In [2]:
test = pd.read_csv("test.csv")

In [3]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 449 entries, 0 to 448
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       449 non-null    int64  
 1   diagnosis                449 non-null    object 
 2   radius_mean              449 non-null    float64
 3   texture_mean             449 non-null    float64
 4   perimeter_mean           449 non-null    float64
 5   area_mean                449 non-null    float64
 6   smoothness_mean          449 non-null    float64
 7   compactness_mean         449 non-null    float64
 8   concavity_mean           449 non-null    float64
 9   concave points_mean      449 non-null    float64
 10  symmetry_mean            449 non-null    float64
 11  fractal_dimension_mean   449 non-null    float64
 12  radius_se                449 non-null    float64
 13  texture_se               449 non-null    float64
 14  perimeter_se             4

# 3. Preprocessing

## 3.1 Trimming Unnecessary Data

In [5]:
# Getting rid of unnecessary columns - Unnamed

train = train.drop(['id'] , axis = 1)
train = train.drop(['Unnamed: 32'] , axis = 1)

test = test.drop(['id'] , axis = 1)
test = test.drop(['Unnamed: 32'] , axis = 1)


## 3.2 Using Sklearn Preprocessing Algorithms

### 3.2.1 Training Data

In [6]:
X_train = train.drop(columns=['diagnosis'])
y_train = train['diagnosis']

In [7]:
X_train_clean = X_train

X_train_zscore = StandardScaler().fit_transform(X_train)

X_train_minmax = MinMaxScaler().fit_transform(X_train)

X_train_preprocessed = [X_train_clean, X_train_zscore, X_train_minmax]

In [8]:
y_train = y_train.map(lambda x: 1 if x == 'M' else 0)

### 3.2.2 Testing Data

In [9]:
X_test = test.drop(columns=['diagnosis'])
y_test = test['diagnosis']

In [10]:
X_test_clean = X_test

X_test_zscore = StandardScaler().fit_transform(X_test)

X_test_minmax = MinMaxScaler().fit_transform(X_test)

X_test_preprocessed = [X_test_clean, X_test_zscore, X_test_minmax]

In [11]:
y_test = y_test.map(lambda x: 1 if x == 'M' else 0)

# 4. Creating Decision Trees

## 4.1 No Preprocessing

In [12]:
def run_variations(X_train_preprocessed, X_test_preprocessed, y_train, y_test, depth_values, splitter_values, preprocessing_labels):
  outputs = []
  for i in range(len(X_train_preprocessed)):
    for d in depth_values:
      for s in splitter_values:
        clf = tree.DecisionTreeClassifier(max_depth=d, splitter=s)
        clf = clf.fit(X_train_preprocessed[i], y_train)
        y_pred = clf.predict(X_test_preprocessed[i])
        score = accuracy_score(y_test, y_pred)
        outputs.append({
            'Normalization': preprocessing_labels[i],
            'Max Depth': d,
            'Splitter': s,
            'Accuracy %': score
        })

  return outputs


In [13]:
outputs = run_variations(X_train_preprocessed, X_test_preprocessed, y_train, y_test, depth_values=[5, 7, 9], splitter_values=['best', 'random'], preprocessing_labels=['No Preprocessing', 'Z-score', 'Minmax'])
output_df = pd.DataFrame(outputs)

output_df

Unnamed: 0,Normalization,Max Depth,Splitter,Accuracy %
0,No Preprocessing,5,best,0.891667
1,No Preprocessing,5,random,0.925
2,No Preprocessing,7,best,0.883333
3,No Preprocessing,7,random,0.908333
4,No Preprocessing,9,best,0.85
5,No Preprocessing,9,random,0.908333
6,Z-score,5,best,0.9
7,Z-score,5,random,0.941667
8,Z-score,7,best,0.875
9,Z-score,7,random,0.908333
