Imports

In [267]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
import pickle
from imblearn.over_sampling import RandomOverSampler

Importing the data and checking for null attributes(null being the '?' for now)

In [268]:
tumor_data = pd.read_csv("primary-tumor.data")
print((tumor_data.eq("?")).any())


class              False
age                False
sex                 True
histologic          True
degreeOfDiffe       True
bone               False
bone-marrow        False
lung               False
pleura             False
peritoneum         False
liver              False
brain              False
skin                True
neck               False
supraclavicular    False
axillar             True
mediastinum        False
abdominal          False
dtype: bool


checking how many missing values in each attribute

In [269]:
sex = tumor_data['sex'].str.contains("?", regex=False).sum()
print(sex)
histol = tumor_data['histologic'].str.contains("?", regex=False).sum()
print(histol)
degree = tumor_data['degreeOfDiffe'].str.contains("?", regex=False).sum()
print(degree)
skin = tumor_data['skin'].str.contains("?", regex=False).sum()
print(skin)
axillar = tumor_data['axillar'].str.contains("?", regex=False).sum()
print(axillar)


1
67
155
1
1


Deleting the histologic-type and degree-of-diffe attributes, deleting the rows that contain the other 3 null attributes, and checking if any null attributes remain

In [270]:
tumor_data = tumor_data.drop(['histologic', 'degreeOfDiffe'], axis=1)

#converting '?' to NaN
tumor_data = tumor_data.replace("?", np.nan)
tumor_data = tumor_data.dropna()

#(tumor_data.isnull()).any()

tumor_data

Unnamed: 0,class,age,sex,bone,bone-marrow,lung,pleura,peritoneum,liver,brain,skin,neck,supraclavicular,axillar,mediastinum,abdominal
0,1,1,1,2,2,1,2,2,2,2,2,2,2,2,2,2
1,1,1,1,2,2,2,2,2,1,2,2,2,1,2,1,2
2,1,1,2,1,2,2,2,2,2,2,2,2,2,2,1,2
3,1,1,2,1,2,1,1,2,2,2,2,2,2,2,1,2
4,1,1,2,1,2,1,1,2,2,2,2,2,2,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,22,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2
335,22,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2
336,22,2,2,1,2,2,2,2,2,2,2,2,1,1,2,2
337,22,3,2,2,2,2,2,2,2,2,2,1,1,1,2,2


Now we can model and start training our data using the SVM algorithm

In [271]:
best = 0
worst = 1
average = 0

for i in range(1000):
    X = np.array(tumor_data.drop(['class'], axis=1))
    y = np.array(tumor_data['class'])

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.2)

    over = RandomOverSampler(sampling_strategy="not majority")
    X_train, y_train = over.fit_resample(X, y)

    model = svm.SVC(kernel="poly", degree = 6)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, predictions)

    average += accuracy

    if accuracy > best:
        with open("tumormodel_svm_3.pickle", "wb") as f:
            pickle.dump(model, f)
        best = accuracy
        
    if accuracy < worst:
        worst = accuracy

average /= 1000

Looks like SVM is not working that well even with the kernel being non-linear. 

Actually this observation was innacurate. The reason for such a low accuracy was because the data was very imbalanced; however, after oversampling the data, the accuracy increased by a substantial amount both during the implementation of the SVM algorithm and KNN algorithm that I use down below. I originally decided I would still stick with KNN as seen below, but I instead chose to go with SVM because it held a smaller margin of error(25 percent vs 32 percent) and the highs and lows of my SVM implementation were better than those of my KNN implementation.

In [272]:
# X = np.array(tumor_data.drop(['class'], axis=1))
# y = np.array(tumor_data['class'])

# X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.2)

# #oversampling because some of the data is inbalanced
# over = RandomOverSampler(sampling_strategy="not majority")
# X_train, y_train = over.fit_resample(X, y)

# best = 0
# worst = 1

# for i in range(1000):
#     X = np.array(tumor_data.drop(['class'], axis=1))
#     y = np.array(tumor_data['class'])

#     X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.2)
    
#     over = RandomOverSampler(sampling_strategy="not majority")
#     X_train, y_train = over.fit_resample(X, y)

#     model = KNeighborsClassifier(n_neighbors=3)

#     model.fit(X_train, y_train)
#     accuracy = model.score(X_test, y_test)
    
#     if accuracy > best:
#         with open("tumormodel.pickle", "wb") as f:
#             pickle.dump(model, f)
#         best = accuracy
        
#     if accuracy < worst:
#         worst = accuracy

printing out the best case accuracy of my SVM implementation(currently used model) and worst case accuracy over 1000 runs

In [273]:
print(f'best is {best}, worst is {worst}, and average is {average}')

best is 0.9411764705882353, worst is 0.6470588235294118, and average is 0.7890294117647034
