## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training split. The path is relative, so it is resolved against
# the kernel's current working directory.
TRAIN_CSV = "breast_cancer_train.csv"
df = pd.read_csv(TRAIN_CSV)

In [3]:
df

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
643,776715,3,1,1,1,3,2,1,1,1,2
644,841769,2,1,1,1,2,1,1,1,1,2
645,888820,5,10,10,3,7,3,8,10,2,4
646,897471,4,8,6,4,3,4,10,6,1,4


## Data Cleaning

In [4]:
# Discard every row whose bare_nuclei entry is the "?" placeholder; the
# following cell casts that column to int, which a "?" string would break.
unknown_rows = df.index[df["bare_nuclei"] == "?"]
df = df.drop(index=unknown_rows)

In [5]:
# Cast bare_nuclei to the platform's default integer type; every other
# column keeps its dtype and position.
df = df.astype({"bare_nuclei": int})

In [6]:
df.dtypes

id                        int64
clump_thickness           int64
uniform_cell_size         int64
uniform_cell_shape        int64
marginal_adhesion         int64
single_epithelial_size    int64
bare_nuclei               int32
bland_chromatin           int64
normal_nucleoli           int64
mitoses                   int64
class                     int64
dtype: object

In [7]:
# Positional split: the last column is the target, everything before it is
# used as model input.
# NOTE(review): the feature slice therefore includes the `id` column, which
# presumably identifies a record rather than measuring anything - confirm
# whether it belongs in the model. The validation and test frames are passed
# to `model.predict` with the same columns, so any change has to be applied
# to all three consistently.
x, y = df.iloc[:, :-1], df.iloc[:, -1]


In [8]:
df

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
643,776715,3,1,1,1,3,2,1,1,1,2
644,841769,2,1,1,1,2,1,1,1,1,2
645,888820,5,10,10,3,7,3,8,10,2,4
646,897471,4,8,6,4,3,4,10,6,1,4


In [9]:
df.describe()

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
count,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0,632.0
mean,1068136.0,4.392405,3.064873,3.139241,2.743671,3.227848,3.474684,3.378165,2.814873,1.602848,2.68038
std,643371.5,2.784575,2.988711,2.920974,2.791904,2.210474,3.616653,2.40068,2.998505,1.740409,0.948296
min,63375.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,849831.2,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,1167046.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,1239365.0,6.0,4.0,5.0,3.0,4.0,5.25,4.0,3.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


## Data Modelling

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Train a 10-tree random forest on the cleaned training data.
# The seed is pinned because the forest bootstraps rows and samples candidate
# features at random; without a fixed `random_state` every Restart & Run All
# yields a different model, different predictions and a different score.
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(x, y)

RandomForestClassifier(n_estimators=10)

In [13]:
# Load the labelled validation file; its last column is later used as the
# ground truth when scoring the model.
VALIDATION_RESULTS_CSV = 'validation_results.csv'
val_result = pd.read_csv(VALIDATION_RESULTS_CSV)

In [14]:
val_result.head()

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1296025,4,1,2,1,2,1,1,1,1,2
1,1296263,4,1,1,1,2,1,1,1,1,2
2,1296593,5,2,1,1,2,1,1,1,1,2
3,1299161,4,8,7,10,4,10,7,5,1,4
4,1301945,5,1,1,1,1,1,1,1,1,2


In [16]:
# Load the validation inputs that are fed to the trained model.
VALIDATION_CSV = 'breast_cancer_validation.csv'
val = pd.read_csv(VALIDATION_CSV)

In [17]:
val.head()

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
0,1296025,4,1,2,1,2,1,1,1,1
1,1296263,4,1,1,1,2,1,1,1,1
2,1296593,5,2,1,1,2,1,1,1,1
3,1299161,4,8,7,10,4,10,7,5,1
4,1301945,5,1,1,1,1,1,1,1,1


In [18]:
# Predict a class label for every validation row. The whole `val` frame is
# passed as-is, so its columns must match the ones the model was fitted on.
y_pred = model.predict(X=val)

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [20]:
# Ground-truth labels for the validation rows: the `class` column, which is
# the last column of the labelled validation frame.
y_val = val_result["class"]

In [21]:
accuracy_score(y_val,y_pred)

0.9666666666666667

In [22]:
# Rows are the true labels and columns the predicted labels, both in sorted
# label order (presumably 2 then 4 here - confirm against the data), so
# off-diagonal cells count the misclassified validation rows.
confusion_matrix(y_val,y_pred)

array([[22,  0],
       [ 1,  7]], dtype=int64)

In [23]:
# Load the test inputs for which the final predictions are generated.
TEST_CSV = "breast_cancer_test.csv"
result = pd.read_csv(TEST_CSV)

In [24]:
result.head()

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
0,1214092,1,1,1,1,2,1,1,1,1
1,1214556,3,1,1,1,2,1,2,1,1
2,1214966,9,7,7,5,5,10,7,8,3
3,1216694,10,8,8,4,10,10,8,1,1
4,1216947,1,1,1,1,2,1,3,1,1


## Final predictions for the test CSV

In [25]:
# Predict a class label for every test row with the already-fitted model.
# (The variable name keeps its original spelling because a later cell reads it.)
predition = model.predict(X=result)

In [26]:
# Pair each test record's id with the label predicted for it.
data = {
    "id": result["id"],
    "class": predition,
}

In [27]:
# Turn the id/class mapping into a two-column frame.
dataset = pd.DataFrame(data=data)

In [28]:
dataset

Unnamed: 0,id,class
0,1214092,2
1,1214556,2
2,1214966,4
3,1216694,4
4,1216947,2
5,1217051,2
6,1217264,2
7,1218105,4
8,1218741,4
9,1218860,2


In [29]:
# Validation accuracy expressed as a percentage.
# The score is stored under its own name instead of `x`: `x` holds the
# training features, and rebinding it to a float made the earlier
# `model.fit(x, y)` cell fail when re-run after this one.
accuracy = accuracy_score(y_val, y_pred)
Accuracy_Percentage = accuracy * 100

In [31]:
Accuracy_Percentage

96.66666666666667

## We got 96.7% accuracy on the validation set (29 of 30 samples correct)

In [32]:
# Write the submission with only the `id` and `class` columns. Without
# index=False pandas also writes the row index as an extra unnamed first
# column, which shows up as "Unnamed: 0" when the file is read back.
dataset.to_csv("sample_submission.csv", index=False)