In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# a) Read the data with pandas and find features and target variables
# Assuming your data is in a CSV file named 'data.csv' where the last column is the target variable
data = pd.read_csv('/content/breast_cancer_survival.csv')
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target variable

# b) Train Random Forest and AdaBoost
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

adaboost_classifier = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost_classifier.fit(X_train, y_train)
# c) Find accuracy, precision, recall, f1-score
def evaluate_model(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate_model(rf_classifier, X_test, y_test)
adaboost_accuracy, adaboost_precision, adaboost_recall, adaboost_f1 = evaluate_model(adaboost_classifier, X_test, y_test)

print("Random Forest Classifier:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1-score:", rf_f1)
print("\n")

print("AdaBoost Classifier:")
print("Accuracy:", adaboost_accuracy)
print("Precision:", adaboost_precision)
print("Recall:", adaboost_recall)
print("F1-score:", adaboost_f1)


ValueError: could not convert string to float: 'FEMALE'

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# a) Read the data with pandas and find features and target variables
# Assuming your data is in a CSV file named 'data.csv' where the last column is the target variable
data = pd.read_csv('/content/breast_cancer_survival.csv')
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target variable


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Read the data with pandas
data = pd.read_csv("/content/breast_cancer_survival.csv")

# Preprocessing
data.dropna(inplace=True)
X = data.drop(columns=["CUST_ID"])

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_train)

# Predict clusters
train_clusters = kmeans.predict(X_train)
test_clusters = kmeans.predict(X_test)

# Evaluate the model
train_silhouette_score = silhouette_score(X_train, train_clusters)
test_silhouette_score = silhouette_score(X_test, train_clusters)
test_silhouette_score = silhouette_score(X_test
print("Train Silhouette Score:", train_silhouette_score)
print("Test Silhouette Score:", test_silhouette_score)


SyntaxError: '(' was never closed (<ipython-input-16-3a9982b7742e>, line 32)

In [17]:
import pandas as pd

# Read the data
data = pd.read_csv("/content/breast_cancer_survival.csv")

# Convert 'Patient_Status' to binary values
data['Patient_Status'] = data['Patient_Status'].apply(lambda x: 1 if x == 'Alive' else 0)

# Now 'Alive' is represented as 1 and 'Dead' as 0
print(data['Patient_Status'])


0      1
1      0
2      1
3      1
4      1
      ..
329    1
330    1
331    0
332    1
333    0
Name: Patient_Status, Length: 334, dtype: int64


In [18]:
import pandas as pd

# Assuming your DataFrame is named 'data'

# Convert specific columns to numbers
data['Protein1'] = pd.to_numeric(data['Protein1'], errors='coerce')
data['Protein2'] = pd.to_numeric(data['Protein2'], errors='coerce')
data['Protein3'] = pd.to_numeric(data['Protein3'], errors='coerce')
data['Protein4'] = pd.to_numeric(data['Protein4'], errors='coerce')

# Convert 'Tumour_Stage' column to numbers (assuming it contains strings like 'I', 'II', 'III', etc.)
# If it contains Roman numerals, you may first convert them to numbers using the function you provided
# For example:
# data['Tumour_Stage'] = data['Tumour_Stage'].apply(roman_to_normal)

# Convert other columns if needed

# Now the specified columns should contain numeric values
print(data.head())

   Age  Gender  Protein1  Protein2  Protein3  Protein4 Tumour_Stage  \
0   42  FEMALE   0.95256   2.15000  0.007972 -0.048340           II   
1   54  FEMALE   0.00000   1.38020 -0.498030 -0.507320           II   
2   63  FEMALE  -0.52303   1.76400 -0.370190  0.010815           II   
3   78  FEMALE  -0.87618   0.12943 -0.370380  0.132190            I   
4   42  FEMALE   0.22611   1.74910 -0.543970 -0.390210           II   

                       Histology ER status PR status HER2 status Surgery_type  \
0  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
1  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
2  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   Lumpectomy   
3  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
4  Infiltrating Ductal Carcinoma  Positive  Positive    Positive   Lumpectomy   

  Date_of_Surgery Date_of_Last_Visit  Patient_Status  
0       20-May-18          26-A

In [19]:
import pandas as pd

# Read the data
data = pd.read_csv("/content/breast_cancer_survival.csv")

# Convert specific columns to numeric
numeric_columns = ['Age', 'Protein1', 'Protein2', 'Protein3', 'Protein4']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Now, categorical columns need special handling:
# For example, you can use one-hot encoding for 'Gender' column:
data = pd.get_dummies(data, columns=['Gender'], drop_first=True)  # Assuming Male is 1 and Female is 0

# For 'Tumour_Stage', if it's ordinal, you may map it to numerical values:
tumour_stage_mapping = {'I': 1, 'II': 2, 'III': 3}  # Define the mapping
data['Tumour_Stage'] = data['Tumour_Stage'].map(tumour_stage_mapping)

# Convert other categorical columns if needed

# Now, the DataFrame should have numeric values where applicable
print(data.head())

   Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage  \
0   42   0.95256   2.15000  0.007972 -0.048340             2   
1   54   0.00000   1.38020 -0.498030 -0.507320             2   
2   63  -0.52303   1.76400 -0.370190  0.010815             2   
3   78  -0.87618   0.12943 -0.370380  0.132190             1   
4   42   0.22611   1.74910 -0.543970 -0.390210             2   

                       Histology ER status PR status HER2 status Surgery_type  \
0  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
1  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
2  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   Lumpectomy   
3  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
4  Infiltrating Ductal Carcinoma  Positive  Positive    Positive   Lumpectomy   

  Date_of_Surgery Date_of_Last_Visit Patient_Status  Gender_MALE  
0       20-May-18          26-Aug-18          Alive        Fa

In [20]:
# Assuming your DataFrame is named 'data'

# Map categories to numerical values
status_mapping = {'Alive': 1, 'Dead': 0}

# Convert 'Patient_Status' column to numerical values
data['Patient_Status'] = data['Patient_Status'].map(status_mapping)

# Now, the 'Patient_Status' column should contain numerical values
print(data.head())

   Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage  \
0   42   0.95256   2.15000  0.007972 -0.048340             2   
1   54   0.00000   1.38020 -0.498030 -0.507320             2   
2   63  -0.52303   1.76400 -0.370190  0.010815             2   
3   78  -0.87618   0.12943 -0.370380  0.132190             1   
4   42   0.22611   1.74910 -0.543970 -0.390210             2   

                       Histology ER status PR status HER2 status Surgery_type  \
0  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
1  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
2  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   Lumpectomy   
3  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
4  Infiltrating Ductal Carcinoma  Positive  Positive    Positive   Lumpectomy   

  Date_of_Surgery Date_of_Last_Visit  Patient_Status  Gender_MALE  
0       20-May-18          26-Aug-18             1.0        

In [21]:
data=data.drop('Histology',axis=1)


In [22]:
print(data)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage ER status  \
0     42  0.952560   2.15000  0.007972 -0.048340             2  Positive   
1     54  0.000000   1.38020 -0.498030 -0.507320             2  Positive   
2     63 -0.523030   1.76400 -0.370190  0.010815             2  Positive   
3     78 -0.876180   0.12943 -0.370380  0.132190             1  Positive   
4     42  0.226110   1.74910 -0.543970 -0.390210             2  Positive   
..   ...       ...       ...       ...       ...           ...       ...   
329   59  0.024598   1.40050  0.024751  0.280320             2  Positive   
330   41  0.100120  -0.46547  0.472370 -0.523870             1  Positive   
331   54  0.753820   1.64250 -0.332850  0.857860             2  Positive   
332   74  0.972510   1.42680 -0.366570 -0.107820             2  Positive   
333   66  0.286380   1.39980  0.318830  0.836050             2  Positive   

    PR status HER2 status                 Surgery_type Date_of_Surgery  \
0    Positive

In [23]:
data=data.drop('PR status',axis=1)

In [24]:
print(data)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage ER status  \
0     42  0.952560   2.15000  0.007972 -0.048340             2  Positive   
1     54  0.000000   1.38020 -0.498030 -0.507320             2  Positive   
2     63 -0.523030   1.76400 -0.370190  0.010815             2  Positive   
3     78 -0.876180   0.12943 -0.370380  0.132190             1  Positive   
4     42  0.226110   1.74910 -0.543970 -0.390210             2  Positive   
..   ...       ...       ...       ...       ...           ...       ...   
329   59  0.024598   1.40050  0.024751  0.280320             2  Positive   
330   41  0.100120  -0.46547  0.472370 -0.523870             1  Positive   
331   54  0.753820   1.64250 -0.332850  0.857860             2  Positive   
332   74  0.972510   1.42680 -0.366570 -0.107820             2  Positive   
333   66  0.286380   1.39980  0.318830  0.836050             2  Positive   

    HER2 status                 Surgery_type Date_of_Surgery  \
0      Negative        

In [25]:
data=data.drop('HER2 status',axis=1)

In [26]:
print(data)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage ER status  \
0     42  0.952560   2.15000  0.007972 -0.048340             2  Positive   
1     54  0.000000   1.38020 -0.498030 -0.507320             2  Positive   
2     63 -0.523030   1.76400 -0.370190  0.010815             2  Positive   
3     78 -0.876180   0.12943 -0.370380  0.132190             1  Positive   
4     42  0.226110   1.74910 -0.543970 -0.390210             2  Positive   
..   ...       ...       ...       ...       ...           ...       ...   
329   59  0.024598   1.40050  0.024751  0.280320             2  Positive   
330   41  0.100120  -0.46547  0.472370 -0.523870             1  Positive   
331   54  0.753820   1.64250 -0.332850  0.857860             2  Positive   
332   74  0.972510   1.42680 -0.366570 -0.107820             2  Positive   
333   66  0.286380   1.39980  0.318830  0.836050             2  Positive   

                    Surgery_type Date_of_Surgery Date_of_Last_Visit  \
0               

In [27]:
print(data)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage ER status  \
0     42  0.952560   2.15000  0.007972 -0.048340             2  Positive   
1     54  0.000000   1.38020 -0.498030 -0.507320             2  Positive   
2     63 -0.523030   1.76400 -0.370190  0.010815             2  Positive   
3     78 -0.876180   0.12943 -0.370380  0.132190             1  Positive   
4     42  0.226110   1.74910 -0.543970 -0.390210             2  Positive   
..   ...       ...       ...       ...       ...           ...       ...   
329   59  0.024598   1.40050  0.024751  0.280320             2  Positive   
330   41  0.100120  -0.46547  0.472370 -0.523870             1  Positive   
331   54  0.753820   1.64250 -0.332850  0.857860             2  Positive   
332   74  0.972510   1.42680 -0.366570 -0.107820             2  Positive   
333   66  0.286380   1.39980  0.318830  0.836050             2  Positive   

                    Surgery_type Date_of_Surgery Date_of_Last_Visit  \
0               

In [28]:
data=data.drop('Surgery_type',axis=1)

In [29]:
print(data)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage ER status  \
0     42  0.952560   2.15000  0.007972 -0.048340             2  Positive   
1     54  0.000000   1.38020 -0.498030 -0.507320             2  Positive   
2     63 -0.523030   1.76400 -0.370190  0.010815             2  Positive   
3     78 -0.876180   0.12943 -0.370380  0.132190             1  Positive   
4     42  0.226110   1.74910 -0.543970 -0.390210             2  Positive   
..   ...       ...       ...       ...       ...           ...       ...   
329   59  0.024598   1.40050  0.024751  0.280320             2  Positive   
330   41  0.100120  -0.46547  0.472370 -0.523870             1  Positive   
331   54  0.753820   1.64250 -0.332850  0.857860             2  Positive   
332   74  0.972510   1.42680 -0.366570 -0.107820             2  Positive   
333   66  0.286380   1.39980  0.318830  0.836050             2  Positive   

    Date_of_Surgery Date_of_Last_Visit  Patient_Status  Gender_MALE  
0         20-May-

In [30]:
data=data.drop('Date_of_Surgery',axis=1)

In [31]:
print(data)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage ER status  \
0     42  0.952560   2.15000  0.007972 -0.048340             2  Positive   
1     54  0.000000   1.38020 -0.498030 -0.507320             2  Positive   
2     63 -0.523030   1.76400 -0.370190  0.010815             2  Positive   
3     78 -0.876180   0.12943 -0.370380  0.132190             1  Positive   
4     42  0.226110   1.74910 -0.543970 -0.390210             2  Positive   
..   ...       ...       ...       ...       ...           ...       ...   
329   59  0.024598   1.40050  0.024751  0.280320             2  Positive   
330   41  0.100120  -0.46547  0.472370 -0.523870             1  Positive   
331   54  0.753820   1.64250 -0.332850  0.857860             2  Positive   
332   74  0.972510   1.42680 -0.366570 -0.107820             2  Positive   
333   66  0.286380   1.39980  0.318830  0.836050             2  Positive   

    Date_of_Last_Visit  Patient_Status  Gender_MALE  
0            26-Aug-18           

In [32]:
data=data.drop('Date_of_Last_Visit',axis=1)

In [33]:
print(data)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage ER status  \
0     42  0.952560   2.15000  0.007972 -0.048340             2  Positive   
1     54  0.000000   1.38020 -0.498030 -0.507320             2  Positive   
2     63 -0.523030   1.76400 -0.370190  0.010815             2  Positive   
3     78 -0.876180   0.12943 -0.370380  0.132190             1  Positive   
4     42  0.226110   1.74910 -0.543970 -0.390210             2  Positive   
..   ...       ...       ...       ...       ...           ...       ...   
329   59  0.024598   1.40050  0.024751  0.280320             2  Positive   
330   41  0.100120  -0.46547  0.472370 -0.523870             1  Positive   
331   54  0.753820   1.64250 -0.332850  0.857860             2  Positive   
332   74  0.972510   1.42680 -0.366570 -0.107820             2  Positive   
333   66  0.286380   1.39980  0.318830  0.836050             2  Positive   

     Patient_Status  Gender_MALE  
0               1.0        False  
1               0

In [34]:
print(data)
data=data.fillna(0)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage ER status  \
0     42  0.952560   2.15000  0.007972 -0.048340             2  Positive   
1     54  0.000000   1.38020 -0.498030 -0.507320             2  Positive   
2     63 -0.523030   1.76400 -0.370190  0.010815             2  Positive   
3     78 -0.876180   0.12943 -0.370380  0.132190             1  Positive   
4     42  0.226110   1.74910 -0.543970 -0.390210             2  Positive   
..   ...       ...       ...       ...       ...           ...       ...   
329   59  0.024598   1.40050  0.024751  0.280320             2  Positive   
330   41  0.100120  -0.46547  0.472370 -0.523870             1  Positive   
331   54  0.753820   1.64250 -0.332850  0.857860             2  Positive   
332   74  0.972510   1.42680 -0.366570 -0.107820             2  Positive   
333   66  0.286380   1.39980  0.318830  0.836050             2  Positive   

     Patient_Status  Gender_MALE  
0               1.0        False  
1               0

In [35]:
y=data['Patient_Status']
x=data.drop('Patient_Status',axis=1)

In [36]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=32)


In [37]:
print(x_train.shape)

(267, 8)


In [38]:
print(x_test.shape)

(67, 8)


In [43]:
data=data.drop('ER status',axis=1)

In [44]:
print(data)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage  \
0     42  0.952560   2.15000  0.007972 -0.048340             2   
1     54  0.000000   1.38020 -0.498030 -0.507320             2   
2     63 -0.523030   1.76400 -0.370190  0.010815             2   
3     78 -0.876180   0.12943 -0.370380  0.132190             1   
4     42  0.226110   1.74910 -0.543970 -0.390210             2   
..   ...       ...       ...       ...       ...           ...   
329   59  0.024598   1.40050  0.024751  0.280320             2   
330   41  0.100120  -0.46547  0.472370 -0.523870             1   
331   54  0.753820   1.64250 -0.332850  0.857860             2   
332   74  0.972510   1.42680 -0.366570 -0.107820             2   
333   66  0.286380   1.39980  0.318830  0.836050             2   

     Patient_Status  Gender_MALE  
0               1.0        False  
1               0.0        False  
2               1.0        False  
3               1.0        False  
4               1.0        False

In [45]:
data=data.drop('Gender_MALE',axis=1)

In [46]:
print(data)

     Age  Protein1  Protein2  Protein3  Protein4  Tumour_Stage  Patient_Status
0     42  0.952560   2.15000  0.007972 -0.048340             2             1.0
1     54  0.000000   1.38020 -0.498030 -0.507320             2             0.0
2     63 -0.523030   1.76400 -0.370190  0.010815             2             1.0
3     78 -0.876180   0.12943 -0.370380  0.132190             1             1.0
4     42  0.226110   1.74910 -0.543970 -0.390210             2             1.0
..   ...       ...       ...       ...       ...           ...             ...
329   59  0.024598   1.40050  0.024751  0.280320             2             1.0
330   41  0.100120  -0.46547  0.472370 -0.523870             1             1.0
331   54  0.753820   1.64250 -0.332850  0.857860             2             0.0
332   74  0.972510   1.42680 -0.366570 -0.107820             2             1.0
333   66  0.286380   1.39980  0.318830  0.836050             2             0.0

[334 rows x 7 columns]


In [52]:
y=data['Patient_Status']
x=data.drop('Patient_Status',axis=1)

In [59]:
print(x.shape)


(334, 6)


In [61]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=21)

In [62]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
model=dt.fit(x_train,y_train)

In [63]:
yp=model.predict(x_test)

In [64]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [65]:
print(accuracy_score(y_test,yp))

0.5970149253731343


In [66]:
print(confusion_matrix(y_test,yp))

[[ 4  9]
 [18 36]]
