In [4]:
import pandas as pd
import matplotlib.pyplot as plt

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib

# Load dataset
file_path = 'Level_1_valid_dataset_final_affected.csv'
df = pd.read_csv(file_path)

# Encode categorical variables
categorical_columns = ['Gender', 'Family_History_Cancer', 'Smoking_History', 'Alcohol_Consumption', 'Cancer_Risk_Level']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Define features and target
X = df.drop(columns=['Cancer_Risk_Level'])
y = df['Cancer_Risk_Level']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE for class balancing
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define base models
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
lr = LogisticRegression(max_iter=500)
svc = SVC(kernel='linear', probability=True, random_state=42)

# Create ensemble model (Voting Classifier)
voting_clf = VotingClassifier(estimators=[
    ('Random Forest', rf),
    ('Gradient Boosting', gb),
    ('Logistic Regression', lr),
    ('SVM', svc)
], voting='soft')

# Train the ensemble model
voting_clf.fit(X_train, y_train)

# Evaluate the model
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Save the trained model
model_save_path = 'cancer_risk_prediction_model.pkl'
joblib.dump(voting_clf, model_save_path)

print(f"Model saved at: {model_save_path}")


Ensemble Model Accuracy: 0.8600
              precision    recall  f1-score   support

           0       0.96      0.91      0.93       155
           1       0.45      0.71      0.56         7
           2       0.62      0.68      0.65        38

    accuracy                           0.86       200
   macro avg       0.68      0.77      0.71       200
weighted avg       0.88      0.86      0.87       200

Model saved at: cancer_risk_prediction_model.pkl


In [16]:
label_encoders_save_path = 'label_encoders.pkl'
joblib.dump(label_encoders, label_encoders_save_path)

# Save the scaler
scaler_save_path = 'scaler.pkl'
joblib.dump(scaler, scaler_save_path)

['scaler.pkl']

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib

# Load the Level 2 dataset without Age
file_path = 'level2.csv'
df = pd.read_csv(file_path)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['History_Chronic_Diseases', 'Past_Cancer_Diagnosis', 'Family_History_Other_Cancers',
                        'Unusual_Bleeding', 'Radiation_Exposure', 'Occupational_Hazard_Exposure',
                        'Chronic_Inflammation_History', 'Autoimmune_Disease_Diagnosis', 'Cancer_Risk_Level']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
X = df.drop(columns=['Cancer_Risk_Level'])
y = df['Cancer_Risk_Level']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE for class balancing with adjusted k_neighbors
smote = SMOTE(random_state=42, k_neighbors=1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define base models
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
lr = LogisticRegression(max_iter=500)
svc = SVC(kernel='linear', probability=True, random_state=42)

# Create ensemble model (Voting Classifier)
voting_clf = VotingClassifier(estimators=[
    ('Random Forest', rf),
    ('Gradient Boosting', gb),
    ('Logistic Regression', lr),
    ('SVM', svc)
], voting='soft')

# Train the ensemble model
voting_clf.fit(X_train, y_train)

# Evaluate the model
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Save the model, label encoders, and scaler
model_save_path = 'cancer_risk_prediction_model_level2.pkl'
label_encoders_save_path = 'label_encoders_level2.pkl'
scaler_save_path = 'scaler_level2.pkl'

joblib.dump(voting_clf, model_save_path)
joblib.dump(label_encoders, label_encoders_save_path)
joblib.dump(scaler, scaler_save_path)

print(f"Model saved at: {model_save_path}")
print(f"Label encoders saved at: {label_encoders_save_path}")
print(f"Scaler saved at: {scaler_save_path}")
   


Ensemble Model Accuracy: 0.3818
              precision    recall  f1-score   support

           0       0.20      0.17      0.19        23
           1       0.49      0.55      0.52        55
           2       0.28      0.25      0.26        32

    accuracy                           0.38       110
   macro avg       0.32      0.32      0.32       110
weighted avg       0.37      0.38      0.37       110

Model saved at: cancer_risk_prediction_model_level2.pkl
Label encoders saved at: label_encoders_level2.pkl
Scaler saved at: scaler_level2.pkl


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib

# Load the Level 3 dataset
file_path = 'Level_3_self_observable_dataset.csv'
df = pd.read_csv(file_path)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Unexplained_Bruising', 'Night_Sweats_Frequency', 'New_Changing_Skin_Lesions', 'Body_Swelling',
                        'Bowel_Bladder_Habit_Changes', 'Breathlessness_Routine_Activity', 'Hoarseness_Voice_Changes',
                        'Appetite_Change', 'Physical_Activity_Change', 'Sleep_Quality', 'Smoking_Pattern_Change',
                        'Alcohol_Consumption_Change', 'Unusual_Body_Odor', 'Unusual_Taste_Sensation', 'Unusual_Smell_Sensation',
                        'Persistent_Tingling_Numbness', 'Lump_Pressure_Feeling', 'Memory_Problems', 'Mood_Swings',
                        'Difficulty_Concentrating', 'Increased_Anxiety_Stress', 'Loss_of_Interest_Daily_Activities', 'Cancer_Risk_Level']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
X = df.drop(columns=['Cancer_Risk_Level'])
y = df['Cancer_Risk_Level']

# Check class distribution
print(y.value_counts())

# Split data into train and test sets (no stratify)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE for class balancing with adjusted k_neighbors
smote = SMOTE(random_state=42, k_neighbors=1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define base models
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
lr = LogisticRegression(max_iter=500)
svc = SVC(kernel='linear', probability=True, random_state=42)

# Create ensemble model (Voting Classifier)
voting_clf = VotingClassifier(estimators=[
    ('Random Forest', rf),
    ('Gradient Boosting', gb),
    ('Logistic Regression', lr),
    ('SVM', svc)
], voting='soft')

# Train the ensemble model
voting_clf.fit(X_train, y_train)

# Evaluate the model
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Save the model, label encoders, and scaler
model_save_path = 'cancer_risk_prediction_model_level3.pkl'
label_encoders_save_path = 'label_encoders_level3.pkl'
scaler_save_path = 'scaler_level3.pkl'

joblib.dump(voting_clf, model_save_path)
joblib.dump(label_encoders, label_encoders_save_path)
joblib.dump(scaler, scaler_save_path)

print(f"Model saved at: {model_save_path}")
print(f"Label encoders saved at: {label_encoders_save_path}")
print(f"Scaler saved at: {scaler_save_path}")

Cancer_Risk_Level
0    946
2     53
1      1
Name: count, dtype: int64
Ensemble Model Accuracy: 0.9500
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       188
           1       0.00      0.00      0.00         1
           2       0.57      0.36      0.44        11

    accuracy                           0.95       200
   macro avg       0.51      0.45      0.47       200
weighted avg       0.94      0.95      0.94       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model saved at: cancer_risk_prediction_model_level3.pkl
Label encoders saved at: label_encoders_level3.pkl
Scaler saved at: scaler_level3.pkl


In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib

# Load the final cancer validation dataset
file_path = 'Final_Cancer_Risk_Validation_Dataset.csv'
df = pd.read_csv(file_path)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Unexplained_Weight_Loss', 'Persistent_Cough_3Weeks', 'Severe_Persistent_Fatigue',
                        'Persistent_Pain_Unresponsive', 'Unusual_Lumps_Swelling', 'Skin_Lesion_Mole_Change',
                        'Swollen_Lymph_Nodes', 'Abnormal_Bleeding_Discharge', 'Voice_Change_Hoarseness',
                        'Difficulty_Swallowing', 'Breathlessness_No_Exertion', 'Persistent_Headache',
                        'Memory_Loss_Confusion', 'Vision_Changes', 'Seizures_Fits', 'Unusual_Body_Odor',
                        'Unexplained_Bruising', 'New_Growth', 'Persistent_Swelling', 'Cancer_Risk_Validation']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
X = df.drop(columns=['Cancer_Risk_Validation'])
y = df['Cancer_Risk_Validation']

# Check class distribution
print(y.value_counts())

# Split data into train and test sets (no stratify to avoid imbalance errors)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE for class balancing with adjusted k_neighbors
smote = SMOTE(random_state=42, k_neighbors=1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define base models
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
lr = LogisticRegression(max_iter=500)
svc = SVC(kernel='linear', probability=True, random_state=42)

# Create ensemble model (Voting Classifier)
voting_clf = VotingClassifier(estimators=[
    ('Random Forest', rf),
    ('Gradient Boosting', gb),
    ('Logistic Regression', lr),
    ('SVM', svc)
], voting='soft')

# Train the ensemble model
voting_clf.fit(X_train, y_train)

# Evaluate the model
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensemble Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Save the model, label encoders, and scaler
model_save_path = 'cancer_risk_prediction_model_final.pkl'
label_encoders_save_path = 'label_encoders_final.pkl'
scaler_save_path = 'scaler_final.pkl'

joblib.dump(voting_clf, model_save_path)
joblib.dump(label_encoders, label_encoders_save_path)
joblib.dump(scaler, scaler_save_path)

print(f"Model saved at: {model_save_path}")
print(f"Label encoders saved at: {label_encoders_save_path}")
print(f"Scaler saved at: {scaler_save_path}")


Cancer_Risk_Validation
0    733
2    265
1      2
Name: count, dtype: int64
Ensemble Model Accuracy: 0.9900
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       151
           1       0.00      0.00      0.00         0
           2       1.00      0.96      0.98        49

    accuracy                           0.99       200
   macro avg       0.66      0.65      0.66       200
weighted avg       1.00      0.99      0.99       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model saved at: cancer_risk_prediction_model_final.pkl
Label encoders saved at: label_encoders_final.pkl
Scaler saved at: scaler_final.pkl


In [6]:
import pandas as pd


In [7]:
df = pd.read_csv("level2.csv")
df.head(1)

Unnamed: 0,History_Chronic_Diseases,Past_Cancer_Diag0sis,Family_History_Other_Cancers,Persistent_Cough_Duration_weeks,Unusual_Bleeding,Swollen_Lymph_0des_Duration_weeks,Recurring_Infections_Count_year,Radiation_Exposure,Occupational_Hazard_Exposure,Chronic_Inflammation_History,Autoimmune_Disease_Diag0sis,Cancer_Risk_Level
0,0,0,1,16,0,8,2,1,1,0,0,Medium


In [15]:
#y = df["Cancer_Risk_Level"]
#x = df.drop("Cancer_Risk_Level",axis=1,inplace=True)
x=df

In [22]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [19]:
scaler = StandardScaler()

In [20]:
x= scaler.fit_transform(x)

In [23]:
clr = RandomForestClassifier(n_estimators=100)

In [24]:
clr.fit(x_train,y_train)

In [25]:
y_pred = clr.predict(x_test)
print(accuracy_score(y_pred,y_test))

0.36363636363636365
