In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# Read the dataset CSV; the path is relative to the notebook's working directory.
DATA_PATH = "global_cancer_patients_2015_2024.csv"
df = pd.read_csv(DATA_PATH)

# Column roles used by the later cells: the label to predict, the columns
# that will be one-hot encoded, and the columns that will be standardised.
target = 'Cancer_Type'
categorical_features = ['Gender', 'Country_Region']
numerical_features = [
    'Age',
    'Genetic_Risk',
    'Air_Pollution',
    'Alcohol_Use',
    'Smoking',
    'Obesity_Level',
]

In [4]:
# Build the design matrix from the selected columns (numerical first, then
# categorical) and pull the label column out separately.
feature_columns = numerical_features + categorical_features
X = df[feature_columns]
y = df[target]

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Preprocessing pipeline
# - numerical columns are standardised (zero mean, unit variance) so the
#   lbfgs solver sees comparably scaled inputs;
# - categorical columns are one-hot encoded, dropping the first level of each
#   to avoid perfectly collinear dummy columns.
# NOTE(review): OneHotEncoder defaults to handle_unknown='error', so a category
# that appears only in the test split would raise at predict time. Confirm
# that every Gender / Country_Region level is present in the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Model pipeline
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 and scheduled for removal. With solver='lbfgs' and a
# multi-class target the default already fits a multinomial (softmax) model,
# so the fitted model is unchanged and the FutureWarning goes away.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000))
])

# Fit the preprocessing + classifier pipeline on the training split and then
# predict the held-out rows. Pipeline.fit returns the fitted pipeline itself,
# so the prediction can be chained directly onto it.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision/recall/F1 plus the overall accuracy on the test split.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# NOTE(review): the recorded run below reports ~0.11 accuracy over 8 roughly
# balanced classes, i.e. about chance level (1/8 = 0.125). These features do
# not appear to carry signal for Cancer_Type with this model.
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:", report, sep="\n")




Model Accuracy: 0.1117
Classification Report:
              precision    recall  f1-score   support

      Breast       0.12      0.05      0.07       303
    Cervical       0.07      0.04      0.05       306
       Colon       0.11      0.19      0.14       323
    Leukemia       0.11      0.18      0.14       323
       Liver       0.14      0.03      0.05       301
        Lung       0.12      0.13      0.12       312
    Prostate       0.12      0.20      0.15       315
        Skin       0.11      0.06      0.07       305

    accuracy                           0.11      2488
   macro avg       0.11      0.11      0.10      2488
weighted avg       0.11      0.11      0.10      2488

