In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

In [20]:
df = pd.read_csv("liver_cancer_prediction.csv")

In [21]:
df.head()

Unnamed: 0,Country,Region,Population,Incidence_Rate,Mortality_Rate,Gender,Age,Alcohol_Consumption,Smoking_Status,Hepatitis_B_Status,...,Herbal_Medicine_Use,Healthcare_Access,Screening_Availability,Treatment_Availability,Liver_Transplant_Access,Ethnicity,Preventive_Care,Survival_Rate,Cost_of_Treatment,Prediction
0,Nigeria,Sub-Saharan Africa,340672131,15.38136,6.16048,Male,81,Low,Smoker,Negative,...,No,Poor,Available,Available,No,Hispanic,Good,17.724793,47486.167423,Yes
1,United Kingdom,Europe,1054632817,3.306101,14.392985,Male,87,Low,Smoker,Negative,...,Yes,Good,Available,Not Available,No,Mixed,Moderate,19.558853,13782.265151,No
2,India,South Asia,751241440,9.325053,12.777878,Male,34,Moderate,Smoker,Negative,...,No,Good,Not Available,Not Available,No,Mixed,Moderate,68.468892,25308.034132,No
3,Colombia,South America,1167333367,9.399658,8.634609,Male,63,Low,Non-Smoker,Positive,...,No,Good,Not Available,Not Available,Yes,Hispanic,Moderate,18.200287,38221.622202,No
4,Iran,Middle East,1082070787,9.665663,12.422518,Male,85,High,Non-Smoker,Positive,...,Yes,Moderate,Available,Available,Yes,Mixed,Moderate,45.019153,26765.301404,No


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160292 entries, 0 to 160291
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Country                  160292 non-null  object 
 1   Region                   160292 non-null  object 
 2   Population               160292 non-null  int64  
 3   Incidence_Rate           160292 non-null  float64
 4   Mortality_Rate           160292 non-null  float64
 5   Gender                   160292 non-null  object 
 6   Age                      160292 non-null  int64  
 7   Alcohol_Consumption      160292 non-null  object 
 8   Smoking_Status           160292 non-null  object 
 9   Hepatitis_B_Status       160292 non-null  object 
 10  Hepatitis_C_Status       160292 non-null  object 
 11  Obesity                  160292 non-null  object 
 12  Diabetes                 160292 non-null  object 
 13  Rural_or_Urban           160292 non-null  object 
 14  Seaf

In [23]:
df['Prediction'].value_counts()

Prediction
No     120270
Yes     40022
Name: count, dtype: int64

In [24]:
cat_col=df.select_dtypes(include=['object','category']).columns.to_list()
cat_col.remove('Prediction')


In [25]:
cat_col

['Country',
 'Region',
 'Gender',
 'Alcohol_Consumption',
 'Smoking_Status',
 'Hepatitis_B_Status',
 'Hepatitis_C_Status',
 'Obesity',
 'Diabetes',
 'Rural_or_Urban',
 'Seafood_Consumption',
 'Herbal_Medicine_Use',
 'Healthcare_Access',
 'Screening_Availability',
 'Treatment_Availability',
 'Liver_Transplant_Access',
 'Ethnicity',
 'Preventive_Care']

In [26]:
cardinality_threshold = 5
label_encoders = {}

for col in cat_col:
    unique_values = df[col].nunique()
    
    if unique_values <= cardinality_threshold:
        print(f"Applying One-Hot Encoding to {col} (Unique values: {unique_values})")
        df = pd.get_dummies(df, columns=[col])  # One-hot encoding
    else:
        print(f"Applying Label Encoding to {col} (Unique values: {unique_values})")
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        label_encoders[col] = encoder  # Store encoder for later use

Applying Label Encoding to Country (Unique values: 30)
Applying Label Encoding to Region (Unique values: 10)
Applying One-Hot Encoding to Gender (Unique values: 2)
Applying One-Hot Encoding to Alcohol_Consumption (Unique values: 3)
Applying One-Hot Encoding to Smoking_Status (Unique values: 2)
Applying One-Hot Encoding to Hepatitis_B_Status (Unique values: 2)
Applying One-Hot Encoding to Hepatitis_C_Status (Unique values: 2)
Applying One-Hot Encoding to Obesity (Unique values: 4)
Applying One-Hot Encoding to Diabetes (Unique values: 2)
Applying One-Hot Encoding to Rural_or_Urban (Unique values: 2)
Applying One-Hot Encoding to Seafood_Consumption (Unique values: 3)
Applying One-Hot Encoding to Herbal_Medicine_Use (Unique values: 2)
Applying One-Hot Encoding to Healthcare_Access (Unique values: 3)
Applying One-Hot Encoding to Screening_Availability (Unique values: 2)
Applying One-Hot Encoding to Treatment_Availability (Unique values: 2)
Applying One-Hot Encoding to Liver_Transplant_Acces

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160292 entries, 0 to 160291
Data columns (total 50 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Country                               160292 non-null  int64  
 1   Region                                160292 non-null  int64  
 2   Population                            160292 non-null  int64  
 3   Incidence_Rate                        160292 non-null  float64
 4   Mortality_Rate                        160292 non-null  float64
 5   Age                                   160292 non-null  int64  
 6   Survival_Rate                         160292 non-null  float64
 7   Cost_of_Treatment                     160292 non-null  float64
 8   Prediction                            160292 non-null  object 
 9   Gender_Female                         160292 non-null  bool   
 10  Gender_Male                           160292 non-null  bool   
 11  

In [28]:
df['Prediction']

0         Yes
1          No
2          No
3          No
4          No
         ... 
160287    Yes
160288     No
160289     No
160290     No
160291     No
Name: Prediction, Length: 160292, dtype: object

In [29]:
import klib
df=klib.data_cleaning(df)

Shape of cleaned data: (160292, 50) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.45 MB (-2.61%)



In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160292 entries, 0 to 160291
Data columns (total 50 columns):
 #   Column                                Non-Null Count   Dtype   
---  ------                                --------------   -----   
 0   country                               160292 non-null  int8    
 1   region                                160292 non-null  int8    
 2   population                            160292 non-null  int32   
 3   incidence_rate                        160292 non-null  float32 
 4   mortality_rate                        160292 non-null  float32 
 5   age                                   160292 non-null  int8    
 6   survival_rate                         160292 non-null  float32 
 7   cost_of_treatment                     160292 non-null  float64 
 8   prediction                            160292 non-null  category
 9   gender_female                         160292 non-null  boolean 
 10  gender_male                           160292 non-null  b

In [31]:
X = df.drop(columns=["prediction"])  
y = df["prediction"]

In [32]:

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)


X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train Decision Tree model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 61.48%


In [33]:
# pip install numpy pandas scikit-learn joblib streamlit


In [34]:
joblib.dump(model, "decision_tree_model.pkl")

['decision_tree_model.pkl']

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160292 entries, 0 to 160291
Data columns (total 50 columns):
 #   Column                                Non-Null Count   Dtype   
---  ------                                --------------   -----   
 0   country                               160292 non-null  int8    
 1   region                                160292 non-null  int8    
 2   population                            160292 non-null  int32   
 3   incidence_rate                        160292 non-null  float32 
 4   mortality_rate                        160292 non-null  float32 
 5   age                                   160292 non-null  int8    
 6   survival_rate                         160292 non-null  float32 
 7   cost_of_treatment                     160292 non-null  float64 
 8   prediction                            160292 non-null  category
 9   gender_female                         160292 non-null  boolean 
 10  gender_male                           160292 non-null  b

In [36]:
df.head()

Unnamed: 0,country,region,population,incidence_rate,mortality_rate,age,survival_rate,cost_of_treatment,prediction,gender_female,...,liver_transplant_access_no,liver_transplant_access_yes,ethnicity_african,ethnicity_asian,ethnicity_caucasian,ethnicity_hispanic,ethnicity_mixed,preventive_care_good,preventive_care_moderate,preventive_care_poor
0,17,9,340672131,15.38136,6.16048,81,17.724792,47486.167423,Yes,False,...,True,False,False,False,False,True,False,True,False,False
1,27,1,1054632817,3.306101,14.392985,87,19.558853,13782.265151,No,False,...,True,False,False,False,False,False,True,False,True,False
2,9,7,751241440,9.325053,12.777878,34,68.468895,25308.034132,No,False,...,True,False,False,False,False,False,True,False,True,False
3,3,6,1167333367,9.399657,8.634609,63,18.200287,38221.622202,No,False,...,False,True,False,False,False,True,False,False,True,False
4,11,3,1082070787,9.665664,12.422519,85,45.019154,26765.301404,No,False,...,False,True,False,False,False,False,True,False,True,False


In [40]:
with open("accuracy.txt", "w") as f:
    f.write(str(accuracy * 100))

In [41]:
import streamlit as st

st.markdown("""
| **Bosqich**           | **Modelni sinash (Offline) 🧪** | **Modelni joylashtirish (Online) 🚀** |
|----------------------|--------------------------------|----------------------------------|
| **Maqsad**         | Model natijalarini tekshirish | Modeldan foydalanish imkoniyatini yaratish |
| **Muhit**          | Mahalliy kompyuter, Jupyter Notebook | Bulut, Veb API, Mobil ilova |
| **Ma'lumot turi**  | Sinov uchun ma'lumotlar (Fixed dataset) | Jonli, real vaqt ma'lumotlari |
| **Foydalanuvchi**  | Ma'lumotlar olimlari (Data Scientists) | Yakuniy foydalanuvchilar, Ilovalar, Tizimlar |
| **Texnologiyalar** | Scikit-learn, TensorFlow | Flask, FastAPI, Streamlit, Kubernetes |
""")




DeltaGenerator()