In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the raw dataset from the sibling "dataset" directory.
# Forward slashes are used on purpose: the previous backslash-separated literal
# depended on the invalid escape sequences \d and \o being passed through
# unchanged (a warning on recent Python versions) and only resolved on Windows.
df = pd.read_csv("../dataset/oral_cancer_prediction_dataset.csv")
df.head()

Unnamed: 0,ID,Country,Gender,Age,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,Diagnosis_Stage,Treatment_Type,Survival_Rate,HPV_Related
0,1,Ethiopia,Male,34,1,1,High,Early,Radiotherapy,0.826235,0
1,2,Turkey,Female,84,1,1,High,Moderate,Radiotherapy,0.376607,0
2,3,Turkey,Female,62,1,1,Middle,Early,Radiotherapy,0.736296,1
3,4,Tanzania,Male,48,1,1,Middle,Moderate,Combination,0.786118,0
4,5,France,Male,26,1,1,Middle,Early,Radiotherapy,0.830411,0


In [4]:
# Remove the 'ID' column from the working frame.
# The result is rebound instead of using inplace=True, and errors='ignore' is set,
# so re-running this cell after 'ID' is already gone does not raise KeyError.
df = df.drop(columns=['ID'], errors='ignore')
df.head()

Unnamed: 0,Country,Gender,Age,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,Diagnosis_Stage,Treatment_Type,Survival_Rate,HPV_Related
0,Ethiopia,Male,34,1,1,High,Early,Radiotherapy,0.826235,0
1,Turkey,Female,84,1,1,High,Moderate,Radiotherapy,0.376607,0
2,Turkey,Female,62,1,1,Middle,Early,Radiotherapy,0.736296,1
3,Tanzania,Male,48,1,1,Middle,Moderate,Combination,0.786118,0
4,France,Male,26,1,1,Middle,Early,Radiotherapy,0.830411,0


In [5]:
# Columns whose string categories are replaced by integer codes
le_columns = ['Gender','Socioeconomic_Status','Diagnosis_Stage']

# Fit one LabelEncoder per column, keyed by column name, so each can be reused later
encoders = {col: LabelEncoder().fit(df[col]) for col in le_columns}

# Overwrite every listed column with the codes produced by its own encoder
for col, le in encoders.items():
    df[col] = le.transform(df[col])
encoders

{'Gender': LabelEncoder(),
 'Socioeconomic_Status': LabelEncoder(),
 'Diagnosis_Stage': LabelEncoder()}

In [6]:
import pickle

# Persist the dict of fitted label encoders to disk in a single pickle file
with open("label_encoders.pkl", "wb") as fh:
    pickle.dump(encoders, fh)

In [7]:
# Columns that are expanded into one indicator column per category
categorical_cols = ["Country", "Treatment_Type"]

# handle_unknown="ignore" keeps transform() from raising on categories
# that were not present when the encoder was fitted
ohe = OneHotEncoder(handle_unknown="ignore")

# Fit on the categorical columns and encode them in one step
encoded_array = ohe.fit_transform(df[categorical_cols])

# Densify the encoded matrix and label its columns with the generated feature names.
# The frame reuses df's index: pd.concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would misalign (and introduce NaNs) if df's index ever stops
# being a plain RangeIndex, e.g. after rows are filtered out.
encoded_df = pd.DataFrame(
    encoded_array.toarray(),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Replace the original categorical columns with their indicator columns
final_df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

# Persist the fitted encoder so the same encoding can be applied to new rows
with open("onehot_encoder.pkl", "wb") as file:
    pickle.dump(ohe, file)

In [8]:
# Preview the first five rows of the fully encoded table
final_df.head(n=5)

Unnamed: 0,Gender,Age,Tobacco_Use,Alcohol_Use,Socioeconomic_Status,Diagnosis_Stage,Survival_Rate,HPV_Related,Country_Bangladesh,Country_Brazil,Country_China,Country_Colombia,Country_DR Congo,Country_Egypt,Country_Ethiopia,Country_France,Country_Germany,Country_India,Country_Indonesia,Country_Iran,Country_Italy,Country_Japan,Country_Kenya,Country_Mexico,Country_Myanmar,Country_Nigeria,Country_Pakistan,Country_Philippines,Country_Russia,Country_South Africa,Country_South Korea,Country_Spain,Country_Tanzania,Country_Thailand,Country_Turkey,Country_United Kingdom,Country_United States,Country_Vietnam,Treatment_Type_Chemotherapy,Treatment_Type_Combination,Treatment_Type_Palliative,Treatment_Type_Radiotherapy,Treatment_Type_Surgery
0,1,34,1,1,0,0,0.826235,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,84,1,1,0,2,0.376607,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,62,1,1,2,0,0.736296,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,48,1,1,2,2,0.786118,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,26,1,1,2,0,0.830411,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
from sklearn.preprocessing import StandardScaler
# Separate the regression target from the feature columns
y = final_df['Survival_Rate']
X = final_df.drop(columns='Survival_Rate')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, _y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [10]:
# Persist the fitted scaler to disk
with open('StandardScaler.pkl','wb') as fh:
    pickle.dump(sc, fh)

In [11]:
from sklearn.linear_model import SGDRegressor
# Linear regressor trained with stochastic gradient descent.
# random_state is pinned because the optimiser is stochastic: without a seed the
# fitted coefficients, and every metric computed from y_pred, change between runs.
sgd = SGDRegressor(random_state=42)
sgd.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = sgd.predict(X_test)

In [12]:
from sklearn.metrics import mean_squared_error
# Mean squared error on the held-out rows.
# Arguments follow the scikit-learn (y_true, y_pred) order; MSE is symmetric, so the
# value is the same as before, but the conventional order stays correct if this is
# later swapped for an asymmetric metric or given sample weights.
mse = mean_squared_error(_y_test, y_pred)
mse

0.029975351933682864

In [13]:
import joblib
# Serialise the trained regressor to disk with joblib
joblib.dump(sgd, filename='SGD.joblib')

['SGD.joblib']

In [14]:
# Read the one-hot encoder back from its pickle file
with open("onehot_encoder.pkl",'rb') as fh:
    ohe = pickle.load(fh)

In [15]:
# Display the category arrays stored on the reloaded encoder
ohe.categories_

[array(['Bangladesh', 'Brazil', 'China', 'Colombia', 'DR Congo', 'Egypt',
        'Ethiopia', 'France', 'Germany', 'India', 'Indonesia', 'Iran',
        'Italy', 'Japan', 'Kenya', 'Mexico', 'Myanmar', 'Nigeria',
        'Pakistan', 'Philippines', 'Russia', 'South Africa', 'South Korea',
        'Spain', 'Tanzania', 'Thailand', 'Turkey', 'United Kingdom',
        'United States', 'Vietnam'], dtype=object),
 array(['Chemotherapy', 'Combination', 'Palliative', 'Radiotherapy',
        'Surgery'], dtype=object)]

In [16]:
# Read the dict of fitted label encoders back from disk
with open("label_encoders.pkl",'rb') as fh:
    encoders = pickle.load(fh)

# Columns handled by the label encoders
categorical_cols = ["Gender", "Socioeconomic_Status","Diagnosis_Stage"]

# One hand-written sample row to push through the encoders
label_data = pd.DataFrame(
    data=[["Male","High","Late"]],
    columns=categorical_cols,
)

# Swap each string value for the integer code from that column's encoder
for col in categorical_cols:
    encoder = encoders[col]
    label_data[col] = encoder.transform(label_data[col])

label_data['Gender']

0    1
Name: Gender, dtype: int64

In [17]:
# Read the fitted one-hot encoder back from disk
with open("onehot_encoder.pkl", "rb") as fh:
    ohe = pickle.load(fh)

# Column names for the sample frame, matching the columns the encoder was fitted on
ohe_cols = ["Country", "Treatment_Type"]

# Single sample row wrapped in a DataFrame with those column names
encode_data = pd.DataFrame(data=[["Turkey", "Radiotherapy"]], columns=ohe_cols)

# Encode the row and convert the result to a dense array
encoded_array = ohe.transform(encode_data).toarray()

# Label the indicator columns with the encoder's generated feature names
encoded_df = pd.DataFrame(data=encoded_array, columns=ohe.get_feature_names_out())

# Print the encoded row
print(encoded_df)

   Country_Bangladesh  ...  Treatment_Type_Surgery
0                 0.0  ...                     0.0

[1 rows x 35 columns]
