In [1]:
import pandas as pd

# باز کردن فایل
df = pd.read_csv('ObesityDataSet.csv')

# نمایش اطلاعات اولیه
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2110 non-null   float64
 2   Height                          2107 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2110 non-null   object 
 5   FAVC                            2109 non-null   object 
 6   FCVC                            2110 non-null   float64
 7   NCP                             2110 non-null   float64
 8   CAEC                            2110 non-null   object 
 9   SMOKE                           2110 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2110 non-null   object 
 12  FAF                             21

In [2]:
from sklearn.impute import SimpleImputer

# ستون‌های عددی با مقادیر NULL
numeric_columns_with_null = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# ایجاد یک SimpleImputer با استراتژی میانگین
imputer_numeric = SimpleImputer(strategy='mean')

# پر کردن مقادیر NULL در ستون‌های عددی با میانگین
df[numeric_columns_with_null] = imputer_numeric.fit_transform(df[numeric_columns_with_null])


In [None]:
df.head(10)

: 

In [3]:
# ستون‌های کیفی با مقادیر NULL
categorical_columns_with_null = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

# ایجاد یک SimpleImputer با استراتژی مد (most frequent)
imputer_categorical = SimpleImputer(strategy='most_frequent')

# پر کردن مقادیر NULL در ستون‌های کیفی با مد
df[categorical_columns_with_null] = imputer_categorical.fit_transform(df[categorical_columns_with_null])


In [4]:
df.head(10)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,24.31417,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II
5,Male,29.0,1.62,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal_Weight
6,Female,23.0,1.5,55.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Motorbike,Normal_Weight
7,Male,22.0,1.64,53.0,no,no,2.0,3.0,Sometimes,no,2.0,no,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
8,Male,24.0,1.78,64.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Frequently,Public_Transportation,Normal_Weight
9,Male,22.0,1.72,68.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# لیستی از نام ستون‌های کیفی
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']


# ایجاد یک ColumnTransformer برای اعمال One-Hot Encoding به تمام ستون‌های کیفی
column_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_columns)
    ],
    remainder='passthrough'  # ستون‌های غیرکیفی باقی‌مانده
)

# اجرای تبدیلات بر روی داده
df_transformed = pd.DataFrame(column_transformer.fit_transform(df))

# بازگرداندن نام‌های ستون اصلی
column_names = list(column_transformer.get_feature_names_out(df.columns))
df_transformed.columns = column_names

# جایگزینی نام‌های ایجاد شده با نام‌های مطابق با نام‌های اصلی برای هر ستون کیفی
for col in categorical_columns:
    original_values = df[col].unique()
    for i, val in enumerate(original_values):
        df_transformed.columns = df_transformed.columns.str.replace(f'{col}_{i}', f'{col}_{val}')


In [8]:
df_transformed.head(10)

Unnamed: 0,onehot__Gender_Female,onehot__Gender_Male,onehot__family_history_with_overweight_no,onehot__family_history_with_overweight_yes,onehot__FAVC_no,onehot__FAVC_yes,onehot__CAEC_Always,onehot__CAEC_Frequently,onehot__CAEC_Sometimes,onehot__CAEC_no,...,onehot__MTRANS_Walking,remainder__Age,remainder__Height,remainder__Weight,remainder__FCVC,remainder__NCP,remainder__CH2O,remainder__FAF,remainder__TUE,remainder__NObeyesdad
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,24.31417,1.62,64.0,2.0,3.0,2.0,0.0,1.0,Normal_Weight
1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,21.0,1.52,56.0,3.0,3.0,3.0,3.0,0.0,Normal_Weight
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,23.0,1.8,77.0,2.0,3.0,2.0,2.0,1.0,Normal_Weight
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,27.0,1.8,87.0,3.0,3.0,2.0,2.0,0.0,Overweight_Level_I
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,22.0,1.78,89.8,2.0,1.0,2.0,0.0,0.0,Overweight_Level_II
5,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,29.0,1.62,53.0,2.0,3.0,2.0,0.0,0.0,Normal_Weight
6,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,23.0,1.5,55.0,3.0,3.0,2.0,1.0,0.0,Normal_Weight
7,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,22.0,1.64,53.0,2.0,3.0,2.0,3.0,0.0,Normal_Weight
8,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,24.0,1.78,64.0,3.0,3.0,2.0,1.0,1.0,Normal_Weight
9,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,22.0,1.72,68.0,2.0,3.0,2.0,1.0,1.0,Normal_Weight


In [14]:
from sklearn.preprocessing import StandardScaler

# تشخیص ستون‌های نیازمند نرمال‌سازی
columns_to_normalize = ['remainder__Age', 'remainder__Height', 'remainder__Weight', 'remainder__FCVC', 'remainder__NCP', 'remainder__CH2O', 'remainder__FAF', 'remainder__TUE']

# نرمال‌سازی
scaler = StandardScaler()
df_transformed[columns_to_normalize] = scaler.fit_transform(df_transformed[columns_to_normalize])


In [15]:
df_transformed.head(10)

Unnamed: 0,onehot__Gender_Female,onehot__Gender_Male,onehot__family_history_with_overweight_no,onehot__family_history_with_overweight_yes,onehot__FAVC_no,onehot__FAVC_yes,onehot__CAEC_Always,onehot__CAEC_Frequently,onehot__CAEC_Sometimes,onehot__CAEC_no,...,onehot__MTRANS_Walking,remainder__Age,remainder__Height,remainder__Weight,remainder__FCVC,remainder__NCP,remainder__CH2O,remainder__FAF,remainder__TUE,remainder__NObeyesdad
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,2.240027e-15,-0.877853,-0.862558,-0.785505,0.403575,-0.013073,-1.189435,0.563691,Normal_Weight
1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,-0.5224056,-1.950971,-1.168077,1.088129,0.403575,1.618759,2.344123,-1.080825,Normal_Weight
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,-0.2071498,1.05376,-0.36609,-0.785505,0.403575,-0.013073,1.16627,0.563691,Normal_Weight
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.4233617,1.05376,0.015808,1.088129,0.403575,-0.013073,1.16627,-1.080825,Overweight_Level_I
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,-0.3647777,0.839136,0.12274,-0.785505,-2.170466,-0.013073,-1.189435,-1.080825,Overweight_Level_II
5,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.7386175,-0.877853,-1.282647,-0.785505,0.403575,-0.013073,-1.189435,-1.080825,Normal_Weight
6,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,-0.2071498,-2.165594,-1.206267,1.088129,0.403575,-0.013073,-0.011582,-1.080825,Normal_Weight
7,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,-0.3647777,-0.663229,-1.282647,-0.785505,0.403575,-0.013073,2.344123,-1.080825,Normal_Weight
8,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,-0.04952193,0.839136,-0.862558,1.088129,0.403575,-0.013073,-0.011582,0.563691,Normal_Weight
9,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,-0.3647777,0.195265,-0.709799,-0.785505,0.403575,-0.013073,-0.011582,0.563691,Normal_Weight


In [16]:
print(df_transformed.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 32 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   onehot__Gender_Female                       2111 non-null   object 
 1   onehot__Gender_Male                         2111 non-null   object 
 2   onehot__family_history_with_overweight_no   2111 non-null   object 
 3   onehot__family_history_with_overweight_yes  2111 non-null   object 
 4   onehot__FAVC_no                             2111 non-null   object 
 5   onehot__FAVC_yes                            2111 non-null   object 
 6   onehot__CAEC_Always                         2111 non-null   object 
 7   onehot__CAEC_Frequently                     2111 non-null   object 
 8   onehot__CAEC_Sometimes                      2111 non-null   object 
 9   onehot__CAEC_no                             2111 non-null   object 
 10  onehot__SMOK

In [17]:
print(df_transformed.head())

  onehot__Gender_Female onehot__Gender_Male  \
0                   1.0                 0.0   
1                   1.0                 0.0   
2                   0.0                 1.0   
3                   0.0                 1.0   
4                   0.0                 1.0   

  onehot__family_history_with_overweight_no  \
0                                       0.0   
1                                       0.0   
2                                       0.0   
3                                       1.0   
4                                       1.0   

  onehot__family_history_with_overweight_yes onehot__FAVC_no onehot__FAVC_yes  \
0                                        1.0             1.0              0.0   
1                                        1.0             1.0              0.0   
2                                        1.0             1.0              0.0   
3                                        0.0             1.0              0.0   
4                            

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# تقسیم داده به دیتاست آموزش و تست
X = df_transformed.drop('remainder__NObeyesdad', axis=1)
y = df_transformed['remainder__NObeyesdad']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# آموزش مدل درخت تصمیم
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# آموزش مدل random forest (امتیازی مثبت)
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [19]:
# پیش‌بینی برچسب‌ها برای داده‌های آزمون
dt_y_pred = dt_model.predict(X_test)
rf_y_pred = rf_model.predict(X_test)

# ارزیابی عملکرد مدل درخت تصمیم
print("Decision Tree:")
print(f'Accuracy: {accuracy_score(y_test, dt_y_pred)}')
print(f'Precision: {precision_score(y_test, dt_y_pred, average="weighted")}')
print(f'Recall: {recall_score(y_test, dt_y_pred, average="weighted")}')
print(f'F1 Score: {f1_score(y_test, dt_y_pred, average="weighted")}')

# ارزیابی عملکرد مدل random forest
print("\nRandom Forest:")
print(f'Accuracy: {accuracy_score(y_test, rf_y_pred)}')
print(f'Precision: {precision_score(y_test, rf_y_pred, average="weighted")}')
print(f'Recall: {recall_score(y_test, rf_y_pred, average="weighted")}')
print(f'F1 Score: {f1_score(y_test, rf_y_pred, average="weighted")}')


Decision Tree:
Accuracy: 0.9361702127659575
Precision: 0.936458271579944
Recall: 0.9361702127659575
F1 Score: 0.9361814257281418

Random Forest:
Accuracy: 0.9432624113475178
Precision: 0.9446683508660677
Recall: 0.9432624113475178
F1 Score: 0.9436400477178193
