In [133]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [134]:
# Load the raw survey responses, then report the frame's dimensions and the
# number of missing values in each column.
data = pd.read_csv('survey.csv')
frame_dimensions = data.shape
print(frame_dimensions)
missing_per_column = data.isnull().sum()
print(missing_per_column)

(1259, 27)
Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
dtype: int64


In [135]:
# Impute missing self_employed answers with 'Yes'; per the original author's
# note, this is because 'Yes' responses are very rare.
# NOTE(review): imputing the minority class is unusual (the mode is the
# conventional choice) — confirm this is intended.
data['self_employed'] = data['self_employed'].fillna('Yes')

# Impute missing work_interfere answers by drawing uniformly from the valid
# answer set. A locally seeded generator makes the draw reproducible, so the
# imputed values (and every metric computed downstream) are stable across
# "Restart & Run All" instead of changing on each execution.
fill_values = ['Often', 'Rarely', 'Sometimes', 'Never']
rng = np.random.default_rng(42)
missing_mask = data["work_interfere"].isna()
if missing_mask.any():
    data.loc[missing_mask, "work_interfere"] = rng.choice(
        fill_values, size=int(missing_mask.sum())
    )

# Drop the columns that are not used further. errors='ignore' keeps this cell
# re-runnable: on a second execution the columns are already gone and a plain
# drop would raise KeyError.
data = data.drop(columns=['state', 'comments', 'Timestamp'], errors='ignore')
print(data.isnull().sum())



Age                          0
Gender                       0
Country                      0
self_employed                0
family_history               0
treatment                    0
work_interfere               0
no_employees                 0
remote_work                  0
tech_company                 0
benefits                     0
care_options                 0
wellness_program             0
seek_help                    0
anonymity                    0
leave                        0
mental_health_consequence    0
phys_health_consequence      0
coworkers                    0
supervisor                   0
mental_health_interview      0
phys_health_interview        0
mental_vs_physical           0
obs_consequence              0
dtype: int64


In [136]:
# Preview the first five rows of the cleaned frame.
data.head(n=5)

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37,Female,United States,Yes,No,Yes,Often,6-25,No,Yes,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44,M,United States,Yes,No,No,Rarely,More than 1000,No,No,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32,Male,Canada,Yes,No,No,Rarely,6-25,No,Yes,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31,Male,United Kingdom,Yes,Yes,Yes,Often,26-100,No,Yes,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31,Male,United States,Yes,No,No,Never,100-500,Yes,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


In [137]:
# Columns holding categorical survey answers that must be turned into integers
# before they can be fed to the model.
categorical_columns = [
    "Gender", "Country", "self_employed", "family_history", "treatment",
    "work_interfere", "remote_work", "tech_company", "benefits", "care_options",
    "wellness_program", "seek_help", "anonymity", "leave", "mental_health_consequence",
    "phys_health_consequence", "coworkers", "supervisor", "mental_health_interview",
    "phys_health_interview", "mental_vs_physical", "obs_consequence","no_employees"
]

# Label-encode each categorical column with its OWN encoder and keep the fitted
# encoders keyed by column name. Refitting one shared encoder would overwrite
# its classes_ on every iteration, leaving only the last column's mapping and
# making it impossible to decode the other columns or encode new data the same way.
label_encoders = {}
encoder = LabelEncoder()  # kept so `encoder` is always bound, as before
for col in categorical_columns:
    # Only text or categorical columns are encoded; anything else is left as-is.
    if data[col].dtype == 'object' or data[col].dtype.name == 'category':
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col])
        label_encoders[col] = encoder

print(data.head())

   Age  Gender  Country  self_employed  family_history  treatment  \
0   37      10       45              1               0          1   
1   44      16       45              1               0          0   
2   32      20        7              1               0          0   
3   31      20       44              1               1          1   
4   31      20       45              1               0          0   

   work_interfere  no_employees  remote_work  tech_company  ...  anonymity  \
0               1             4            0             1  ...          2   
1               2             5            0             0  ...          0   
2               2             4            0             1  ...          0   
3               1             2            0             1  ...          1   
4               0             1            1             1  ...          0   

   leave  mental_health_consequence  phys_health_consequence  coworkers  \
0      2                          1      

In [138]:
# Target: the "treatment" column; features: every remaining column.
y = data["treatment"]
X = data.drop(columns=["treatment"])

# Hold out 20% of the rows for evaluation, using a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [139]:
# Build a Random Forest classifier with a fixed seed and fit it on the training split.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = rf_model.predict(X_test)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)



Accuracy: 0.7738095238095238

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.78      0.78       129
           1       0.77      0.76      0.77       123

    accuracy                           0.77       252
   macro avg       0.77      0.77      0.77       252
weighted avg       0.77      0.77      0.77       252



In [140]:
# Inspect feature importances: pair each feature name with the importance the
# fitted forest assigned to it and list them from most to least important.
importances = rf_model.feature_importances_
feature_names = X.columns
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print("\nÖzellik Önem Düzeyleri:")
print(importance_df)


Özellik Önem Düzeyleri:
                      Feature  Importance
5              work_interfere    0.156181
0                         Age    0.095652
4              family_history    0.090652
10               care_options    0.064634
1                      Gender    0.056155
6                no_employees    0.051138
2                     Country    0.050516
14                      leave    0.043691
9                    benefits    0.040678
18                 supervisor    0.033730
17                  coworkers    0.033538
20      phys_health_interview    0.033497
15  mental_health_consequence    0.032246
21         mental_vs_physical    0.029564
11           wellness_program    0.027944
12                  seek_help    0.027320
13                  anonymity    0.024466
22            obs_consequence    0.021249
16    phys_health_consequence    0.019950
7                 remote_work    0.019685
19    mental_health_interview    0.018916
8                tech_company    0.015187
3        