In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pickle

# Train a decision-tree classifier on 'scores.csv' and pickle the fitted model
# together with the list of feature columns it was trained on.


def _range_midpoint(value):
    """Return the midpoint of a 'low-high' range string, or NaN if missing.

    The string is split on '-' once; the first two pieces are parsed as
    integers and averaged. Malformed values raise, exactly as a direct
    ``int(...)`` / indexing expression would.
    """
    if pd.isnull(value):
        return np.nan
    parts = value.split('-')
    return (int(parts[0]) + int(parts[1])) / 2


# Loading the dataset
data = pd.read_csv('scores.csv')

# Preprocessing the data
# Turn literal 'NA' strings into real missing values.
data = data.replace('NA', np.nan)
# 'age' is parsed as a 'low-high' range and replaced by its numeric midpoint.
data['age'] = data['age'].apply(_range_midpoint)
# Education ranges are replaced by a representative number; any value that is
# not one of these three keys becomes NaN.
data['edu'] = data['edu'].map({"6-10": 8, "11-15": 13, "16-20": 18})
data['marriage'] = data['marriage'].astype(float)

# Defining the features and target
features = [
    'days', 'gender', 'age', 'afftype', 'melanch', 'inpatient',
    'edu', 'marriage', 'work', 'madrs1', 'madrs2',
]
# NOTE(review): 'number' looks like it could be a per-row identifier rather
# than a class label -- confirm against the dataset before relying on the model.
target = 'number'  # Update this to the correct target variable if 'number' is not your target

X = data[features]
y = data[target]  # Update this to the correct target variable if 'number' is not your target

# Handling the categorical variables
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each column to avoid a redundant indicator.
X = pd.get_dummies(
    X,
    columns=['gender', 'afftype', 'melanch', 'inpatient', 'edu', 'marriage', 'work'],
    drop_first=True,
)

# Splitting the data
# The split happens BEFORE imputation so that statistics derived from the
# held-out rows never influence the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Filling the missing values
# Column means are computed on the training rows only and then applied to
# every partition (and to the full frame, which stays available as `X`).
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
X = X.fillna(train_means)

# Training the model
# A fixed random_state makes the fitted tree reproducible between runs,
# matching the fixed seed already used for the split.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Saving the model
# The column list is stored alongside the model so that inference code can
# rebuild the feature matrix with the same columns in the same order.
with open('model_decision_tree.pkl', 'wb') as file:
    pickle.dump((model, X.columns.tolist()), file)
