In [1]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_json('dataTrain.json')
stop_words= stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words=stop_words, token_pattern=r'\b[a-zA-ZÁÉÍÓÚáéíóú]{4,}\b')
tfidf = vectorizer.fit_transform(data['text'])
test_data = pd.read_json('dataTest.json')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(tfidf, data['label'], test_size=0.2, random_state=42)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred, labels=data['label'].unique())
print(report)

              precision    recall  f1-score   support

      course       0.77      0.60      0.68       167
  department       0.54      0.50      0.52        26
     faculty       0.74      0.74      0.74       184
       other       0.75      0.79      0.77       590
     project       0.43      0.39      0.41        94
       staff       0.12      0.09      0.10        22
     student       0.62      0.66      0.64       242

    accuracy                           0.69      1325
   macro avg       0.57      0.54      0.55      1325
weighted avg       0.69      0.69      0.69      1325



In [5]:
model = DecisionTreeClassifier(random_state=42)
model.fit(tfidf, data['label'])
test_tfidf = vectorizer.transform(test_data['text'])
y_pred = model.predict(test_tfidf)

In [6]:
result_df = pd.DataFrame({'Text': test_data.index, 'Predicted': y_pred})
result_df

Unnamed: 0,Text,Predicted
0,0,student
1,1,department
2,2,other
3,3,other
4,4,student
...,...,...
1654,1654,other
1655,1655,student
1656,1656,other
1657,1657,faculty


In [7]:
result_df.to_csv('ENXEBRE_DesisionTree_without_extend.csv')