In [1]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_json('dataTrain.json')
stop_words= stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words=stop_words, token_pattern=r'\b[a-zA-ZÁÉÍÓÚáéíóú]{4,}\b')
tfidf = vectorizer.fit_transform(data['text'])
test_data = pd.read_json('dataTest.json')

In [5]:
data.size

19869

In [4]:
X_train = data['text']
y_train = data['label']
cfl = SVC()
params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

model = gs_knn = GridSearchCV(cfl,
                      param_grid=params,
                      scoring='f1_weighted',
                      cv=10)
model.fit(tfidf, y_train)
best_params = model.best_params_
best_model = model.best_estimator_
print(best_params)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(tfidf, data['label'], test_size=0.2, random_state=42)
model = SVC(class_weight='balanced',random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred, labels=data['label'].unique())
print(report)

              precision    recall  f1-score   support

      course       0.83      0.83      0.83       167
  department       0.82      0.69      0.75        26
     faculty       0.74      0.85      0.79       184
       other       0.79      0.81      0.80       590
     project       0.65      0.41      0.51        94
       staff       0.25      0.05      0.08        22
     student       0.73      0.76      0.74       242

    accuracy                           0.77      1325
   macro avg       0.69      0.63      0.64      1325
weighted avg       0.76      0.77      0.76      1325



In [8]:
model = SVC(class_weight='balanced',random_state=42)
model.fit(tfidf, data['label'])
test_tfidf = vectorizer.transform(test_data['text'])
y_pred = model.predict(test_tfidf)

In [10]:
result_df = pd.DataFrame({'id': test_data.id, 'Predicted': y_pred})
result_df

Unnamed: 0,id,Predicted
0,aaclkul,other
1,aagelci,other
2,aangjmn,other
3,aawnpc,other
4,abdjgiz,student
...,...,...
1654,zxmmn,other
1655,zxwkru,other
1656,zybimtt,student
1657,zypnixf,faculty


In [11]:
result_df.to_csv('ENXEBRE_SVM_weightbalanced_without_extend.csv')