In [3]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [4]:
data = pd.read_json('dataTrain.json')
stop_words= stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words=stop_words, token_pattern=r'\b[a-zA-ZÁÉÍÓÚáéíóú]{4,}\b')
tfidf = vectorizer.fit_transform(data['text'])
test_data = pd.read_json('dataTest.json')

In [7]:
X_train = data['text']
y_train = data['label']
cfl = RandomForestClassifier()
params = {    'max_depth': [60, 100, 150],
              'min_samples_split': [2]}

model = gs_knn = GridSearchCV(cfl,
                      param_grid=params,
                      scoring='f1_weighted',
                      cv=10)
model.fit(tfidf, y_train)
best_params = model.best_params_
best_model = model.best_estimator_
print(best_params)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(tfidf, data['label'], test_size=0.2, random_state=42)
model = RandomForestClassifier(max_depth=20,min_samples_leaf=1, min_samples_split=5,class_weight='balanced',random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred, labels=data['label'].unique())
print(report)

              precision    recall  f1-score   support

      course       0.76      0.90      0.82       167
  department       0.59      0.73      0.66        26
     faculty       0.68      0.89      0.77       184
       other       0.85      0.76      0.80       590
     project       0.65      0.57      0.61        94
       staff       0.33      0.05      0.08        22
     student       0.75      0.75      0.75       242

    accuracy                           0.77      1325
   macro avg       0.66      0.66      0.64      1325
weighted avg       0.77      0.77      0.76      1325



In [10]:
data.head

<bound method NDFrame.head of             id                                               text    label
0      aaexyuw  \n571 Main Page\n\n\nComputer Science 571\nCON...   course
1       abbdqt  \nObject-Oriented Programming (Graduate) Home ...   course
2       achmly  \n\n\nECE/CS 752 Spring 1996\n\n\n\n\nECE/CS 7...   course
3        aciio  Last-Modified: Thursday, 01-Feb-96 22:23:44 GM...   course
4     ackfxrep  \n\nEECS401 Web Page for Fall '96\n\n\nWelcome...   course
...        ...                                                ...      ...
6618    zxgxje  \n\nHomePage of Daqing Li\n\n\n\n\nWelcome to ...  student
6619   zyidaxg  Last-Modified: Sunday, 25-Aug-96 22:35:51 GMT\...  student
6620    zyrphu  \n\n\n\nHOME \n\n\n\n\n \n  \n \n\n\nMarla Bak...  student
6621   zyvupbc  \n\nPatrice Caire\n\n\n\n\nPatrice Caire \nVir...  student
6622   zywpsym  \nEric's Home Page\nEric's SUPER DUPER Home Pa...  student

[6623 rows x 3 columns]>

In [7]:
model = RandomForestClassifier(max_depth=20,min_samples_leaf=1, min_samples_split=5,class_weight='balanced',random_state=42)
model.fit(tfidf, data['label'])
test_tfidf = vectorizer.transform(test_data['text'])
y_pred = model.predict(test_tfidf)

In [8]:
result_df = pd.DataFrame({'id': test_data.id, 'Predicted': y_pred})
result_df

Unnamed: 0,id,Predicted
0,aaclkul,project
1,aagelci,project
2,aangjmn,student
3,aawnpc,other
4,abdjgiz,student
...,...,...
1654,zxmmn,other
1655,zxwkru,other
1656,zybimtt,other
1657,zypnixf,faculty


In [13]:
result_df.to_csv('ENXEBRE_RandomForest_best_params_without_extend.csv')