In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import json

def load_data(file_path):
    """Read a UTF-8 encoded JSON file and return its contents as a DataFrame.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A ``pandas.DataFrame`` built from the parsed JSON content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        records = json.load(handle)
    # The parsed content is fully in memory here, so the frame can be built
    # after the file has been closed.
    return pd.DataFrame(records)

# Load the preprocessed splits (lemmatization has already been applied).
train_data = load_data('train_processed.json')
dev_data = load_data('dev_processed.json')
test_data = load_data('test_processed.json')

# Combine train and dev for training. ignore_index=True renumbers the rows
# 0..n-1; without it each frame keeps its own row labels, so the combined
# frame ends up with duplicate index values and label-based lookups
# (e.g. .loc[0]) return several rows instead of one.
train_dev_data = pd.concat([train_data, dev_data], ignore_index=True)
print(train_dev_data['text'].head())


# Feature extraction: TF-IDF weights computed from the 'text' field.
# The vocabulary is capped at 5000 features for performance reasons.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the combined train+dev text only; the test text is
# transformed with that same fitted vectorizer.
train_tfidf = vectorizer.fit_transform(train_dev_data['text'])
test_tfidf = vectorizer.transform(test_data['text'])

# Convert both results to arrays, which is the form the rest of the script
# prints and trains on.
# NOTE(review): this materializes every zero entry; if memory becomes a
# problem, check whether the downstream estimator can take the sparse
# matrices directly - TODO confirm before changing.
X_train = train_tfidf.toarray()
X_test = test_tfidf.toarray()

# Target labels for each split.
y_train = train_dev_data['labels']
y_test = test_data['labels']

# Sanity-check printout: the first 15 entries returned by
# get_feature_names_out(), followed by small slices of the feature matrices
# and the label columns (3 rows each, except the 15-row X_train slice).
# Each (heading, value) pair goes through a single print call, so the default
# single-space separator follows the newline that ends every heading.
preview = (
    ("Feature Names:\n", vectorizer.get_feature_names_out()[:15]),
    ("TF-IDF Matrix Sample (First 3 Rows):\n", X_train[:3]),
    ("X_train:\n", X_train[:15]),
    ("y_train:\n", y_train[:3]),
    ("X_test:\n", X_test[:3]),
    ("y_test:\n", y_test[:3]),
)
for heading, sample in preview:
    print(heading, sample)


# Train the Random Forest classifier.
# n_estimators is the number of trees in the forest; random_state is the seed
# for the random number generator. fit() returns the estimator itself, so
# construction and training can be chained.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out test features.
y_pred = clf.predict(X_test)

# Compare the predictions against the true test labels.
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Classification Report:\n", report)
print("Confusion Matrix:\n", matrix)


0    davasında delillerin değerlendirilmesinde kura...
1    taşınmazların imar durumunun kamu hizmeti alan...
2    sahte fatura kullanıldığı gerekçesiyle resen t...
3    idari işlemin iptali istemiyle açılan davanın ...
4    askerlik hizmeti sırasında ateşli silah yarala...
Name: text, dtype: object
Feature Names:
 ['00' '000' '01' '02' '03' '04' '05' '06' '07' '08' '09' '100' '1000'
 '101' '102']
TF-IDF Matrix Sample (First 3 Rows):
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
X_train:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
y_train:
 0    1
1    0
2    0
Name: labels, dtype: int64
X_test:
 [[0.019408   0.         0.0232359  ... 0.         0.         0.        ]
 [0.02330303 0.04529125 0.08369747 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
y_test:
 0    0
1    0
2    0
Na